View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.impl;
20  
21  import java.io.File;
22  import java.nio.file.FileSystem;
23  import java.nio.file.Path;
24  import java.nio.file.PathMatcher;
25  import java.util.ArrayList;
26  import java.util.Arrays;
27  import java.util.Collection;
28  import java.util.Iterator;
29  import java.util.LinkedHashSet;
30  import java.util.List;
31  import java.util.Set;
32  
33  /**
34   * Determines whether a path is selected according to include/exclude patterns.
35   * The pathnames used for method parameters will be relative to some base directory
36   * and use {@code '/'} as separator, regardless of the hosting operating system.
37   *
38   * <h2>Syntax</h2>
39   * If a pattern contains the {@code ':'} character and the prefix before is longer than 1 character,
40   * then that pattern is given verbatim to {@link FileSystem#getPathMatcher(String)}, which interprets
41   * the part before {@code ':'} as the syntax (usually {@code "glob"} or {@code "regex"}).
42   * If a pattern does not contain the {@code ':'} character, or if the prefix is one character long
43   * (interpreted as a Windows drive), then the syntax defaults to a reproduction of the Maven 3 behavior.
44   * This is implemented as the {@code "glob"} syntax with the following modifications:
45   *
46   * <ul>
47   *   <li>The platform-specific separator ({@code '\\'} on Windows) is replaced by {@code '/'}.
48   *       Note that it means that the backslash cannot be used for escaping characters.</li>
49   *   <li>Trailing {@code "/"} is completed as {@code "/**"}.</li>
50   *   <li>The {@code "**"} wildcard means "0 or more directories" instead of "1 or more directories".
51   *       This is implemented by adding variants of the pattern without the {@code "**"} wildcard.</li>
52   *   <li>Bracket characters [ ] and { } are escaped.</li>
53   *   <li>On Unix only, the escape character {@code '\\'} is itself escaped.</li>
54   * </ul>
55   *
56   * If above changes are not desired, put an explicit {@code "glob:"} prefix before the pattern.
57   * Note that putting such a prefix is recommended anyway for better performances.
58   *
59   * @author Benjamin Bentmann
60   * @author Martin Desruisseaux
61   *
62   * @see java.nio.file.FileSystem#getPathMatcher(String)
63   */
64  public class PathSelector implements PathMatcher {
65      /**
66       * Patterns which should be excluded by default, like <abbr>SCM</abbr> files.
67       *
68       * <p><b>Source:</b> this list is copied from {@code plexus-utils-4.0.2} (released in
69       * September 23, 2024), class {@code org.codehaus.plexus.util.AbstractScanner}.</p>
70       */
71      private static final List<String> DEFAULT_EXCLUDES = List.of(
72              // Miscellaneous typical temporary files
73              "**/*~",
74              "**/#*#",
75              "**/.#*",
76              "**/%*%",
77              "**/._*",
78  
79              // CVS
80              "**/CVS",
81              "**/CVS/**",
82              "**/.cvsignore",
83  
84              // RCS
85              "**/RCS",
86              "**/RCS/**",
87  
88              // SCCS
89              "**/SCCS",
90              "**/SCCS/**",
91  
92              // Visual SourceSafe
93              "**/vssver.scc",
94  
95              // MKS
96              "**/project.pj",
97  
98              // Subversion
99              "**/.svn",
100             "**/.svn/**",
101 
102             // Arch
103             "**/.arch-ids",
104             "**/.arch-ids/**",
105 
106             // Bazaar
107             "**/.bzr",
108             "**/.bzr/**",
109 
110             // SurroundSCM
111             "**/.MySCMServerInfo",
112 
113             // Mac
114             "**/.DS_Store",
115 
116             // Serena Dimensions Version 10
117             "**/.metadata",
118             "**/.metadata/**",
119 
120             // Mercurial
121             "**/.hg",
122             "**/.hg/**",
123 
124             // git
125             "**/.git",
126             "**/.git/**",
127             "**/.gitignore",
128 
129             // BitKeeper
130             "**/BitKeeper",
131             "**/BitKeeper/**",
132             "**/ChangeSet",
133             "**/ChangeSet/**",
134 
135             // darcs
136             "**/_darcs",
137             "**/_darcs/**",
138             "**/.darcsrepo",
139             "**/.darcsrepo/**",
140             "**/-darcs-backup*",
141             "**/.darcs-temp-mail");
142 
143     /**
144      * Maximum number of characters of the prefix before {@code ':'} for handling as a Maven syntax.
145      */
146     private static final int MAVEN_SYNTAX_THRESHOLD = 1;
147 
148     /**
149      * The default syntax to use if none was specified. Note that when this default syntax is applied,
150      * the user-provided pattern get some changes as documented in class Javadoc.
151      */
152     private static final String DEFAULT_SYNTAX = "glob:";
153 
154     /**
155      * Characters having a special meaning in the glob syntax.
156      *
157      * @see FileSystem#getPathMatcher(String)
158      */
159     private static final String SPECIAL_CHARACTERS = "*?[]{}\\";
160 
161     /**
162      * A path matcher which accepts all files.
163      *
164      * @see #simplify()
165      */
166     private static final PathMatcher INCLUDES_ALL = (path) -> true;
167 
168     /**
169      * String representations of the normalized include filters.
170      * Each pattern shall be prefixed by its syntax, which is {@value #DEFAULT_SYNTAX} by default.
171      *
172      * @see #toString()
173      */
174     private final String[] includePatterns;
175 
176     /**
177      * String representations of the normalized exclude filters.
178      * Each pattern shall be prefixed by its syntax, which is {@value #DEFAULT_SYNTAX} by default.
179      * This array may be longer or shorter than the user-supplied excludes, depending on whether
180      * default excludes have been added and whether some unnecessary excludes have been omitted.
181      *
182      * @see #toString()
183      */
184     private final String[] excludePatterns;
185 
186     /**
187      * The matcher for includes. The length of this array is equal to {@link #includePatterns} array length.
188      */
189     private final PathMatcher[] includes;
190 
191     /**
192      * The matcher for excludes. The length of this array is equal to {@link #excludePatterns} array length.
193      */
194     private final PathMatcher[] excludes;
195 
196     /**
197      * The matcher for all directories to include. This array includes the parents of all those directories,
198      * because they need to be accepted before we can walk to the sub-directories.
199      * This is an optimization for skipping whole directories when possible.
200      */
201     private final PathMatcher[] dirIncludes;
202 
203     /**
204      * The matcher for directories to exclude. This array does <em>not</em> include the parent directories,
205      * because they may contain other sub-trees that need to be included.
206      * This is an optimization for skipping whole directories when possible.
207      */
208     private final PathMatcher[] dirExcludes;
209 
210     /**
211      * The base directory. All files will be relativized to that directory before to be matched.
212      */
213     private final Path baseDirectory;
214 
215     /**
216      * Creates a new selector from the given includes and excludes.
217      *
218      * @param directory the base directory of the files to filter
219      * @param includes the patterns of the files to include, or null or empty for including all files
220      * @param excludes the patterns of the files to exclude, or null or empty for no exclusion
221      * @param useDefaultExcludes whether to augment the excludes with a default set of <abbr>SCM</abbr> patterns
222      */
223     public PathSelector(
224             Path directory, Collection<String> includes, Collection<String> excludes, boolean useDefaultExcludes) {
225         includePatterns = normalizePatterns(includes, false);
226         excludePatterns = normalizePatterns(effectiveExcludes(excludes, includePatterns, useDefaultExcludes), true);
227         baseDirectory = directory;
228         FileSystem system = directory.getFileSystem();
229         this.includes = matchers(system, includePatterns);
230         this.excludes = matchers(system, excludePatterns);
231         dirIncludes = matchers(system, directoryPatterns(includePatterns, false));
232         dirExcludes = matchers(system, directoryPatterns(excludePatterns, true));
233     }
234 
235     /**
236      * Returns the given array of excludes, optionally expanded with a default set of excludes,
237      * then with unnecessary excludes omitted. An unnecessary exclude is an exclude which will never
238      * match a file because there is no include which would accept a file that could match the exclude.
239      * For example, if the only include is {@code "*.java"}, then the <code>"**&sol;project.pj"</code>,
240      * <code>"**&sol;.DS_Store"</code> and other excludes will never match a file and can be omitted.
241      * Because the list of {@linkplain #DEFAULT_EXCLUDES default excludes} contains many elements,
242      * removing unnecessary excludes can reduce a lot the number of matches tested on each source file.
243      *
244      * <h4>Implementation note</h4>
245      * The removal of unnecessary excludes is done on a best effort basis. The current implementation
246      * compares only the prefixes and suffixes of each pattern, keeping the pattern in case of doubt.
247      * This is not bad, but it does not remove all unnecessary patterns. It would be possible to do
248      * better in the future if benchmarking suggests that it would be worth the effort.
249      *
250      * @param excludes the user-specified excludes, potentially not yet converted to glob syntax
251      * @param includes the include patterns converted to glob syntax
252      * @param useDefaultExcludes whether to expand user exclude with the set of default excludes
253      * @return the potentially expanded or reduced set of excludes to use
254      */
255     private static Collection<String> effectiveExcludes(
256             Collection<String> excludes, final String[] includes, final boolean useDefaultExcludes) {
257         if (excludes == null || excludes.isEmpty()) {
258             if (useDefaultExcludes) {
259                 excludes = new ArrayList<>(DEFAULT_EXCLUDES);
260             } else {
261                 return List.of();
262             }
263         } else {
264             excludes = new ArrayList<>(excludes);
265             if (useDefaultExcludes) {
266                 excludes.addAll(DEFAULT_EXCLUDES);
267             }
268         }
269         /*
270          * Get the prefixes and suffixes of all includes, stopping at the first special character.
271          * Redundant prefixes and suffixes are omitted.
272          */
273         var prefixes = new String[includes.length];
274         var suffixes = new String[includes.length];
275         for (int i = 0; i < includes.length; i++) {
276             String include = includes[i];
277             if (!include.startsWith(DEFAULT_SYNTAX)) {
278                 return excludes; // Do not filter if at least one pattern is too complicated.
279             }
280             include = include.substring(DEFAULT_SYNTAX.length());
281             prefixes[i] = prefixOrSuffix(include, false);
282             suffixes[i] = prefixOrSuffix(include, true);
283         }
284         prefixes = sortByLength(prefixes, false);
285         suffixes = sortByLength(suffixes, true);
286         /*
287          * Keep only the exclude which start with one of the prefixes and end with one of the suffixes.
288          * Note that a prefix or suffix may be the empty string, which match everything.
289          */
290         final Iterator<String> it = excludes.iterator();
291         nextExclude:
292         while (it.hasNext()) {
293             final String exclude = it.next();
294             final int s = exclude.indexOf(':');
295             if (s <= MAVEN_SYNTAX_THRESHOLD || exclude.startsWith(DEFAULT_SYNTAX)) {
296                 if (cannotMatch(exclude, prefixes, false) || cannotMatch(exclude, suffixes, true)) {
297                     it.remove();
298                 }
299             }
300         }
301         return excludes;
302     }
303 
304     /**
305      * Returns the maximal amount of ordinary characters at the beginning or end of the given pattern.
306      * The prefix or suffix stops at the first {@linkplain #SPECIAL_CHARACTERS special character}.
307      *
308      * @param include the pattern for which to get a prefix or suffix without special character
309      * @param suffix {@code false} if a prefix is desired, or {@code true} if a suffix is desired
310      */
311     private static String prefixOrSuffix(final String include, boolean suffix) {
312         int s = suffix ? -1 : include.length();
313         for (int i = SPECIAL_CHARACTERS.length(); --i >= 0; ) {
314             char c = SPECIAL_CHARACTERS.charAt(i);
315             if (suffix) {
316                 s = Math.max(s, include.lastIndexOf(c));
317             } else {
318                 int p = include.indexOf(c);
319                 if (p >= 0 && p < s) {
320                     s = p;
321                 }
322             }
323         }
324         return suffix ? include.substring(s + 1) : include.substring(0, s);
325     }
326 
327     /**
328      * Returns {@code true} if the given exclude cannot match any include patterns.
329      * In case of doubt, returns {@code false}.
330      *
331      * @param exclude the exclude pattern to test
332      * @param fragments the prefixes or suffixes (fragments without special characters) of the includes
333      * @param suffix {@code false} if the specified fragments are prefixes, {@code true} if they are suffixes
334      * @return {@code true} if it is certain that the exclude pattern cannot match, or {@code false} in case of doubt
335      */
336     private static boolean cannotMatch(String exclude, final String[] fragments, final boolean suffix) {
337         exclude = prefixOrSuffix(exclude, suffix);
338         for (String fragment : fragments) {
339             int fg = fragment.length();
340             int ex = exclude.length();
341             int length = Math.min(fg, ex);
342             if (suffix) {
343                 fg -= length;
344                 ex -= length;
345             } else {
346                 fg = 0;
347                 ex = 0;
348             }
349             if (exclude.regionMatches(ex, fragment, fg, length)) {
350                 return false;
351             }
352         }
353         return true;
354     }
355 
356     /**
357      * Sorts the given patterns by their length. The main intent is to have the empty string first,
358      * while will cause the loops testing for prefixes and suffixes to stop almost immediately.
359      * Short prefixes or suffixes are also more likely to be matched.
360      *
361      * @param fragments the fragments to sort in-place
362      * @param suffix {@code false} if the specified fragments are prefixes, {@code true} if they are suffixes
363      * @return the given array, or a smaller array if some fragments were discarded because redundant
364      */
365     private static String[] sortByLength(final String[] fragments, final boolean suffix) {
366         Arrays.sort(fragments, (s1, s2) -> s1.length() - s2.length());
367         int count = 0;
368         /*
369          * Simplify the array of prefixes or suffixes by removing all redundant elements.
370          * An element is redundant if there is a shorter prefix or suffix with the same characters.
371          */
372         nextBase:
373         for (String fragment : fragments) {
374             for (int i = count; --i >= 0; ) {
375                 String base = fragments[i];
376                 if (suffix ? fragment.endsWith(base) : fragment.startsWith(base)) {
377                     continue nextBase; // Skip this fragment
378                 }
379             }
380             fragments[count++] = fragment;
381         }
382         return (fragments.length == count) ? fragments : Arrays.copyOf(fragments, count);
383     }
384 
385     /**
386      * Returns the given array of patterns with path separator normalized to {@code '/'}.
387      * Null or empty patterns are ignored, and duplications are removed.
388      *
389      * @param patterns the patterns to normalize
390      * @param excludes whether the patterns are exclude patterns
391      * @return normalized patterns without null, empty or duplicated patterns
392      */
393     private static String[] normalizePatterns(final Collection<String> patterns, final boolean excludes) {
394         if (patterns == null || patterns.isEmpty()) {
395             return new String[0];
396         }
397         // TODO: use `LinkedHashSet.newLinkedHashSet(int)` instead with JDK19.
398         final var normalized = new LinkedHashSet<String>(patterns.size());
399         for (String pattern : patterns) {
400             if (pattern != null && !pattern.isEmpty()) {
401                 if (pattern.indexOf(':') <= MAVEN_SYNTAX_THRESHOLD) {
402                     pattern = pattern.replace(File.separatorChar, '/');
403                     if (pattern.endsWith("/")) {
404                         pattern += "**";
405                     }
406                     // Following are okay only when "**" means "0 or more directories".
407                     while (pattern.endsWith("/**/**")) {
408                         pattern = pattern.substring(0, pattern.length() - 3);
409                     }
410                     while (pattern.startsWith("**/**/")) {
411                         pattern = pattern.substring(3);
412                     }
413                     pattern = pattern.replace("/**/**/", "/**/");
414                     pattern = pattern.replace("\\", "\\\\")
415                             .replace("[", "\\[")
416                             .replace("]", "\\]")
417                             .replace("{", "\\{")
418                             .replace("}", "\\}");
419                     normalized.add(DEFAULT_SYNTAX + pattern);
420                     /*
421                      * If the pattern starts or ends with "**", Java GLOB expects a directory level at
422                      * that location while Maven seems to consider that "**" can mean "no directory".
423                      * Add another pattern for reproducing this effect.
424                      */
425                     addPatternsWithOneDirRemoved(normalized, pattern, 0);
426                 } else {
427                     normalized.add(pattern);
428                 }
429             }
430         }
431         return simplify(normalized, excludes);
432     }
433 
434     /**
435      * Adds all variants of the given pattern with {@code **} removed.
436      * This is used for simulating the Maven behavior where {@code "**} may match zero directory.
437      * Tests suggest that we need an explicit GLOB pattern with no {@code "**"} for matching an absence of directory.
438      *
439      * @param patterns where to add the derived patterns
440      * @param pattern  the pattern for which to add derived forms, without the "glob:" syntax prefix
441      * @param end      should be 0 (reserved for recursive invocations of this method)
442      */
443     private static void addPatternsWithOneDirRemoved(final Set<String> patterns, final String pattern, int end) {
444         final int length = pattern.length();
445         int start;
446         while ((start = pattern.indexOf("**", end)) >= 0) {
447             end = start + 2; // 2 is the length of "**".
448             if (end < length) {
449                 if (pattern.charAt(end) != '/') {
450                     continue;
451                 }
452                 if (start == 0) {
453                     end++; // Ommit the leading slash if there is nothing before it.
454                 }
455             }
456             if (start > 0 && pattern.charAt(--start) != '/') {
457                 continue;
458             }
459             String reduced = pattern.substring(0, start) + pattern.substring(end);
460             patterns.add(DEFAULT_SYNTAX + reduced);
461             addPatternsWithOneDirRemoved(patterns, reduced, start);
462         }
463     }
464 
465     /**
466      * Applies some heuristic rules for simplifying the set of patterns,
467      * then returns the patterns as an array.
468      *
469      * @param patterns the patterns to simplify and return asarray
470      * @param excludes whether the patterns are exclude patterns
471      * @return the set content as an array, after simplification
472      */
473     private static String[] simplify(Set<String> patterns, boolean excludes) {
474         /*
475          * If the "**" pattern is present, it makes all other patterns useless.
476          * In the case of include patterns, an empty set means to include everything.
477          */
478         if (patterns.remove("**")) {
479             patterns.clear();
480             if (excludes) {
481                 patterns.add("**");
482             }
483         }
484         return patterns.toArray(String[]::new);
485     }
486 
487     /**
488      * Eventually adds the parent directory of the given patterns, without duplicated values.
489      * The patterns given to this method should have been normalized.
490      *
491      * @param patterns the normalized include or exclude patterns
492      * @param excludes whether the patterns are exclude patterns
493      * @return pattens of directories to include or exclude
494      */
495     private static String[] directoryPatterns(final String[] patterns, final boolean excludes) {
496         // TODO: use `LinkedHashSet.newLinkedHashSet(int)` instead with JDK19.
497         final var directories = new LinkedHashSet<String>(patterns.length);
498         for (String pattern : patterns) {
499             if (pattern.startsWith(DEFAULT_SYNTAX)) {
500                 if (excludes) {
501                     if (pattern.endsWith("/**")) {
502                         directories.add(pattern.substring(0, pattern.length() - 3));
503                     }
504                 } else {
505                     int s = pattern.indexOf(':');
506                     if (pattern.regionMatches(++s, "**/", 0, 3)) {
507                         s = pattern.indexOf('/', s + 3);
508                         if (s < 0) {
509                             return new String[0]; // Pattern is "**", so we need to accept everything.
510                         }
511                         directories.add(pattern.substring(0, s));
512                     }
513                 }
514             }
515         }
516         return simplify(directories, excludes);
517     }
518 
519     /**
520      * Creates the path matchers for the given patterns.
521      * The syntax (usually {@value #DEFAULT_SYNTAX}) must be specified for each pattern.
522      */
523     private static PathMatcher[] matchers(final FileSystem fs, final String[] patterns) {
524         final var matchers = new PathMatcher[patterns.length];
525         for (int i = 0; i < patterns.length; i++) {
526             matchers[i] = fs.getPathMatcher(patterns[i]);
527         }
528         return matchers;
529     }
530 
531     /**
532      * {@return a potentially simpler matcher equivalent to this matcher}.
533      */
534     @SuppressWarnings("checkstyle:MissingSwitchDefault")
535     public PathMatcher simplify() {
536         if (excludes.length == 0) {
537             switch (includes.length) {
538                 case 0:
539                     return INCLUDES_ALL;
540                 case 1:
541                     return includes[0];
542             }
543         }
544         return this;
545     }
546 
547     /**
548      * Determines whether a path is selected.
549      * This is true if the given file matches an include pattern and no exclude pattern.
550      *
551      * @param path the pathname to test, must not be {@code null}
552      * @return {@code true} if the given path is selected, {@code false} otherwise
553      */
554     @Override
555     public boolean matches(Path path) {
556         path = baseDirectory.relativize(path);
557         return (includes.length == 0 || isMatched(path, includes))
558                 && (excludes.length == 0 || !isMatched(path, excludes));
559     }
560 
561     /**
562      * {@return whether the given file matches according to one of the given matchers}.
563      */
564     private static boolean isMatched(Path path, PathMatcher[] matchers) {
565         for (PathMatcher matcher : matchers) {
566             if (matcher.matches(path)) {
567                 return true;
568             }
569         }
570         return false;
571     }
572 
573     /**
574      * Determines whether a directory could contain selected paths.
575      *
576      * @param directory the directory pathname to test, must not be {@code null}
577      * @return {@code true} if the given directory might contain selected paths, {@code false} if the
578      *         directory will definitively not contain selected paths
579      */
580     public boolean couldHoldSelected(Path directory) {
581         if (baseDirectory.equals(directory)) {
582             return true;
583         }
584         directory = baseDirectory.relativize(directory);
585         return (dirIncludes.length == 0 || isMatched(directory, dirIncludes))
586                 && (dirExcludes.length == 0 || !isMatched(directory, dirExcludes));
587     }
588 
589     /**
590      * Appends the elements of the given array in the given buffer.
591      * This is a helper method for {@link #toString()} implementations.
592      *
593      * @param buffer the buffer to add the elements to
594      * @param label label identifying the array of elements to add
595      * @param patterns the elements to append, or {@code null} if none
596      */
597     private static void append(StringBuilder buffer, String label, String[] patterns) {
598         buffer.append(label).append(": [");
599         if (patterns != null) {
600             for (int i = 0; i < patterns.length; i++) {
601                 if (i != 0) {
602                     buffer.append(", ");
603                 }
604                 buffer.append(patterns[i]);
605             }
606         }
607         buffer.append(']');
608     }
609 
610     /**
611      * {@return a string representation for logging purposes}.
612      */
613     @Override
614     public String toString() {
615         var buffer = new StringBuilder();
616         append(buffer, "includes", includePatterns);
617         append(buffer.append(", "), "excludes", excludePatterns);
618         return buffer.toString();
619     }
620 }