1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.maven.impl;
20
21 import java.io.File;
22 import java.nio.file.FileSystem;
23 import java.nio.file.Path;
24 import java.nio.file.PathMatcher;
25 import java.util.ArrayList;
26 import java.util.Arrays;
27 import java.util.Collection;
28 import java.util.Iterator;
29 import java.util.LinkedHashSet;
30 import java.util.List;
31 import java.util.Set;
32
33 /**
34 * Determines whether a path is selected according to include/exclude patterns.
35 * The pathnames used for method parameters will be relative to some base directory
36 * and use {@code '/'} as separator, regardless of the hosting operating system.
37 *
38 * <h2>Syntax</h2>
39 * If a pattern contains the {@code ':'} character and the prefix before is longer than 1 character,
40 * then that pattern is given verbatim to {@link FileSystem#getPathMatcher(String)}, which interprets
41 * the part before {@code ':'} as the syntax (usually {@code "glob"} or {@code "regex"}).
42 * If a pattern does not contain the {@code ':'} character, or if the prefix is one character long
43 * (interpreted as a Windows drive), then the syntax defaults to a reproduction of the Maven 3 behavior.
44 * This is implemented as the {@code "glob"} syntax with the following modifications:
45 *
46 * <ul>
47 * <li>The platform-specific separator ({@code '\\'} on Windows) is replaced by {@code '/'}.
48 * Note that it means that the backslash cannot be used for escaping characters.</li>
49 * <li>Trailing {@code "/"} is completed as {@code "/**"}.</li>
50 * <li>The {@code "**"} wildcard means "0 or more directories" instead of "1 or more directories".
51 * This is implemented by adding variants of the pattern without the {@code "**"} wildcard.</li>
52 * <li>Bracket characters [ ] and { } are escaped.</li>
53 * <li>On Unix only, the escape character {@code '\\'} is itself escaped.</li>
54 * </ul>
55 *
56 * If above changes are not desired, put an explicit {@code "glob:"} prefix before the pattern.
57 * Note that putting such a prefix is recommended anyway for better performances.
58 *
59 * @author Benjamin Bentmann
60 * @author Martin Desruisseaux
61 *
62 * @see java.nio.file.FileSystem#getPathMatcher(String)
63 */
64 public class PathSelector implements PathMatcher {
65 /**
66 * Patterns which should be excluded by default, like <abbr>SCM</abbr> files.
67 *
68 * <p><b>Source:</b> this list is copied from {@code plexus-utils-4.0.2} (released in
69 * September 23, 2024), class {@code org.codehaus.plexus.util.AbstractScanner}.</p>
70 */
71 private static final List<String> DEFAULT_EXCLUDES = List.of(
72 // Miscellaneous typical temporary files
73 "**/*~",
74 "**/#*#",
75 "**/.#*",
76 "**/%*%",
77 "**/._*",
78
79 // CVS
80 "**/CVS",
81 "**/CVS/**",
82 "**/.cvsignore",
83
84 // RCS
85 "**/RCS",
86 "**/RCS/**",
87
88 // SCCS
89 "**/SCCS",
90 "**/SCCS/**",
91
92 // Visual SourceSafe
93 "**/vssver.scc",
94
95 // MKS
96 "**/project.pj",
97
98 // Subversion
99 "**/.svn",
100 "**/.svn/**",
101
102 // Arch
103 "**/.arch-ids",
104 "**/.arch-ids/**",
105
106 // Bazaar
107 "**/.bzr",
108 "**/.bzr/**",
109
110 // SurroundSCM
111 "**/.MySCMServerInfo",
112
113 // Mac
114 "**/.DS_Store",
115
116 // Serena Dimensions Version 10
117 "**/.metadata",
118 "**/.metadata/**",
119
120 // Mercurial
121 "**/.hg",
122 "**/.hg/**",
123
124 // git
125 "**/.git",
126 "**/.git/**",
127 "**/.gitignore",
128
129 // BitKeeper
130 "**/BitKeeper",
131 "**/BitKeeper/**",
132 "**/ChangeSet",
133 "**/ChangeSet/**",
134
135 // darcs
136 "**/_darcs",
137 "**/_darcs/**",
138 "**/.darcsrepo",
139 "**/.darcsrepo/**",
140 "**/-darcs-backup*",
141 "**/.darcs-temp-mail");
142
143 /**
144 * Maximum number of characters of the prefix before {@code ':'} for handling as a Maven syntax.
145 */
146 private static final int MAVEN_SYNTAX_THRESHOLD = 1;
147
148 /**
149 * The default syntax to use if none was specified. Note that when this default syntax is applied,
150 * the user-provided pattern get some changes as documented in class Javadoc.
151 */
152 private static final String DEFAULT_SYNTAX = "glob:";
153
154 /**
155 * Characters having a special meaning in the glob syntax.
156 *
157 * @see FileSystem#getPathMatcher(String)
158 */
159 private static final String SPECIAL_CHARACTERS = "*?[]{}\\";
160
161 /**
162 * A path matcher which accepts all files.
163 *
164 * @see #simplify()
165 */
166 private static final PathMatcher INCLUDES_ALL = (path) -> true;
167
168 /**
169 * String representations of the normalized include filters.
170 * Each pattern shall be prefixed by its syntax, which is {@value #DEFAULT_SYNTAX} by default.
171 *
172 * @see #toString()
173 */
174 private final String[] includePatterns;
175
176 /**
177 * String representations of the normalized exclude filters.
178 * Each pattern shall be prefixed by its syntax, which is {@value #DEFAULT_SYNTAX} by default.
179 * This array may be longer or shorter than the user-supplied excludes, depending on whether
180 * default excludes have been added and whether some unnecessary excludes have been omitted.
181 *
182 * @see #toString()
183 */
184 private final String[] excludePatterns;
185
186 /**
187 * The matcher for includes. The length of this array is equal to {@link #includePatterns} array length.
188 */
189 private final PathMatcher[] includes;
190
191 /**
192 * The matcher for excludes. The length of this array is equal to {@link #excludePatterns} array length.
193 */
194 private final PathMatcher[] excludes;
195
196 /**
197 * The matcher for all directories to include. This array includes the parents of all those directories,
198 * because they need to be accepted before we can walk to the sub-directories.
199 * This is an optimization for skipping whole directories when possible.
200 */
201 private final PathMatcher[] dirIncludes;
202
203 /**
204 * The matcher for directories to exclude. This array does <em>not</em> include the parent directories,
205 * because they may contain other sub-trees that need to be included.
206 * This is an optimization for skipping whole directories when possible.
207 */
208 private final PathMatcher[] dirExcludes;
209
210 /**
211 * The base directory. All files will be relativized to that directory before to be matched.
212 */
213 private final Path baseDirectory;
214
215 /**
216 * Creates a new selector from the given includes and excludes.
217 *
218 * @param directory the base directory of the files to filter
219 * @param includes the patterns of the files to include, or null or empty for including all files
220 * @param excludes the patterns of the files to exclude, or null or empty for no exclusion
221 * @param useDefaultExcludes whether to augment the excludes with a default set of <abbr>SCM</abbr> patterns
222 */
223 public PathSelector(
224 Path directory, Collection<String> includes, Collection<String> excludes, boolean useDefaultExcludes) {
225 includePatterns = normalizePatterns(includes, false);
226 excludePatterns = normalizePatterns(effectiveExcludes(excludes, includePatterns, useDefaultExcludes), true);
227 baseDirectory = directory;
228 FileSystem system = directory.getFileSystem();
229 this.includes = matchers(system, includePatterns);
230 this.excludes = matchers(system, excludePatterns);
231 dirIncludes = matchers(system, directoryPatterns(includePatterns, false));
232 dirExcludes = matchers(system, directoryPatterns(excludePatterns, true));
233 }
234
235 /**
236 * Returns the given array of excludes, optionally expanded with a default set of excludes,
237 * then with unnecessary excludes omitted. An unnecessary exclude is an exclude which will never
238 * match a file because there is no include which would accept a file that could match the exclude.
239 * For example, if the only include is {@code "*.java"}, then the <code>"**/project.pj"</code>,
240 * <code>"**/.DS_Store"</code> and other excludes will never match a file and can be omitted.
241 * Because the list of {@linkplain #DEFAULT_EXCLUDES default excludes} contains many elements,
242 * removing unnecessary excludes can reduce a lot the number of matches tested on each source file.
243 *
244 * <h4>Implementation note</h4>
245 * The removal of unnecessary excludes is done on a best effort basis. The current implementation
246 * compares only the prefixes and suffixes of each pattern, keeping the pattern in case of doubt.
247 * This is not bad, but it does not remove all unnecessary patterns. It would be possible to do
248 * better in the future if benchmarking suggests that it would be worth the effort.
249 *
250 * @param excludes the user-specified excludes, potentially not yet converted to glob syntax
251 * @param includes the include patterns converted to glob syntax
252 * @param useDefaultExcludes whether to expand user exclude with the set of default excludes
253 * @return the potentially expanded or reduced set of excludes to use
254 */
255 private static Collection<String> effectiveExcludes(
256 Collection<String> excludes, final String[] includes, final boolean useDefaultExcludes) {
257 if (excludes == null || excludes.isEmpty()) {
258 if (useDefaultExcludes) {
259 excludes = new ArrayList<>(DEFAULT_EXCLUDES);
260 } else {
261 return List.of();
262 }
263 } else {
264 excludes = new ArrayList<>(excludes);
265 if (useDefaultExcludes) {
266 excludes.addAll(DEFAULT_EXCLUDES);
267 }
268 }
269 /*
270 * Get the prefixes and suffixes of all includes, stopping at the first special character.
271 * Redundant prefixes and suffixes are omitted.
272 */
273 var prefixes = new String[includes.length];
274 var suffixes = new String[includes.length];
275 for (int i = 0; i < includes.length; i++) {
276 String include = includes[i];
277 if (!include.startsWith(DEFAULT_SYNTAX)) {
278 return excludes; // Do not filter if at least one pattern is too complicated.
279 }
280 include = include.substring(DEFAULT_SYNTAX.length());
281 prefixes[i] = prefixOrSuffix(include, false);
282 suffixes[i] = prefixOrSuffix(include, true);
283 }
284 prefixes = sortByLength(prefixes, false);
285 suffixes = sortByLength(suffixes, true);
286 /*
287 * Keep only the exclude which start with one of the prefixes and end with one of the suffixes.
288 * Note that a prefix or suffix may be the empty string, which match everything.
289 */
290 final Iterator<String> it = excludes.iterator();
291 nextExclude:
292 while (it.hasNext()) {
293 final String exclude = it.next();
294 final int s = exclude.indexOf(':');
295 if (s <= MAVEN_SYNTAX_THRESHOLD || exclude.startsWith(DEFAULT_SYNTAX)) {
296 if (cannotMatch(exclude, prefixes, false) || cannotMatch(exclude, suffixes, true)) {
297 it.remove();
298 }
299 }
300 }
301 return excludes;
302 }
303
304 /**
305 * Returns the maximal amount of ordinary characters at the beginning or end of the given pattern.
306 * The prefix or suffix stops at the first {@linkplain #SPECIAL_CHARACTERS special character}.
307 *
308 * @param include the pattern for which to get a prefix or suffix without special character
309 * @param suffix {@code false} if a prefix is desired, or {@code true} if a suffix is desired
310 */
311 private static String prefixOrSuffix(final String include, boolean suffix) {
312 int s = suffix ? -1 : include.length();
313 for (int i = SPECIAL_CHARACTERS.length(); --i >= 0; ) {
314 char c = SPECIAL_CHARACTERS.charAt(i);
315 if (suffix) {
316 s = Math.max(s, include.lastIndexOf(c));
317 } else {
318 int p = include.indexOf(c);
319 if (p >= 0 && p < s) {
320 s = p;
321 }
322 }
323 }
324 return suffix ? include.substring(s + 1) : include.substring(0, s);
325 }
326
327 /**
328 * Returns {@code true} if the given exclude cannot match any include patterns.
329 * In case of doubt, returns {@code false}.
330 *
331 * @param exclude the exclude pattern to test
332 * @param fragments the prefixes or suffixes (fragments without special characters) of the includes
333 * @param suffix {@code false} if the specified fragments are prefixes, {@code true} if they are suffixes
334 * @return {@code true} if it is certain that the exclude pattern cannot match, or {@code false} in case of doubt
335 */
336 private static boolean cannotMatch(String exclude, final String[] fragments, final boolean suffix) {
337 exclude = prefixOrSuffix(exclude, suffix);
338 for (String fragment : fragments) {
339 int fg = fragment.length();
340 int ex = exclude.length();
341 int length = Math.min(fg, ex);
342 if (suffix) {
343 fg -= length;
344 ex -= length;
345 } else {
346 fg = 0;
347 ex = 0;
348 }
349 if (exclude.regionMatches(ex, fragment, fg, length)) {
350 return false;
351 }
352 }
353 return true;
354 }
355
356 /**
357 * Sorts the given patterns by their length. The main intent is to have the empty string first,
358 * while will cause the loops testing for prefixes and suffixes to stop almost immediately.
359 * Short prefixes or suffixes are also more likely to be matched.
360 *
361 * @param fragments the fragments to sort in-place
362 * @param suffix {@code false} if the specified fragments are prefixes, {@code true} if they are suffixes
363 * @return the given array, or a smaller array if some fragments were discarded because redundant
364 */
365 private static String[] sortByLength(final String[] fragments, final boolean suffix) {
366 Arrays.sort(fragments, (s1, s2) -> s1.length() - s2.length());
367 int count = 0;
368 /*
369 * Simplify the array of prefixes or suffixes by removing all redundant elements.
370 * An element is redundant if there is a shorter prefix or suffix with the same characters.
371 */
372 nextBase:
373 for (String fragment : fragments) {
374 for (int i = count; --i >= 0; ) {
375 String base = fragments[i];
376 if (suffix ? fragment.endsWith(base) : fragment.startsWith(base)) {
377 continue nextBase; // Skip this fragment
378 }
379 }
380 fragments[count++] = fragment;
381 }
382 return (fragments.length == count) ? fragments : Arrays.copyOf(fragments, count);
383 }
384
385 /**
386 * Returns the given array of patterns with path separator normalized to {@code '/'}.
387 * Null or empty patterns are ignored, and duplications are removed.
388 *
389 * @param patterns the patterns to normalize
390 * @param excludes whether the patterns are exclude patterns
391 * @return normalized patterns without null, empty or duplicated patterns
392 */
393 private static String[] normalizePatterns(final Collection<String> patterns, final boolean excludes) {
394 if (patterns == null || patterns.isEmpty()) {
395 return new String[0];
396 }
397 // TODO: use `LinkedHashSet.newLinkedHashSet(int)` instead with JDK19.
398 final var normalized = new LinkedHashSet<String>(patterns.size());
399 for (String pattern : patterns) {
400 if (pattern != null && !pattern.isEmpty()) {
401 if (pattern.indexOf(':') <= MAVEN_SYNTAX_THRESHOLD) {
402 pattern = pattern.replace(File.separatorChar, '/');
403 if (pattern.endsWith("/")) {
404 pattern += "**";
405 }
406 // Following are okay only when "**" means "0 or more directories".
407 while (pattern.endsWith("/**/**")) {
408 pattern = pattern.substring(0, pattern.length() - 3);
409 }
410 while (pattern.startsWith("**/**/")) {
411 pattern = pattern.substring(3);
412 }
413 pattern = pattern.replace("/**/**/", "/**/");
414 pattern = pattern.replace("\\", "\\\\")
415 .replace("[", "\\[")
416 .replace("]", "\\]")
417 .replace("{", "\\{")
418 .replace("}", "\\}");
419 normalized.add(DEFAULT_SYNTAX + pattern);
420 /*
421 * If the pattern starts or ends with "**", Java GLOB expects a directory level at
422 * that location while Maven seems to consider that "**" can mean "no directory".
423 * Add another pattern for reproducing this effect.
424 */
425 addPatternsWithOneDirRemoved(normalized, pattern, 0);
426 } else {
427 normalized.add(pattern);
428 }
429 }
430 }
431 return simplify(normalized, excludes);
432 }
433
434 /**
435 * Adds all variants of the given pattern with {@code **} removed.
436 * This is used for simulating the Maven behavior where {@code "**} may match zero directory.
437 * Tests suggest that we need an explicit GLOB pattern with no {@code "**"} for matching an absence of directory.
438 *
439 * @param patterns where to add the derived patterns
440 * @param pattern the pattern for which to add derived forms, without the "glob:" syntax prefix
441 * @param end should be 0 (reserved for recursive invocations of this method)
442 */
443 private static void addPatternsWithOneDirRemoved(final Set<String> patterns, final String pattern, int end) {
444 final int length = pattern.length();
445 int start;
446 while ((start = pattern.indexOf("**", end)) >= 0) {
447 end = start + 2; // 2 is the length of "**".
448 if (end < length) {
449 if (pattern.charAt(end) != '/') {
450 continue;
451 }
452 if (start == 0) {
453 end++; // Ommit the leading slash if there is nothing before it.
454 }
455 }
456 if (start > 0 && pattern.charAt(--start) != '/') {
457 continue;
458 }
459 String reduced = pattern.substring(0, start) + pattern.substring(end);
460 patterns.add(DEFAULT_SYNTAX + reduced);
461 addPatternsWithOneDirRemoved(patterns, reduced, start);
462 }
463 }
464
465 /**
466 * Applies some heuristic rules for simplifying the set of patterns,
467 * then returns the patterns as an array.
468 *
469 * @param patterns the patterns to simplify and return asarray
470 * @param excludes whether the patterns are exclude patterns
471 * @return the set content as an array, after simplification
472 */
473 private static String[] simplify(Set<String> patterns, boolean excludes) {
474 /*
475 * If the "**" pattern is present, it makes all other patterns useless.
476 * In the case of include patterns, an empty set means to include everything.
477 */
478 if (patterns.remove("**")) {
479 patterns.clear();
480 if (excludes) {
481 patterns.add("**");
482 }
483 }
484 return patterns.toArray(String[]::new);
485 }
486
487 /**
488 * Eventually adds the parent directory of the given patterns, without duplicated values.
489 * The patterns given to this method should have been normalized.
490 *
491 * @param patterns the normalized include or exclude patterns
492 * @param excludes whether the patterns are exclude patterns
493 * @return pattens of directories to include or exclude
494 */
495 private static String[] directoryPatterns(final String[] patterns, final boolean excludes) {
496 // TODO: use `LinkedHashSet.newLinkedHashSet(int)` instead with JDK19.
497 final var directories = new LinkedHashSet<String>(patterns.length);
498 for (String pattern : patterns) {
499 if (pattern.startsWith(DEFAULT_SYNTAX)) {
500 if (excludes) {
501 if (pattern.endsWith("/**")) {
502 directories.add(pattern.substring(0, pattern.length() - 3));
503 }
504 } else {
505 int s = pattern.indexOf(':');
506 if (pattern.regionMatches(++s, "**/", 0, 3)) {
507 s = pattern.indexOf('/', s + 3);
508 if (s < 0) {
509 return new String[0]; // Pattern is "**", so we need to accept everything.
510 }
511 directories.add(pattern.substring(0, s));
512 }
513 }
514 }
515 }
516 return simplify(directories, excludes);
517 }
518
519 /**
520 * Creates the path matchers for the given patterns.
521 * The syntax (usually {@value #DEFAULT_SYNTAX}) must be specified for each pattern.
522 */
523 private static PathMatcher[] matchers(final FileSystem fs, final String[] patterns) {
524 final var matchers = new PathMatcher[patterns.length];
525 for (int i = 0; i < patterns.length; i++) {
526 matchers[i] = fs.getPathMatcher(patterns[i]);
527 }
528 return matchers;
529 }
530
531 /**
532 * {@return a potentially simpler matcher equivalent to this matcher}.
533 */
534 @SuppressWarnings("checkstyle:MissingSwitchDefault")
535 public PathMatcher simplify() {
536 if (excludes.length == 0) {
537 switch (includes.length) {
538 case 0:
539 return INCLUDES_ALL;
540 case 1:
541 return includes[0];
542 }
543 }
544 return this;
545 }
546
547 /**
548 * Determines whether a path is selected.
549 * This is true if the given file matches an include pattern and no exclude pattern.
550 *
551 * @param path the pathname to test, must not be {@code null}
552 * @return {@code true} if the given path is selected, {@code false} otherwise
553 */
554 @Override
555 public boolean matches(Path path) {
556 path = baseDirectory.relativize(path);
557 return (includes.length == 0 || isMatched(path, includes))
558 && (excludes.length == 0 || !isMatched(path, excludes));
559 }
560
561 /**
562 * {@return whether the given file matches according to one of the given matchers}.
563 */
564 private static boolean isMatched(Path path, PathMatcher[] matchers) {
565 for (PathMatcher matcher : matchers) {
566 if (matcher.matches(path)) {
567 return true;
568 }
569 }
570 return false;
571 }
572
573 /**
574 * Determines whether a directory could contain selected paths.
575 *
576 * @param directory the directory pathname to test, must not be {@code null}
577 * @return {@code true} if the given directory might contain selected paths, {@code false} if the
578 * directory will definitively not contain selected paths
579 */
580 public boolean couldHoldSelected(Path directory) {
581 if (baseDirectory.equals(directory)) {
582 return true;
583 }
584 directory = baseDirectory.relativize(directory);
585 return (dirIncludes.length == 0 || isMatched(directory, dirIncludes))
586 && (dirExcludes.length == 0 || !isMatched(directory, dirExcludes));
587 }
588
589 /**
590 * Appends the elements of the given array in the given buffer.
591 * This is a helper method for {@link #toString()} implementations.
592 *
593 * @param buffer the buffer to add the elements to
594 * @param label label identifying the array of elements to add
595 * @param patterns the elements to append, or {@code null} if none
596 */
597 private static void append(StringBuilder buffer, String label, String[] patterns) {
598 buffer.append(label).append(": [");
599 if (patterns != null) {
600 for (int i = 0; i < patterns.length; i++) {
601 if (i != 0) {
602 buffer.append(", ");
603 }
604 buffer.append(patterns[i]);
605 }
606 }
607 buffer.append(']');
608 }
609
610 /**
611 * {@return a string representation for logging purposes}.
612 */
613 @Override
614 public String toString() {
615 var buffer = new StringBuilder();
616 append(buffer, "includes", includePatterns);
617 append(buffer.append(", "), "excludes", excludePatterns);
618 return buffer.toString();
619 }
620 }