1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.maven.jxr.util;
20
21 import java.util.ArrayList;
22 import java.util.Collections;
23 import java.util.List;
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26
27 /**
28 * This is a small and fast word tokenizer. It has different characteristics from the normal Java tokenizer. It only
29 * considers clear words that are only ended with spaces as strings. EX: "Flight" would be a word but "Flight()" would
30 * not.
31 */
32 public class SimpleWordTokenizer {
33
34 private static final Pattern NONBREAKERS = Pattern.compile("([^()\\[ {}]+)");
35
36 private static final char[] BREAKERS = {'(', ')', '[', ' ', '{', '}'};
37
38 /**
39 * Breaks the given line into multiple tokens.
40 *
41 * @param line line to tokenize
42 * @return list of tokens
43 */
44 public static List<StringEntry> tokenize(String line) {
45
46 /*
47 * determine where to start processing this String... this could either be the start of the line or just keep
48 * going until the first
49 */
50 int start = getStart(line);
51
52 // find the first non-BREAKER char and assume that is where you want to start
53
54 if (line == null || line.length() == 0 || start == -1) {
55 return Collections.emptyList();
56 }
57
58 return tokenize(line, start);
59 }
60
61 /**
62 * Tokenize the given line but only return those tokens that match the parameter {@code find}.
63 *
64 * @param line line to search in
65 * @param find String to match
66 * @return list of matching tokens
67 */
68 public static List<StringEntry> tokenize(String line, String find) {
69
70 List<StringEntry> foundTokens = new ArrayList<>();
71
72 for (StringEntry se : tokenize(line)) {
73
74 if (se.toString().equals(find)) {
75 foundTokens.add(se);
76 }
77 }
78
79 return foundTokens;
80 }
81
82 /**
83 * Internal impl. Specify the start and end.
84 */
85 private static List<StringEntry> tokenize(String line, int start) {
86 Matcher matcher = NONBREAKERS.matcher(line.substring(start));
87
88 List<StringEntry> words = new ArrayList<>();
89
90 while (matcher.find()) {
91 StringEntry entry = new StringEntry(matcher.group(1), matcher.start() + start);
92 words.add(entry);
93 }
94
95 return words;
96 }
97
98 /**
99 * Go through the list of BREAKERS and find the closes one.
100 */
101 private static int getStart(String string) {
102
103 for (int i = 0; i < string.length(); ++i) {
104
105 if (!isBreaker(string.charAt(i))) {
106 return i;
107 }
108 }
109
110 return -1;
111 }
112
113 /**
114 * Return true if the given char is considered a breaker.
115 */
116 private static boolean isBreaker(char c) {
117
118 for (char breaker : BREAKERS) {
119
120 if (breaker == c) {
121 return true;
122 }
123 }
124
125 return false;
126 }
127 }