1 package org.apache.maven.jxr.util;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Vector;
24
25 /**
26 * This is a small and fast word tokenizer. It has different characteristics from the normal Java tokenizer. It only
27 * considers clear words that are only ended with spaces as strings. EX: "Flight" would be a word but "Flight()" would
28 * not.
29 */
30 public class SimpleWordTokenizer
31 {
32
33 /**
34 * Description of the Field
35 */
36 public static final char[] BREAKERS = { '(', ')', '[', ' ', '{', '}' };
37
38 /**
39 * Break the given line into multiple StringUtils
40 */
41 public static StringEntry[] tokenize( String line )
42 {
43
44 /*
45 * determine where to start processing this String... this could either be the start of the line or just keep
46 * going until the first
47 */
48 int start = getStart( line );
49
50 // find the first non-BREAKER char and assume that is where you want to start
51
52 if ( line == null || line.length() == 0 || start == -1 )
53 {
54 return new StringEntry[0];
55 }
56
57 return tokenize( line, start );
58 }
59
60 /**
61 * Tokenize the given line but only return StringUtils that match the parameter find.
62 *
63 * @param line String to search in
64 * @param find String to match.
65 */
66 public static StringEntry[] tokenize( String line, String find )
67 {
68
69 Vector<StringEntry> v = new Vector<StringEntry>();
70
71 for ( StringEntry se : tokenize( line ) )
72 {
73
74 if ( se.toString().equals( find ) )
75 {
76 v.addElement( se );
77 }
78
79 }
80
81 StringEntry[] found = new StringEntry[v.size()];
82 Collections.sort( v );
83 v.copyInto( found );
84 return found;
85 }
86
87 /**
88 * Internal impl. Specify the start and end.
89 */
90 private static StringEntry[] tokenize( String line, int start )
91 {
92
93 Vector<StringEntry> words = new Vector<StringEntry>();
94
95 // algorithm works like this... break the line out into segments
96 // that are separated by spaces, and if the entire String doesn't contain
97 // a non-Alpha char then assume it is a word.
98 while ( true )
99 {
100
101 int next = getNextBreak( line, start );
102
103 if ( next < 0 || next <= start )
104 {
105 break;
106 }
107
108 String word = line.substring( start, next );
109
110 if ( isWord( word ) )
111 {
112 words.addElement( new StringEntry( word, start ) );
113 }
114
115 start = next + 1;
116 }
117
118 StringEntry[] found = new StringEntry[words.size()];
119 words.copyInto( found );
120 return found;
121 }
122
123 /**
124 * Go through the entire String and if any character is not a Java identifier part (_, a, b, c, d, etc) then return
125 * false.
126 */
127 private static boolean isWord( String string )
128 {
129
130 if ( string == null || string.length() == 0 )
131 {
132
133 return false;
134 }
135
136 for ( int i = 0; i < string.length(); ++i )
137 {
138
139 char c = string.charAt( i );
140
141 if ( !Character.isJavaIdentifierPart( c ) && c != '.' )
142 {
143 return false;
144 }
145
146 }
147
148 return true;
149 }
150
151 /**
152 * Go through the list of BREAKERS and find the closes one.
153 */
154 private static int getNextBreak( String string, int start )
155 {
156
157 int breakPoint = -1;
158
159 for ( int i = 0; i < BREAKERS.length; ++i )
160 {
161
162 int next = string.indexOf( BREAKERS[i], start );
163
164 if ( breakPoint == -1 || next < breakPoint && next != -1 )
165 {
166
167 breakPoint = next;
168
169 }
170
171 }
172
173 // if the breakPoint is still -1 go to the end of the string
174 if ( breakPoint == -1 )
175 {
176 breakPoint = string.length();
177 }
178
179 return breakPoint;
180 }
181
182 /**
183 * Go through the list of BREAKERS and find the closes one.
184 */
185 private static int getStart( String string )
186 {
187
188 for ( int i = 0; i < string.length(); ++i )
189 {
190
191 if ( !isBreaker( string.charAt( i ) ) )
192 {
193 return i;
194 }
195
196 }
197
198 return -1;
199 }
200
201 /**
202 * Return true if the given char is considered a breaker.
203 */
204 private static boolean isBreaker( char c )
205 {
206
207 for ( int i = 0; i < BREAKERS.length; ++i )
208 {
209
210 if ( BREAKERS[i] == c )
211 {
212 return true;
213 }
214
215 }
216
217 return false;
218 }
219
220 }