1 package org.apache.maven.jxr.util;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22 import java.util.Collections;
23 import java.util.Vector;
24
25 /**
26 * This is a small and fast word tokenizer. It has different characteristics
27 * from the normal Java tokenizer. It only considers clear words that are only
28 * ended with spaces as strings. EX: "Flight" would be a word but "Flight()"
29 * would not.
30 */
31 public class SimpleWordTokenizer
32 {
33
34 /**
35 * Description of the Field
36 */
37 public static final char[] BREAKERS = {'(', ')', '[', ' ', '{', '}'};
38
39 /**
40 * Break the given line into multiple StringUtils
41 */
42 public static StringEntry[] tokenize( String line )
43 {
44
45 /*
46 determine where to start processing this String... this could
47 either be the start of the line or just keep going until the first
48 */
49 int start = getStart( line );
50
51 //find the first non-BREAKER char and assume that is where you want to start
52
53 if ( line == null || line.length() == 0 || start == -1 )
54 {
55 return new StringEntry[0];
56 }
57
58 return tokenize( line, start );
59 }
60
61
62 /**
63 * Tokenize the given line but only return StringUtils that match the parameter
64 * find.
65 *
66 * @param line String to search in
67 * @param find String to match.
68 */
69 public static StringEntry[] tokenize( String line, String find )
70 {
71
72 Vector v = new Vector();
73
74 StringEntry[] se = tokenize( line );
75
76 for ( int i = 0; i < se.length; ++i )
77 {
78
79 if ( se[i].toString().equals( find ) )
80 {
81 v.addElement( se[i] );
82 }
83
84 }
85
86 StringEntry[] found = new StringEntry[v.size()];
87 Collections.sort( v );
88 v.copyInto( found );
89 return found;
90 }
91
92 /**
93 * Internal impl. Specify the start and end.
94 */
95 private static StringEntry[] tokenize( String line, int start )
96 {
97
98 Vector words = new Vector();
99
100 //algorithm works like this... break the line out into segments
101 //that are separated by spaces, and if the entire String doesn't contain
102 //a non-Alpha char then assume it is a word.
103 while ( true )
104 {
105
106 int next = getNextBreak( line, start );
107
108 if ( next < 0 || next <= start )
109 {
110 break;
111 }
112
113 String word = line.substring( start, next );
114
115 if ( isWord( word ) )
116 {
117 words.addElement( new StringEntry( word, start ) );
118 }
119
120 start = next + 1;
121 }
122
123 StringEntry[] found = new StringEntry[words.size()];
124 words.copyInto( found );
125 return found;
126 }
127
128
129 /**
130 * Go through the entire String and if any character is not a Java identifier part (_, a, b,
131 * c, d, etc) then return false.
132 */
133 private static boolean isWord( String string )
134 {
135
136 if ( string == null || string.length() == 0 )
137 {
138
139 return false;
140 }
141
142 for ( int i = 0; i < string.length(); ++i )
143 {
144
145 char c = string.charAt( i );
146
147 if ( !Character.isJavaIdentifierPart( c ) && c != '.' )
148 {
149 return false;
150 }
151
152 }
153
154 return true;
155 }
156
157 /**
158 * Go through the list of BREAKERS and find the closes one.
159 */
160 private static int getNextBreak( String string, int start )
161 {
162
163 int breakPoint = -1;
164
165 for ( int i = 0; i < BREAKERS.length; ++i )
166 {
167
168 int next = string.indexOf( BREAKERS[i], start );
169
170 if ( breakPoint == -1 || next < breakPoint && next != -1 )
171 {
172
173 breakPoint = next;
174
175 }
176
177 }
178
179 //if the breakPoint is still -1 go to the end of the string
180 if ( breakPoint == -1 )
181 {
182 breakPoint = string.length();
183 }
184
185 return breakPoint;
186 }
187
188 /**
189 * Go through the list of BREAKERS and find the closes one.
190 */
191 private static int getStart( String string )
192 {
193
194 for ( int i = 0; i < string.length(); ++i )
195 {
196
197 if ( isBreaker( string.charAt( i ) ) == false )
198 {
199 return i;
200 }
201
202 }
203
204 return -1;
205 }
206
207
208 /**
209 * Return true if the given char is considered a breaker.
210 */
211 private static boolean isBreaker( char c )
212 {
213
214 for ( int i = 0; i < BREAKERS.length; ++i )
215 {
216
217 if ( BREAKERS[i] == c )
218 {
219 return true;
220 }
221
222 }
223
224 return false;
225 }
226
227 }
228