View Javadoc
1   package org.apache.maven.jxr.util;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.util.Collections;
23  import java.util.Vector;
24  
25  /**
26   * This is a small and fast word tokenizer. It has different characteristics
27   * from the normal Java tokenizer. It only considers clear words that are only
28   * ended with spaces as strings. EX: "Flight" would be a word but "Flight()"
29   * would not.
30   */
31  public class SimpleWordTokenizer
32  {
33  
34      /**
35       * Description of the Field
36       */
37      public static final char[] BREAKERS = {'(', ')', '[', ' ', '{', '}'};
38  
39      /**
40       * Break the given line into multiple StringUtils
41       */
42      public static StringEntry[] tokenize( String line )
43      {
44  
45          /*
46          determine where to start processing this String... this could
47          either be the start of the line or just keep going until the first
48          */
49          int start = getStart( line );
50  
51          //find the first non-BREAKER char and assume that is where you want to start
52  
53          if ( line == null || line.length() == 0 || start == -1 )
54          {
55              return new StringEntry[0];
56          }
57  
58          return tokenize( line, start );
59      }
60  
61  
62      /**
63       * Tokenize the given line but only return StringUtils that match the parameter
64       * find.
65       *
66       * @param line String to search in
67       * @param find String to match.
68       */
69      public static StringEntry[] tokenize( String line, String find )
70      {
71  
72          Vector v = new Vector();
73  
74          StringEntry[] se = tokenize( line );
75  
76          for ( int i = 0; i < se.length; ++i )
77          {
78  
79              if ( se[i].toString().equals( find ) )
80              {
81                  v.addElement( se[i] );
82              }
83  
84          }
85  
86          StringEntry[] found = new StringEntry[v.size()];
87          Collections.sort( v );
88          v.copyInto( found );
89          return found;
90      }
91  
92      /**
93       * Internal impl. Specify the start and end.
94       */
95      private static StringEntry[] tokenize( String line, int start )
96      {
97  
98          Vector words = new Vector();
99  
100         //algorithm works like this... break the line out into segments
101         //that are separated by spaces, and if the entire String doesn't contain
102         //a non-Alpha char then assume it is a word.
103         while ( true )
104         {
105 
106             int next = getNextBreak( line, start );
107 
108             if ( next < 0 || next <= start )
109             {
110                 break;
111             }
112 
113             String word = line.substring( start, next );
114 
115             if ( isWord( word ) )
116             {
117                 words.addElement( new StringEntry( word, start ) );
118             }
119 
120             start = next + 1;
121         }
122 
123         StringEntry[] found = new StringEntry[words.size()];
124         words.copyInto( found );
125         return found;
126     }
127 
128 
129     /**
130      * Go through the entire String and if any character is not a Java identifier part (_, a, b,
131      * c, d, etc) then return false.
132      */
133     private static boolean isWord( String string )
134     {
135 
136         if ( string == null || string.length() == 0 )
137         {
138 
139             return false;
140         }
141 
142         for ( int i = 0; i < string.length(); ++i )
143         {
144 
145             char c = string.charAt( i );
146 
147             if ( !Character.isJavaIdentifierPart( c ) && c != '.' )
148             {
149                 return false;
150             }
151 
152         }
153 
154         return true;
155     }
156 
157     /**
158      * Go through the list of BREAKERS and find the closes one.
159      */
160     private static int getNextBreak( String string, int start )
161     {
162 
163         int breakPoint = -1;
164 
165         for ( int i = 0; i < BREAKERS.length; ++i )
166         {
167 
168             int next = string.indexOf( BREAKERS[i], start );
169 
170             if ( breakPoint == -1 || next < breakPoint && next != -1 )
171             {
172 
173                 breakPoint = next;
174 
175             }
176 
177         }
178 
179         //if the breakPoint is still -1 go to the end of the string
180         if ( breakPoint == -1 )
181         {
182             breakPoint = string.length();
183         }
184 
185         return breakPoint;
186     }
187 
188     /**
189      * Go through the list of BREAKERS and find the closes one.
190      */
191     private static int getStart( String string )
192     {
193 
194         for ( int i = 0; i < string.length(); ++i )
195         {
196 
197             if ( isBreaker( string.charAt( i ) ) == false )
198             {
199                 return i;
200             }
201 
202         }
203 
204         return -1;
205     }
206 
207 
208     /**
209      * Return true if the given char is considered a breaker.
210      */
211     private static boolean isBreaker( char c )
212     {
213 
214         for ( int i = 0; i < BREAKERS.length; ++i )
215         {
216 
217             if ( BREAKERS[i] == c )
218             {
219                 return true;
220             }
221 
222         }
223 
224         return false;
225     }
226 
227 }
228