View Javadoc
1   package org.apache.maven.jxr.util;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.util.ArrayList;
23  import java.util.Collections;
24  import java.util.List;
25  import java.util.regex.Matcher;
26  import java.util.regex.Pattern;
27  
28  /**
29   * This is a small and fast word tokenizer. It has different characteristics from the normal Java tokenizer. It only
30   * considers clear words that are only ended with spaces as strings. EX: "Flight" would be a word but "Flight()" would
31   * not.
32   */
33  public class SimpleWordTokenizer
34  {
35  
36      private static final Pattern NONBREAKERS = Pattern.compile( "([^()\\[ {}]+)" );
37      
38      /**
39       * Description of the Field
40       */
41      private static final char[] BREAKERS = { '(', ')', '[', ' ', '{', '}' };
42  
43      /**
44       * Break the given line into multiple StringUtils
45       */
46      public static List<StringEntry> tokenize( String line )
47      {
48  
49          /*
50           * determine where to start processing this String... this could either be the start of the line or just keep
51           * going until the first
52           */
53          int start = getStart( line );
54  
55          // find the first non-BREAKER char and assume that is where you want to start
56  
57          if ( line == null || line.length() == 0 || start == -1 )
58          {
59              return Collections.emptyList();
60          }
61  
62          return tokenize( line, start );
63      }
64  
65      /**
66       * Tokenize the given line but only return StringUtils that match the parameter find.
67       *
68       * @param line String to search in
69       * @param find String to match.
70       */
71      public static List<StringEntry> tokenize( String line, String find )
72      {
73  
74          List<StringEntry> foundTokens = new ArrayList<>();
75  
76          for ( StringEntry se : tokenize( line ) )
77          {
78  
79              if ( se.toString().equals( find ) )
80              {
81                  foundTokens.add( se );
82              }
83  
84          }
85  
86          return foundTokens;
87      }
88  
89      /**
90       * Internal impl. Specify the start and end.
91       */
92      private static List<StringEntry> tokenize( String line, int start )
93      {
94          Matcher matcher = NONBREAKERS.matcher( line.substring( start ) );
95  
96          List<StringEntry> words = new ArrayList<>();
97  
98          while ( matcher.find() )
99          {
100             StringEntry entry = new StringEntry( matcher.group( 1 ), matcher.start() + start );
101             words.add( entry );
102         }
103 
104         return words;
105     }
106 
107     /**
108      * Go through the list of BREAKERS and find the closes one.
109      */
110     private static int getStart( String string )
111     {
112 
113         for ( int i = 0; i < string.length(); ++i )
114         {
115 
116             if ( !isBreaker( string.charAt( i ) ) )
117             {
118                 return i;
119             }
120 
121         }
122 
123         return -1;
124     }
125 
126     /**
127      * Return true if the given char is considered a breaker.
128      */
129     private static boolean isBreaker( char c )
130     {
131 
132         for ( char breaker : BREAKERS )
133         {
134 
135             if ( breaker == c )
136             {
137                 return true;
138             }
139 
140         }
141 
142         return false;
143     }
144 
145 }