View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.jxr.util;
20  
21  import java.util.ArrayList;
22  import java.util.Collections;
23  import java.util.List;
24  import java.util.regex.Matcher;
25  import java.util.regex.Pattern;
26  
27  /**
28   * This is a small and fast word tokenizer. It has different characteristics from the normal Java tokenizer. It only
29   * considers clear words that are only ended with spaces as strings. EX: "Flight" would be a word but "Flight()" would
30   * not.
31   */
32  public class SimpleWordTokenizer {
33  
34      private static final Pattern NONBREAKERS = Pattern.compile("([^()\\[ {}]+)");
35  
36      private static final char[] BREAKERS = {'(', ')', '[', ' ', '{', '}'};
37  
38      /**
39       * Breaks the given line into multiple tokens.
40       *
41       * @param line line to tokenize
42       * @return list of tokens
43       */
44      public static List<StringEntry> tokenize(String line) {
45  
46          /*
47           * determine where to start processing this String... this could either be the start of the line or just keep
48           * going until the first
49           */
50          int start = getStart(line);
51  
52          // find the first non-BREAKER char and assume that is where you want to start
53  
54          if (line == null || line.length() == 0 || start == -1) {
55              return Collections.emptyList();
56          }
57  
58          return tokenize(line, start);
59      }
60  
61      /**
62       * Tokenize the given line but only return those tokens that match the parameter {@code find}.
63       *
64       * @param line line to search in
65       * @param find String to match
66       * @return list of matching tokens
67       */
68      public static List<StringEntry> tokenize(String line, String find) {
69  
70          List<StringEntry> foundTokens = new ArrayList<>();
71  
72          for (StringEntry se : tokenize(line)) {
73  
74              if (se.toString().equals(find)) {
75                  foundTokens.add(se);
76              }
77          }
78  
79          return foundTokens;
80      }
81  
82      /**
83       * Internal impl. Specify the start and end.
84       */
85      private static List<StringEntry> tokenize(String line, int start) {
86          Matcher matcher = NONBREAKERS.matcher(line.substring(start));
87  
88          List<StringEntry> words = new ArrayList<>();
89  
90          while (matcher.find()) {
91              StringEntry entry = new StringEntry(matcher.group(1), matcher.start() + start);
92              words.add(entry);
93          }
94  
95          return words;
96      }
97  
98      /**
99       * Go through the list of BREAKERS and find the closes one.
100      */
101     private static int getStart(String string) {
102 
103         for (int i = 0; i < string.length(); ++i) {
104 
105             if (!isBreaker(string.charAt(i))) {
106                 return i;
107             }
108         }
109 
110         return -1;
111     }
112 
113     /**
114      * Return true if the given char is considered a breaker.
115      */
116     private static boolean isBreaker(char c) {
117 
118         for (char breaker : BREAKERS) {
119 
120             if (breaker == c) {
121                 return true;
122             }
123         }
124 
125         return false;
126     }
127 }