1 package org.apache.maven.jxr.util;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22 import java.util.ArrayList;
23 import java.util.Collections;
24 import java.util.List;
25 import java.util.regex.Matcher;
26 import java.util.regex.Pattern;
27
28 /**
29 * This is a small and fast word tokenizer. It has different characteristics from the normal Java tokenizer. It only
30 * considers clear words that are only ended with spaces as strings. EX: "Flight" would be a word but "Flight()" would
31 * not.
32 */
33 public class SimpleWordTokenizer
34 {
35
36 private static final Pattern NONBREAKERS = Pattern.compile( "([^()\\[ {}]+)" );
37
38 /**
39 * Description of the Field
40 */
41 private static final char[] BREAKERS = { '(', ')', '[', ' ', '{', '}' };
42
43 /**
44 * Break the given line into multiple StringUtils
45 */
46 public static List<StringEntry> tokenize( String line )
47 {
48
49 /*
50 * determine where to start processing this String... this could either be the start of the line or just keep
51 * going until the first
52 */
53 int start = getStart( line );
54
55 // find the first non-BREAKER char and assume that is where you want to start
56
57 if ( line == null || line.length() == 0 || start == -1 )
58 {
59 return Collections.emptyList();
60 }
61
62 return tokenize( line, start );
63 }
64
65 /**
66 * Tokenize the given line but only return StringUtils that match the parameter find.
67 *
68 * @param line String to search in
69 * @param find String to match.
70 */
71 public static List<StringEntry> tokenize( String line, String find )
72 {
73
74 List<StringEntry> foundTokens = new ArrayList<>();
75
76 for ( StringEntry se : tokenize( line ) )
77 {
78
79 if ( se.toString().equals( find ) )
80 {
81 foundTokens.add( se );
82 }
83
84 }
85
86 return foundTokens;
87 }
88
89 /**
90 * Internal impl. Specify the start and end.
91 */
92 private static List<StringEntry> tokenize( String line, int start )
93 {
94 Matcher matcher = NONBREAKERS.matcher( line.substring( start ) );
95
96 List<StringEntry> words = new ArrayList<StringEntry>();
97
98 while ( matcher.find() )
99 {
100 StringEntryingEntry.html#StringEntry">StringEntry entry = new StringEntry( matcher.group( 1 ), matcher.start() + start );
101 words.add( entry );
102 }
103
104 return words;
105 }
106
107 /**
108 * Go through the list of BREAKERS and find the closes one.
109 */
110 private static int getStart( String string )
111 {
112
113 for ( int i = 0; i < string.length(); ++i )
114 {
115
116 if ( !isBreaker( string.charAt( i ) ) )
117 {
118 return i;
119 }
120
121 }
122
123 return -1;
124 }
125
126 /**
127 * Return true if the given char is considered a breaker.
128 */
129 private static boolean isBreaker( char c )
130 {
131
132 for ( int i = 0; i < BREAKERS.length; ++i )
133 {
134
135 if ( BREAKERS[i] == c )
136 {
137 return true;
138 }
139
140 }
141
142 return false;
143 }
144
145 }