1 package org.apache.maven.doxia.module.twiki.parser;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
27
28
29
30
31
32
33
34 public class TextParser
35 {
36
37
38
39 private static final Pattern WIKIWORD_PATTERN =
40 Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" );
41
42
43
44
45 private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" );
46
47
48
49
50 private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" );
51
52
53
54
55 private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" );
56
57
58
59
60 private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" );
61
62
63
64
65 private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" );
66
67
68
69
70 private static final Pattern IMAGE_TAG_PATTERN =
71 Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE );
72
73
74 private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL );
75
76
77
78
79 private final WikiWordLinkResolver wikiWordLinkResolver;
80
81
82 private boolean noautolink;
83
84
85
86
87
88
89 public TextParser( final WikiWordLinkResolver resolver )
90 {
91 this.wikiWordLinkResolver = resolver;
92 }
93
94
95
96
97
98
99
100 public final List<Block> parse( final String line )
101 {
102 final List<Block> ret = new ArrayList<Block>();
103
104 final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line );
105 final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line );
106 final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line );
107 final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line );
108 final Matcher urlMatcher = URL_PATTERN.matcher( line );
109 final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line );
110
111 final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line );
112 Matcher xhtmlMatcher = null;
113 if ( tagMatcher.find() )
114 {
115 String tag = tagMatcher.group( 2 );
116
117 Pattern pattern =
118 Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL );
119 xhtmlMatcher = pattern.matcher( line );
120 }
121
122 if ( xhtmlMatcher != null && xhtmlMatcher.find() )
123 {
124 parseXHTML( line, ret, xhtmlMatcher );
125 }
126 else if ( linkMatcher.find() )
127 {
128 parseLink( line, ret, linkMatcher );
129 }
130 else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink )
131 {
132 parseWiki( line, ret, wikiMatcher );
133 }
134 else if ( forcedLinkMatcher.find() )
135 {
136 parseForcedLink( line, ret, forcedLinkMatcher );
137 }
138 else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) )
139 {
140 parseAnchor( line, ret, anchorMatcher );
141 }
142 else if ( urlMatcher.find() && isAWord( urlMatcher, line ) )
143 {
144 parseUrl( line, ret, urlMatcher );
145 }
146 else if ( imageTagMatcher.find() )
147 {
148 parseImage( line, ret, imageTagMatcher );
149 }
150 else
151 {
152 if ( line.length() != 0 )
153 {
154 ret.add( new TextBlock( line ) );
155 }
156 }
157
158 return ret;
159 }
160
161
162
163
164
165
166
167 private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher )
168 {
169 ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) );
170 final String src = imageTagMatcher.group( 2 );
171 ret.add( new ImageBlock( src ) );
172 ret.addAll( parse( line.substring( imageTagMatcher.end(), line.length() ) ) );
173 }
174
175
176
177
178
179
180
181 private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher )
182 {
183 ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) );
184 final String url = urlMatcher.group( 0 );
185 final Matcher imageMatcher = IMAGE_PATTERN.matcher( url );
186 if ( imageMatcher.matches() )
187 {
188 ret.add( new ImageBlock( url ) );
189 }
190 else
191 {
192 ret.add( new LinkBlock( url, new TextBlock( url ) ) );
193 }
194 ret.addAll( parse( line.substring( urlMatcher.end(), line.length() ) ) );
195 }
196
197
198
199
200
201
202
203 private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher )
204 {
205 ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) );
206 ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) );
207 ret.addAll( parse( line.substring( anchorMatcher.end(), line.length() ) ) );
208 }
209
210
211
212
213
214
215
216 private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher )
217 {
218 if ( forcedLinkMatcher.group( 1 ) != null )
219 {
220 ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) );
221 }
222 else
223 {
224 final String showText = forcedLinkMatcher.group( 3 );
225
226 if ( showText.trim().startsWith( "mailto:" ) )
227 {
228 String s = showText.trim();
229 int i = s.indexOf( ' ' );
230 if ( i == -1 )
231 {
232 ret.add( new TextBlock( s ) );
233 }
234 else
235 {
236 ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) );
237 }
238 }
239 else
240 {
241 ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) );
242 ret.add( createLink( showText, showText ) );
243 ret.addAll( parse( line.substring( forcedLinkMatcher.end(), line.length() ) ) );
244 }
245 }
246 }
247
248
249
250
251
252
253
254 private Block createLink( final String link, final String showText )
255 {
256 final Block content;
257 if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() )
258 {
259 content = new ImageBlock( showText );
260 }
261 else
262 {
263 content = new TextBlock( showText );
264 }
265
266 if ( URL_PATTERN.matcher( link ).matches() )
267 {
268 return new LinkBlock( link, content );
269 }
270
271 final StringTokenizer tokenizer = new StringTokenizer( link );
272 final StringBuffer sb = new StringBuffer();
273
274 while ( tokenizer.hasMoreElements() )
275 {
276 final String s = tokenizer.nextToken();
277 sb.append( s.substring( 0, 1 ).toUpperCase() );
278 sb.append( s.substring( 1 ) );
279 }
280 return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver );
281 }
282
283
284
285
286
287
288
289 private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher )
290 {
291 final String wikiWord = wikiMatcher.group();
292 ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) );
293 if ( wikiWord.startsWith( "!" ) )
294 {
295 ret.add( new TextBlock( wikiWord.substring( 1 ) ) );
296 }
297 else
298 {
299 ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) );
300 }
301 ret.addAll( parse( line.substring( wikiMatcher.end(), line.length() ) ) );
302 }
303
304
305
306
307
308
309
310 private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher )
311 {
312 ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) );
313 if ( line.charAt( linkMatcher.start() ) == '!' )
314 {
315 ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) );
316 }
317 else
318 {
319 ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) );
320 }
321 ret.addAll( parse( line.substring( linkMatcher.end(), line.length() ) ) );
322 }
323
324
325
326
327
328
329
330
331 private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher )
332 {
333 if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
334 {
335 noautolink = true;
336 }
337 else
338 {
339 ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) );
340 }
341
342 ret.addAll( parse( xhtmlMatcher.group( 2 ) ) );
343
344 if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
345 {
346 noautolink = false;
347 }
348 else
349 {
350 ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) );
351 }
352
353 ret.addAll( parse( xhtmlMatcher.group( 4 ) ) );
354 }
355
356
357
358
359
360
361
362 private boolean isAWord( final Matcher m, final String line )
363 {
364 return startLikeWord( m, line ) && endLikeWord( m, line );
365 }
366
367
368
369
370
371
372 private boolean startLikeWord( final Matcher m, final String line )
373 {
374 final int start = m.start();
375
376 boolean ret = false;
377 if ( start == 0 )
378 {
379 ret = true;
380 }
381 else if ( start > 0 )
382 {
383 if ( isSpace( line.charAt( start - 1 ) ) )
384 {
385 ret = true;
386 }
387 }
388
389 return ret;
390 }
391
392
393
394
395
396
397 private boolean endLikeWord( final Matcher m, final String line )
398 {
399 final int end = m.end();
400
401 boolean ret = true;
402 if ( end < line.length() )
403 {
404 ret = isSpace( line.charAt( end ) );
405 }
406
407 return ret;
408 }
409
410
411
412
413
414 private boolean isSpace( final char c )
415 {
416 return c == ' ' || c == '\t';
417 }
418 }