1 package org.apache.maven.doxia.module.twiki.parser;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
27
28
29
30
31
32
33
34 public class TextParser
35 {
36
37
38
39 private static final Pattern WIKIWORD_PATTERN =
40 Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" );
41
42
43
44
45 private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" );
46
47
48
49
50 private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" );
51
52
53
54
55 private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" );
56
57
58
59
60 private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" );
61
62
63
64
65 private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" );
66
67
68
69
70 private static final Pattern IMAGE_TAG_PATTERN =
71 Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE );
72
73
74 private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL );
75
76
77
78
79 private final WikiWordLinkResolver wikiWordLinkResolver;
80
81
82 private boolean noautolink;
83
84
85
86
87
88
89 public TextParser( final WikiWordLinkResolver resolver )
90 {
91 this.wikiWordLinkResolver = resolver;
92 }
93
94
95
96
97
98
99
100 public final List<Block> parse( final String line )
101 {
102 final List<Block> ret = new ArrayList<Block>();
103
104 final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line );
105 final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line );
106 final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line );
107 final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line );
108 final Matcher urlMatcher = URL_PATTERN.matcher( line );
109 final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line );
110
111 final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line );
112 Matcher xhtmlMatcher = null;
113 if ( tagMatcher.find() )
114 {
115 String tag = tagMatcher.group( 2 );
116
117 Pattern pattern =
118 Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL );
119 xhtmlMatcher = pattern.matcher( line );
120 }
121
122 if ( xhtmlMatcher != null && xhtmlMatcher.find() )
123 {
124 parseXHTML( line, ret, xhtmlMatcher );
125 }
126 else if ( linkMatcher.find() )
127 {
128 parseLink( line, ret, linkMatcher );
129 }
130 else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink )
131 {
132 parseWiki( line, ret, wikiMatcher );
133 }
134 else if ( forcedLinkMatcher.find() )
135 {
136 parseForcedLink( line, ret, forcedLinkMatcher );
137 }
138 else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) )
139 {
140 parseAnchor( line, ret, anchorMatcher );
141 }
142 else if ( urlMatcher.find() && isAWord( urlMatcher, line ) )
143 {
144 parseUrl( line, ret, urlMatcher );
145 }
146 else if ( imageTagMatcher.find() )
147 {
148 parseImage( line, ret, imageTagMatcher );
149 }
150 else
151 {
152 if ( line.length() != 0 )
153 {
154 ret.add( new TextBlock( line ) );
155 }
156 }
157
158 return ret;
159 }
160
161
162
163
164
165
166
167 private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher )
168 {
169 ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) );
170 final String src = imageTagMatcher.group( 2 );
171 ret.add( new ImageBlock( src ) );
172 ret.addAll( parse( line.substring( imageTagMatcher.end(), line.length() ) ) );
173 }
174
175
176
177
178
179
180
181 private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher )
182 {
183 ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) );
184 final String url = urlMatcher.group( 0 );
185 final Matcher imageMatcher = IMAGE_PATTERN.matcher( url );
186 if ( imageMatcher.matches() )
187 {
188 ret.add( new ImageBlock( url ) );
189 }
190 else
191 {
192 ret.add( new LinkBlock( url, new TextBlock( url ) ) );
193 }
194 ret.addAll( parse( line.substring( urlMatcher.end(), line.length() ) ) );
195 }
196
197
198
199
200
201
202
203 private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher )
204 {
205 ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) );
206 ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) );
207 ret.addAll( parse( line.substring( anchorMatcher.end(), line.length() ) ) );
208 }
209
210
211
212
213
214
215
216 private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher )
217 {
218 if ( forcedLinkMatcher.group( 1 ) != null )
219 {
220 ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) );
221 }
222 else
223 {
224 final String showText = forcedLinkMatcher.group( 3 );
225
226 if ( showText.trim().startsWith( "mailto:" ) )
227 {
228 String s = showText.trim();
229 int i = s.indexOf( ' ' );
230 if ( i == -1 )
231 {
232 ret.add( new TextBlock( s ) );
233 }
234 else
235 {
236 ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) );
237 }
238 }
239 else
240 {
241 ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) );
242 ret.add( createLink( showText, showText ) );
243 ret.addAll( parse( line.substring( forcedLinkMatcher.end(), line.length() ) ) );
244 }
245 }
246 }
247
248
249
250
251
252
253
254 private Block createLink( final String link, final String showText )
255 {
256 final Block content;
257 if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() )
258 {
259 content = new ImageBlock( showText );
260 }
261 else
262 {
263 content = new TextBlock( showText );
264 }
265
266 if ( URL_PATTERN.matcher( link ).matches() )
267 {
268 return new LinkBlock( link, content );
269 }
270
271 final StringTokenizer tokenizer = new StringTokenizer( link );
272 final StringBuilder sb = new StringBuilder();
273
274 while ( tokenizer.hasMoreElements() )
275 {
276 final String s = tokenizer.nextToken();
277 sb.append( s.substring( 0, 1 ).toUpperCase() );
278 sb.append( s.substring( 1 ) );
279 }
280 return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver );
281 }
282
283
284
285
286
287
288
289 private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher )
290 {
291 final String wikiWord = wikiMatcher.group();
292 ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) );
293 if ( wikiWord.startsWith( "!" ) )
294 {
295 ret.add( new TextBlock( wikiWord.substring( 1 ) ) );
296 }
297 else
298 {
299 ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) );
300 }
301 ret.addAll( parse( line.substring( wikiMatcher.end(), line.length() ) ) );
302 }
303
304
305
306
307
308
309
310 private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher )
311 {
312 ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) );
313 if ( line.charAt( linkMatcher.start() ) == '!' )
314 {
315 ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) );
316 }
317 else
318 {
319 ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) );
320 }
321 ret.addAll( parse( line.substring( linkMatcher.end(), line.length() ) ) );
322 }
323
324
325
326
327
328
329
330
331 private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher )
332 {
333 ret.addAll( parse( line.substring( 0, xhtmlMatcher.start() ) ) );
334 if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
335 {
336 noautolink = true;
337 }
338 else
339 {
340 ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) );
341 }
342
343 ret.addAll( parse( xhtmlMatcher.group( 2 ) ) );
344
345 if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
346 {
347 noautolink = false;
348 }
349 else
350 {
351 ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) );
352 }
353
354 ret.addAll( parse( xhtmlMatcher.group( 4 ) ) );
355 }
356
357
358
359
360
361
362
363 private boolean isAWord( final Matcher m, final String line )
364 {
365 return startLikeWord( m, line ) && endLikeWord( m, line );
366 }
367
368
369
370
371
372
373 private boolean startLikeWord( final Matcher m, final String line )
374 {
375 final int start = m.start();
376
377 boolean ret = false;
378 if ( start == 0 )
379 {
380 ret = true;
381 }
382 else if ( start > 0 )
383 {
384 if ( isSpace( line.charAt( start - 1 ) ) )
385 {
386 ret = true;
387 }
388 }
389
390 return ret;
391 }
392
393
394
395
396
397
398 private boolean endLikeWord( final Matcher m, final String line )
399 {
400 final int end = m.end();
401
402 boolean ret = true;
403 if ( end < line.length() )
404 {
405 ret = isSpace( line.charAt( end ) );
406 }
407
408 return ret;
409 }
410
411
412
413
414
415 private boolean isSpace( final char c )
416 {
417 return c == ' ' || c == '\t';
418 }
419 }