View Javadoc

1   package org.apache.maven.doxia.module.twiki.parser;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.util.ArrayList;
23  import java.util.List;
24  import java.util.StringTokenizer;
25  import java.util.regex.Matcher;
26  import java.util.regex.Pattern;
27  
28  /**
29   * Parse almost plain text in search of WikiWords, links, ...
30   *
31   * @author Juan F. Codagnone
32   * @version $Id: TextParser.java 1090706 2011-04-09 23:15:28Z hboutemy $
33   */
34  public class TextParser
35  {
36      /**
37       * pattern to detect WikiWords
38       */
39      private static final Pattern WIKIWORD_PATTERN =
40          Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" );
41  
42      /**
43       * pattern to detect SpecificLinks links [[reference][text]]
44       */
45      private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" );
46  
47      /**
48       * pattern to detect ForcedLinks links [[reference asd]]
49       */
50      private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" );
51  
52      /**
53       * anchor name
54       */
55      private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" );
56  
57      /**
58       * url word
59       */
60      private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" );
61  
62      /**
63       *  image pattern specification
64       */
65      private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" );
66  
67      /**
68       *  image tag pattern specification (used for images at relative URLs)
69       */
70      private static final Pattern IMAGE_TAG_PATTERN =
71          Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE );
72  
73      /** HTML tag pattern */
74      private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL );
75  
76      /**
77       * resolves wikiWordLinks
78       */
79      private final WikiWordLinkResolver wikiWordLinkResolver;
80  
81      /** resolves noautolink tag */
82      private boolean noautolink;
83  
84      /**
85       * Creates the TextParser.
86       *
87       * @param resolver resolver for wikiWord links
88       */
89      public TextParser( final WikiWordLinkResolver resolver )
90      {
91          this.wikiWordLinkResolver = resolver;
92      }
93  
94      /**
95       * <p>parse.</p>
96       *
97       * @param line line to parse
98       * @return a list of block that represents the input
99       */
100     public final List<Block> parse( final String line )
101     {
102         final List<Block> ret = new ArrayList<Block>();
103 
104         final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line );
105         final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line );
106         final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line );
107         final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line );
108         final Matcher urlMatcher = URL_PATTERN.matcher( line );
109         final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line );
110 
111         final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line );
112         Matcher xhtmlMatcher = null;
113         if ( tagMatcher.find() )
114         {
115             String tag = tagMatcher.group( 2 );
116 
117             Pattern pattern =
118                 Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL );
119             xhtmlMatcher = pattern.matcher( line );
120         }
121 
122         if ( xhtmlMatcher != null && xhtmlMatcher.find() )
123         {
124             parseXHTML( line, ret, xhtmlMatcher );
125         }
126         else if ( linkMatcher.find() )
127         {
128             parseLink( line, ret, linkMatcher );
129         }
130         else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink )
131         {
132             parseWiki( line, ret, wikiMatcher );
133         }
134         else if ( forcedLinkMatcher.find() )
135         {
136             parseForcedLink( line, ret, forcedLinkMatcher );
137         }
138         else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) )
139         {
140             parseAnchor( line, ret, anchorMatcher );
141         }
142         else if ( urlMatcher.find() && isAWord( urlMatcher, line ) )
143         {
144             parseUrl( line, ret, urlMatcher );
145         }
146         else if ( imageTagMatcher.find() )
147         {
148             parseImage( line, ret, imageTagMatcher );
149         }
150         else
151         {
152             if ( line.length() != 0 )
153             {
154                 ret.add( new TextBlock( line ) );
155             }
156         }
157 
158         return ret;
159     }
160 
161     /**
162      * Parses the image tag
163      * @param line the line to parse
164      * @param ret where the results live
165      * @param imageTagMatcher image tag matcher
166      */
167     private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher )
168     {
169         ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) );
170         final String src = imageTagMatcher.group( 2 );
171         ret.add( new ImageBlock( src ) );
172         ret.addAll( parse( line.substring( imageTagMatcher.end(), line.length() ) ) );
173     }
174 
175     /**
176      * Parses the url
177      * @param line the line to parse
178      * @param ret where the results live
179      * @param urlMatcher url matcher
180      */
181     private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher )
182     {
183         ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) );
184         final String url = urlMatcher.group( 0 );
185         final Matcher imageMatcher = IMAGE_PATTERN.matcher( url );
186         if ( imageMatcher.matches() )
187         {
188             ret.add( new ImageBlock( url ) );
189         }
190         else
191         {
192             ret.add( new LinkBlock( url, new TextBlock( url ) ) );
193         }
194         ret.addAll( parse( line.substring( urlMatcher.end(), line.length() ) ) );
195     }
196 
197     /**
198      * Parses the anchor
199      * @param line the line to parse
200      * @param ret where the results live
201      * @param anchorMatcher anchor matcher
202      */
203     private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher )
204     {
205         ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) );
206         ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) );
207         ret.addAll( parse( line.substring( anchorMatcher.end(), line.length() ) ) );
208     }
209 
210     /**
211      * Parses the link
212      * @param line line to parse
213      * @param ret where the results live
214      * @param forcedLinkMatcher forced link matcher
215      */
216     private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher )
217     {
218         if ( forcedLinkMatcher.group( 1 ) != null )
219         {
220             ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) );
221         }
222         else
223         {
224             final String showText = forcedLinkMatcher.group( 3 );
225             // mailto link:
226             if ( showText.trim().startsWith( "mailto:" ) )
227             {
228                 String s = showText.trim();
229                 int i = s.indexOf( ' ' );
230                 if ( i == -1 )
231                 {
232                     ret.add( new TextBlock( s ) );
233                 }
234                 else
235                 {
236                     ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) );
237                 }
238             }
239             else
240             {
241                 ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) );
242                 ret.add( createLink( showText, showText ) );
243                 ret.addAll( parse( line.substring( forcedLinkMatcher.end(), line.length() ) ) );
244             }
245         }
246     }
247 
248     /**
249      * Decides between a WikiWordBlock or a a LinkBlock
250      * @param link the link text
251      * @param showText the show text.
252      * @return either a WikiWordBlock or a LinkBlock
253      */
254     private Block createLink( final String link, final String showText )
255     {
256         final Block content;
257         if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() )
258         {
259             content = new ImageBlock( showText );
260         }
261         else
262         {
263             content = new TextBlock( showText );
264         }
265 
266         if ( URL_PATTERN.matcher( link ).matches() )
267         {
268             return new LinkBlock( link, content );
269         }
270 
271         final StringTokenizer tokenizer = new StringTokenizer( link );
272         final StringBuffer sb = new StringBuffer();
273 
274         while ( tokenizer.hasMoreElements() )
275         {
276             final String s = tokenizer.nextToken();
277             sb.append( s.substring( 0, 1 ).toUpperCase() );
278             sb.append( s.substring( 1 ) );
279         }
280         return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver );
281     }
282 
283     /**
284      * Parses a wiki word
285      * @param line the line to parse
286      * @param ret where the results live
287      * @param wikiMatcher wiki matcher
288      */
289     private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher )
290     {
291         final String wikiWord = wikiMatcher.group();
292         ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) );
293         if ( wikiWord.startsWith( "!" ) )
294         { // link prevention
295             ret.add( new TextBlock( wikiWord.substring( 1 ) ) );
296         }
297         else
298         {
299             ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) );
300         }
301         ret.addAll( parse( line.substring( wikiMatcher.end(), line.length() ) ) );
302     }
303 
304     /**
305      * Parses a link
306      * @param line the line to parse
307      * @param ret where the results live
308      * @param linkMatcher link matcher
309      */
310     private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher )
311     {
312         ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) );
313         if ( line.charAt( linkMatcher.start() ) == '!' )
314         {
315             ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) );
316         }
317         else
318         {
319             ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) );
320         }
321         ret.addAll( parse( line.substring( linkMatcher.end(), line.length() ) ) );
322     }
323 
324     /**
325      * Parses xhtml.
326      *
327      * @param line the line to parse
328      * @param ret where the results live
329      * @param xhtmlMatcher xhtml matcher
330      */
331     private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher )
332     {
333         if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
334         {
335             noautolink = true;
336         }
337         else
338         {
339             ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) );
340         }
341 
342         ret.addAll( parse( xhtmlMatcher.group( 2 ) ) );
343 
344         if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 )
345         {
346             noautolink = false;
347         }
348         else
349         {
350             ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) );
351         }
352 
353         ret.addAll( parse( xhtmlMatcher.group( 4 ) ) );
354     }
355 
356     /**
357      * @param m    matcher to test
358      * @param line line to test
359      * @return <code>true</code> if the match on m represent a word (must be
360      *         a space before the word or must be the beginning of the line)
361      */
362     private boolean isAWord( final Matcher m, final String line )
363     {
364         return startLikeWord( m, line ) && endLikeWord( m, line );
365     }
366 
367     /**
368      * @param m matcher to test
369      * @param line line to test
370      * @return true if it is the beginning of a word
371      */
372     private boolean startLikeWord( final Matcher m, final String line )
373     {
374         final int start = m.start();
375 
376         boolean ret = false;
377         if ( start == 0 )
378         {
379             ret = true;
380         }
381         else if ( start > 0 )
382         {
383             if ( isSpace( line.charAt( start - 1 ) ) )
384             {
385                 ret = true;
386             }
387         }
388 
389         return ret;
390     }
391 
392     /**
393      * @param m matcher to test
394      * @param line line to test
395      * @return true if it is the end of a word
396      */
397     private boolean endLikeWord( final Matcher m, final String line )
398     {
399         final int end = m.end();
400 
401         boolean ret = true;
402         if ( end < line.length() )
403         {
404             ret = isSpace( line.charAt( end ) );
405         }
406 
407         return ret;
408     }
409 
410     /**
411      * @param c char to test
412      * @return <code>true</code> if c is a space char
413      */
414     private boolean isSpace( final char c )
415     {
416         return c == ' ' || c == '\t';
417     }
418 }