View Javadoc
1   package org.apache.maven.doxia.module.markdown;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.apache.commons.lang.StringEscapeUtils;
23  import org.apache.commons.lang.StringUtils;
24  import org.apache.maven.doxia.markup.HtmlMarkup;
25  import org.apache.maven.doxia.module.xhtml.XhtmlParser;
26  import org.apache.maven.doxia.parser.AbstractParser;
27  import org.apache.maven.doxia.parser.ParseException;
28  import org.apache.maven.doxia.parser.Parser;
29  import org.apache.maven.doxia.sink.Sink;
30  import org.codehaus.plexus.component.annotations.Component;
31  import org.codehaus.plexus.component.annotations.Requirement;
32  import org.codehaus.plexus.util.IOUtil;
33  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
34  import org.pegdown.Extensions;
35  import org.pegdown.PegDownProcessor;
36  import org.pegdown.ast.HeaderNode;
37  import org.pegdown.ast.HtmlBlockNode;
38  import org.pegdown.ast.Node;
39  import org.pegdown.ast.RootNode;
40  import org.pegdown.ast.SuperNode;
41  import org.pegdown.ast.TextNode;
42  
43  import java.io.IOException;
44  import java.io.Reader;
45  import java.io.StringReader;
46  import java.util.regex.Matcher;
47  import java.util.regex.Pattern;
48  
49  /**
50   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
51   * <p/>
52   * Defers effective parsing to the <a href="http://pegdown.org">PegDown library</a>, which generates HTML content
53   * then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
54   *
55   * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
56   * @since 1.3
57   * @see MarkdownToDoxiaHtmlSerializer
58   */
59  @Component( role = Parser.class, hint = "markdown" )
60  public class MarkdownParser
61      extends AbstractParser
62  {
63  
64      /**
65       * The role hint for the {@link MarkdownParser} Plexus component.
66       */
67      public static final String ROLE_HINT = "markdown";
68  
69      /**
70       * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
71       */
72      protected static final PegDownProcessor PEGDOWN_PROCESSOR =
73          new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );
74  
75      /**
76       * Regex that identifies a multimarkdown-style metadata section at the start of the document
77       */
78      private static final String MULTI_MARKDOWN_METADATA_SECTION =
79          "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
80  
81      /**
82       * Regex that captures the key and value of a multimarkdown-style metadata entry.
83       */
84      private static final String MULTI_MARKDOWN_METADATA_ENTRY =
85          "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
86  
87      /**
88       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
89       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
90       * ignored.
91       */
92      private static final String[] STANDARD_METADATA_KEYS =
93          { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
94              "subtitle" };
95  
96      public int getType()
97      {
98          return TXT_TYPE;
99      }
100 
101     @Requirement
102     private PegDownHtmlParser parser;
103 
104     public void parse( Reader source, Sink sink )
105         throws ParseException
106     {
107         try
108         {
109             // Markdown to HTML (using Pegdown library)
110             String html = toHtml( source );
111             // then HTML to Sink API
112             parser.parse( new StringReader( html ), sink );
113         }
114         catch ( IOException e )
115         {
116             throw new ParseException( "Failed reading Markdown source document", e );
117         }
118     }
119 
120     /**
121      * uses PegDown library to parse content and generate HTML output.
122      * 
123      * @param source the Markdown source
124      * @return HTML content generated by PegDown 
125      * @throws IOException
126      * @see MarkdownToDoxiaHtmlSerializer
127      */
128     private String toHtml( Reader source )
129         throws IOException
130     {
131         String text = IOUtil.toString( source );
132         StringBuilder html = new StringBuilder( text.length() * 2 );
133         html.append( "<html>" );
134         html.append( "<head>" );
135         Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
136         Matcher metadataMatcher = metadataPattern.matcher( text );
137         boolean haveTitle = false;
138         if ( metadataMatcher.find() )
139         {
140             metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
141             Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
142             boolean first = true;
143             while ( lineMatcher.find() )
144             {
145                 String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
146                 if ( first )
147                 {
148                     boolean found = false;
149                     for ( String k : STANDARD_METADATA_KEYS )
150                     {
151                         if ( k.equalsIgnoreCase( key ) )
152                         {
153                             found = true;
154                             break;
155                         }
156                     }
157                     if ( !found )
158                     {
159                         break;
160                     }
161                     first = false;
162                 }
163                 String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
164                 if ( "title".equalsIgnoreCase( key ) )
165                 {
166                     haveTitle = true;
167                     html.append( "<title>" );
168                     html.append( StringEscapeUtils.escapeXml( value ) );
169                     html.append( "</title>" );
170                 }
171                 else if ( "author".equalsIgnoreCase( key ) )
172                 {
173                     html.append( "<meta name=\'author\' content=\'" );
174                     html.append( StringEscapeUtils.escapeXml( value ) );
175                     html.append( "\' />" );
176                 }
177                 else if ( "date".equalsIgnoreCase( key ) )
178                 {
179                     html.append( "<meta name=\'date\' content=\'" );
180                     html.append( StringEscapeUtils.escapeXml( value ) );
181                     html.append( "\' />" );
182                 }
183                 else
184                 {
185                     html.append( "<meta name=\'" );
186                     html.append( StringEscapeUtils.escapeXml( key ) );
187                     html.append( "\' content=\'" );
188                     html.append( StringEscapeUtils.escapeXml( value ) );
189                     html.append( "\' />" );
190                 }
191             }
192             if ( !first )
193             {
194                 text = text.substring( metadataMatcher.end() );
195             }
196         }
197         RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
198         if ( !haveTitle && rootNode.getChildren().size() > 0 )
199         {
200             // use the first (non-comment) node only if it is a heading
201             int i = 0;
202             Node firstNode = null;
203             while ( i < rootNode.getChildren().size() && isHtmlComment(
204                 ( firstNode = rootNode.getChildren().get( i ) ) ) )
205             {
206                 i++;
207             }
208             if ( firstNode instanceof HeaderNode )
209             {
210                 html.append( "<title>" );
211                 html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
212                 html.append( "</title>" );
213             }
214         }
215         html.append( "</head>" );
216         html.append( "<body>" );
217         html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
218         html.append( "</body>" );
219         html.append( "</html>" );
220 
221         return html.toString();
222     }
223 
224     public static boolean isHtmlComment( Node node )
225     {
226         if ( node instanceof HtmlBlockNode )
227         {
228             HtmlBlockNode blockNode = (HtmlBlockNode) node;
229             return blockNode.getText().startsWith( "<!--" );
230         }
231         return false;
232     }
233 
234     public static String nodeText( Node node )
235     {
236         StringBuilder builder = new StringBuilder();
237         if ( node instanceof TextNode )
238         {
239             builder.append( TextNode.class.cast( node ).getText() );
240         }
241         else
242         {
243             for ( Node n : node.getChildren() )
244             {
245                 if ( n instanceof TextNode )
246                 {
247                     builder.append( TextNode.class.cast( n ).getText() );
248                 }
249                 else if ( n instanceof SuperNode )
250                 {
251                     builder.append( nodeText( n ) );
252                 }
253             }
254         }
255         return builder.toString();
256     }
257 
258     /**
259      * Internal parser for HTML generated by PegDown library.
260      */
261     @Component( role = PegDownHtmlParser.class )
262     public static class PegDownHtmlParser
263         extends XhtmlParser
264     {
265         public PegDownHtmlParser()
266         {
267             super();
268         }
269 
270         @Override
271         protected boolean baseEndTag( XmlPullParser parser, Sink sink )
272         {
273             boolean visited = super.baseEndTag( parser, sink );
274             if ( !visited )
275             {
276                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
277                 {
278                     handleUnknown( parser, sink, TAG_TYPE_END );
279                     visited = true;
280                 }
281             }
282             return visited;
283         }
284     
285         @Override
286         protected boolean baseStartTag( XmlPullParser parser, Sink sink )
287         {
288             boolean visited = super.baseStartTag( parser, sink );
289             if ( !visited )
290             {
291                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
292                 {
293                     handleUnknown( parser, sink, TAG_TYPE_START );
294                     visited = true;
295                 }
296             }
297             return visited;
298         }
299     }
300 }