View Javadoc
1   package org.apache.maven.doxia.module.markdown;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import com.vladsch.flexmark.ast.Heading;
23  import com.vladsch.flexmark.ast.HtmlCommentBlock;
24  import com.vladsch.flexmark.util.ast.Node;
25  import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
26  import com.vladsch.flexmark.html.HtmlRenderer;
27  import com.vladsch.flexmark.util.options.MutableDataSet;
28  import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
29  import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
30  import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
31  import com.vladsch.flexmark.ext.definition.DefinitionExtension;
32  import com.vladsch.flexmark.ext.typographic.TypographicExtension;
33  import com.vladsch.flexmark.ext.tables.TablesExtension;
34  import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
35  import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
36  
37  import org.apache.maven.doxia.markup.HtmlMarkup;
38  import org.apache.maven.doxia.module.xhtml.XhtmlParser;
39  import org.apache.maven.doxia.parser.AbstractParser;
40  import org.apache.maven.doxia.parser.ParseException;
41  import org.apache.maven.doxia.parser.Parser;
42  import org.apache.maven.doxia.sink.Sink;
43  import org.apache.maven.doxia.util.HtmlTools;
44  import org.codehaus.plexus.component.annotations.Component;
45  import org.codehaus.plexus.component.annotations.Requirement;
46  import org.codehaus.plexus.util.IOUtil;
47  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
48  
49  import java.io.IOException;
50  import java.io.Reader;
51  import java.util.Arrays;
52  import java.util.regex.Matcher;
53  import java.util.regex.Pattern;
54  
55  /**
56   * <p>
57   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
58   * </p>
59   * <p>
60   * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
61   * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
62   * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
63   * </p>
64   *
65   * @author Vladimir Schneider
66   * @author Julien Nicoulaud
67   * @since 1.3
68   */
69  @Component( role = Parser.class, hint = MarkdownParser.ROLE_HINT )
70  public class MarkdownParser
71      extends AbstractParser
72  {
73  
74      /**
75       * The role hint for the {@link MarkdownParser} Plexus component.
76       */
77      public static final String ROLE_HINT = "markdown";
78  
79      /**
80       * Regex that identifies a multimarkdown-style metadata section at the start of the document
81       *
82       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
83       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
84       * ignored.
85       */
86      private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
87              "\\A^\\s*"
88              + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
89              + "[ \\t]*:[ \\t]*[^\\r\\n]*[ \\t]*$[\\r\\n]+"
90              + "(?:^[ \\t]*[^:\\r\\n]+[ \\t]*:[ \\t]*[^\\r\\n]*[ \\t]*$[\\r\\n]+)*",
91              Pattern.MULTILINE | Pattern.CASE_INSENSITIVE );
92  
93      /**
94       * Regex that captures the key and value of a multimarkdown-style metadata entry.
95       */
96      private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
97              "^[ \\t]*([^:\\r\\n]+?)[ \\t]*:[ \\t]*([^\\r\\n]*)[ \\t]*$",
98              Pattern.MULTILINE );
99  
100     /**
101      * <p>getType.</p>
102      *
103      * @return a int.
104      */
105     @Override
106     public int getType()
107     {
108         return TXT_TYPE;
109     }
110 
111     /**
112      * The parser of the HTML produced by Flexmark, that we will
113      * use to convert this HTML to Sink events
114      */
115     @Requirement
116     private MarkdownHtmlParser parser;
117 
118     /**
119      * Flexmark's Markdown parser (one static instance fits all)
120      */
121     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
122 
123     /**
124      * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
125      */
126     private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
127 
128     // Initialize the Flexmark parser and renderer, once and for all
129     static
130     {
131         MutableDataSet flexmarkOptions = new MutableDataSet();
132 
133         // Enable the extensions that we used to have in Pegdown
134         flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(
135                 EscapedCharacterExtension.create(),
136                 AbbreviationExtension.create(),
137                 AutolinkExtension.create(),
138                 DefinitionExtension.create(),
139                 TypographicExtension.create(),
140                 TablesExtension.create(),
141                 WikiLinkExtension.create(),
142                 StrikethroughExtension.create()
143         ) );
144 
145         // Disable wrong apostrophe replacement
146         flexmarkOptions.set( TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;" );
147 
148         // Additional options on the HTML rendering
149         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false );
150         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false );
151         flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 );
152 
153         // Build the Markdown parser
154         FLEXMARK_PARSER = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions ).build();
155 
156         // Build the HTML renderer
157         FLEXMARK_HTML_RENDERER = HtmlRenderer.builder( flexmarkOptions )
158                 .linkResolverFactory( new FlexmarkDoxiaLinkResolver.Factory() )
159                 .build();
160 
161     }
162 
163     /** {@inheritDoc} */
164     @Override
165     public void parse( Reader source, Sink sink, String reference )
166         throws ParseException
167     {
168         try
169         {
170             // Markdown to HTML (using flexmark-java library)
171             String html = toHtml( source );
172 
173             // then HTML to Sink API
174             parser.parse( html, sink );
175         }
176         catch ( IOException e )
177         {
178             throw new ParseException( "Failed reading Markdown source document", e );
179         }
180     }
181 
182     /**
183      * uses flexmark-java library to parse content and generate HTML output.
184      *
185      * @param source the Markdown source
186      * @return HTML content generated by flexmark-java
187      * @throws IOException passed through
188      */
189     String toHtml( Reader source )
190         throws IOException
191     {
192         // Read the source
193         String text = IOUtil.toString( source );
194 
195         // Now, build the HTML document
196         StringBuilder html = new StringBuilder( 1000 );
197         html.append( "<html>" );
198         html.append( "<head>" );
199 
200         // First, we interpret the "metadata" section of the document and add the corresponding HTML headers
201         Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher( text );
202         boolean haveTitle = false;
203         if ( metadataMatcher.find() )
204         {
205             Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher( metadataMatcher.group( 0 ) );
206             while ( entryMatcher.find() )
207             {
208                 String key = entryMatcher.group( 1 );
209                 String value = entryMatcher.group( 2 );
210                 if ( "title".equalsIgnoreCase( key ) )
211                 {
212                     haveTitle = true;
213                     html.append( "<title>" );
214                     html.append( HtmlTools.escapeHTML( value, false ) );
215                     html.append( "</title>" );
216                 }
217                 else
218                 {
219                     html.append( "<meta name='" );
220                     html.append( HtmlTools.escapeHTML( key ) );
221                     html.append( "' content='" );
222                     html.append( HtmlTools.escapeHTML( value ) );
223                     html.append( "' />" );
224                 }
225             }
226 
227             // Trim the metadata from the source
228             text = text.substring( metadataMatcher.end( 0 ) );
229 
230         }
231 
232         // Now is the time to parse the Markdown document
233         // (after we've trimmed out the metadatas, and before we check for its headings)
234         Node documentRoot = FLEXMARK_PARSER.parse( text );
235 
236         // Special trick: if there is no title specified as a metadata in the header, we will use the first
237         // heading as the document title
238         if ( !haveTitle && documentRoot.hasChildren() )
239         {
240             // Skip the comment nodes
241             Node firstNode = documentRoot.getFirstChild();
242             while ( firstNode != null && firstNode instanceof HtmlCommentBlock )
243             {
244                 firstNode = firstNode.getNext();
245             }
246 
247             // If this first non-comment node is a heading, we use it as the document title
248             if ( firstNode != null && firstNode instanceof Heading )
249             {
250                 html.append( "<title>" );
251                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
252                 String headingText = collectingVisitor.collectAndGetText( firstNode );
253                 html.append( HtmlTools.escapeHTML( headingText, false ) );
254                 html.append( "</title>" );
255             }
256         }
257         html.append( "</head>" );
258         html.append( "<body>" );
259 
260         // Convert our Markdown document to HTML and append it to our HTML
261         FLEXMARK_HTML_RENDERER.render( documentRoot, html );
262 
263         html.append( "</body>" );
264         html.append( "</html>" );
265 
266         return html.toString();
267     }
268 
269     /**
270      * Internal parser for HTML generated by the Markdown library.
271      *
272      * 2 special things:
273      * <ul>
274      * <li> DIV elements are translated as Unknown Sink events
275      * <li> PRE elements are all considered as boxed
276      * </ul>
277      * PRE elements need to be "boxed" because the XhtmlSink will surround the
278      * corresponding verbatim() Sink event with a DIV element with class="source",
279      * which is how most Maven Skin (incl. Fluido) recognize a block of code, which
280      * needs to be highlighted accordingly.
281      */
282     @Component( role = MarkdownHtmlParser.class )
283     public static class MarkdownHtmlParser
284         extends XhtmlParser
285     {
286         public MarkdownHtmlParser()
287         {
288             super();
289         }
290 
291         @Override
292         protected void init()
293         {
294             super.init();
295             super.boxed = true;
296         }
297 
298         @Override
299         protected boolean baseEndTag( XmlPullParser parser, Sink sink )
300         {
301             boolean visited = super.baseEndTag( parser, sink );
302             if ( !visited )
303             {
304                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
305                 {
306                     handleUnknown( parser, sink, TAG_TYPE_END );
307                     visited = true;
308                 }
309             }
310             return visited;
311         }
312 
313         @Override
314         protected boolean baseStartTag( XmlPullParser parser, Sink sink )
315         {
316             boolean visited = super.baseStartTag( parser, sink );
317             if ( !visited )
318             {
319                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
320                 {
321                     handleUnknown( parser, sink, TAG_TYPE_START );
322                     super.boxed = true;
323                     visited = true;
324                 }
325             }
326             return visited;
327         }
328     }
329 }