View Javadoc
1   package org.apache.maven.doxia.module.markdown;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import com.vladsch.flexmark.ast.Heading;
23  import com.vladsch.flexmark.ast.HtmlCommentBlock;
24  import com.vladsch.flexmark.util.ast.Node;
25  import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
26  import com.vladsch.flexmark.html.HtmlRenderer;
27  import com.vladsch.flexmark.profiles.pegdown.Extensions;
28  import com.vladsch.flexmark.profiles.pegdown.PegdownOptionsAdapter;
29  import com.vladsch.flexmark.util.builder.Extension;
30  import com.vladsch.flexmark.util.options.MutableDataHolder;
31  import org.apache.commons.lang3.StringEscapeUtils;
32  import org.apache.commons.lang3.StringUtils;
33  import org.apache.maven.doxia.markup.HtmlMarkup;
34  import org.apache.maven.doxia.module.xhtml.XhtmlParser;
35  import org.apache.maven.doxia.parser.AbstractParser;
36  import org.apache.maven.doxia.parser.ParseException;
37  import org.apache.maven.doxia.parser.Parser;
38  import org.apache.maven.doxia.sink.Sink;
39  import org.codehaus.plexus.component.annotations.Component;
40  import org.codehaus.plexus.component.annotations.Requirement;
41  import org.codehaus.plexus.util.IOUtil;
42  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
43  
44  import java.io.IOException;
45  import java.io.Reader;
46  import java.io.StringReader;
47  import java.util.ArrayList;
48  import java.util.regex.Matcher;
49  import java.util.regex.Pattern;
50  
51  /**
52   * <p>
53   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
54   * </p>
55   * <p>
56   * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
57   * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
58   * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
59   * </p>
60   *
61   * @author Vladimir Schneider
62   * @author Julien Nicoulaud
63   * @since 1.3
64   */
65  @Component( role = Parser.class, hint = MarkdownParser.ROLE_HINT )
66  public class MarkdownParser
67      extends AbstractParser
68  {
69  
70      /**
71       * The role hint for the {@link MarkdownParser} Plexus component.
72       */
73      public static final String ROLE_HINT = "markdown";
74  
75      /**
76       * Regex that identifies a multimarkdown-style metadata section at the start of the document
77       */
78      private static final String MULTI_MARKDOWN_METADATA_SECTION =
79          "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
80  
81      /**
82       * Regex that captures the key and value of a multimarkdown-style metadata entry.
83       */
84      private static final String MULTI_MARKDOWN_METADATA_ENTRY =
85          "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
86  
87      /**
88       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
89       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
90       * ignored.
91       */
92      private static final String[] STANDARD_METADATA_KEYS =
93          { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
94              "subtitle" };
95  
96      /**
97       * <p>getType.</p>
98       *
99       * @return a int.
100      */
101     public int getType()
102     {
103         return TXT_TYPE;
104     }
105 
106     @Requirement
107     private MarkdownHtmlParser parser;
108 
109     /** {@inheritDoc} */
110     public void parse( Reader source, Sink sink )
111         throws ParseException
112     {
113         try
114         {
115             // Markdown to HTML (using flexmark-java library)
116             String html = toHtml( source );
117             // then HTML to Sink API
118             parser.parse( new StringReader( html ), sink );
119         }
120         catch ( IOException e )
121         {
122             throw new ParseException( "Failed reading Markdown source document", e );
123         }
124     }
125 
126     /**
127      * uses flexmark-java library to parse content and generate HTML output.
128      *
129      * @param source the Markdown source
130      * @return HTML content generated by flexmark-java
131      * @throws IOException passed through
132      */
133     String toHtml( Reader source )
134         throws IOException
135     {
136         String text = IOUtil.toString( source );
137         MutableDataHolder flexmarkOptions = PegdownOptionsAdapter.flexmarkOptions(
138                 Extensions.ALL & ~( Extensions.HARDWRAPS | Extensions.ANCHORLINKS ) ).toMutable();
139         ArrayList<Extension> extensions = new ArrayList<>();
140         for ( Extension extension : flexmarkOptions.get( com.vladsch.flexmark.parser.Parser.EXTENSIONS ) )
141         {
142             extensions.add( extension );
143         }
144 
145         extensions.add( FlexmarkDoxiaExtension.create() );
146         flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, extensions );
147         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false );
148         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false );
149         flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 );
150 
151         com.vladsch.flexmark.parser.Parser parser = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions )
152                 .build();
153         HtmlRenderer renderer = HtmlRenderer.builder( flexmarkOptions )
154                                     .linkResolverFactory( new FlexmarkDoxiaLinkResolver.Factory() )
155                                     .build();
156 
157 
158         StringBuilder html = new StringBuilder( 1000 );
159         html.append( "<html>" );
160         html.append( "<head>" );
161         Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
162         Matcher metadataMatcher = metadataPattern.matcher( text );
163         boolean haveTitle = false;
164         if ( metadataMatcher.find() )
165         {
166             metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
167             Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
168             boolean first = true;
169             while ( lineMatcher.find() )
170             {
171                 String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
172                 if ( first )
173                 {
174                     boolean found = false;
175                     for ( String k : STANDARD_METADATA_KEYS )
176                     {
177                         if ( k.equalsIgnoreCase( key ) )
178                         {
179                             found = true;
180                             break;
181                         }
182                     }
183                     if ( !found )
184                     {
185                         break;
186                     }
187                     first = false;
188                 }
189                 String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
190                 if ( "title".equalsIgnoreCase( key ) )
191                 {
192                     haveTitle = true;
193                     html.append( "<title>" );
194                     html.append( StringEscapeUtils.escapeXml( value ) );
195                     html.append( "</title>" );
196                 }
197                 else if ( "author".equalsIgnoreCase( key ) )
198                 {
199                     html.append( "<meta name=\'author\' content=\'" );
200                     html.append( StringEscapeUtils.escapeXml( value ) );
201                     html.append( "\' />" );
202                 }
203                 else if ( "date".equalsIgnoreCase( key ) )
204                 {
205                     html.append( "<meta name=\'date\' content=\'" );
206                     html.append( StringEscapeUtils.escapeXml( value ) );
207                     html.append( "\' />" );
208                 }
209                 else
210                 {
211                     html.append( "<meta name=\'" );
212                     html.append( StringEscapeUtils.escapeXml( key ) );
213                     html.append( "\' content=\'" );
214                     html.append( StringEscapeUtils.escapeXml( value ) );
215                     html.append( "\' />" );
216                 }
217             }
218             if ( !first )
219             {
220                 text = text.substring( metadataMatcher.end() );
221             }
222         }
223 
224         Node rootNode = parser.parse( text );
225         String markdownHtml = renderer.render( rootNode );
226 
227         if ( !haveTitle && rootNode.hasChildren() )
228         {
229             // use the first (non-comment) node only if it is a heading
230             Node firstNode = rootNode.getFirstChild();
231             while ( firstNode != null && !( firstNode instanceof Heading ) )
232             {
233                 if ( !( firstNode instanceof HtmlCommentBlock ) )
234                 {
235                     break;
236                 }
237                 firstNode = firstNode.getNext();
238             }
239 
240             if ( firstNode instanceof Heading )
241             {
242                 html.append( "<title>" );
243                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
244                 String headingText = collectingVisitor.collectAndGetText( firstNode );
245                 html.append( StringEscapeUtils.escapeXml( headingText ) );
246                 html.append( "</title>" );
247             }
248         }
249         html.append( "</head>" );
250         html.append( "<body>" );
251         html.append( markdownHtml );
252         html.append( "</body>" );
253         html.append( "</html>" );
254 
255         return html.toString();
256     }
257 
258     /**
259      * Internal parser for HTML generated by the Markdown library.
260      */
261     @Component( role = MarkdownHtmlParser.class )
262     public static class MarkdownHtmlParser
263         extends XhtmlParser
264     {
265         public MarkdownHtmlParser()
266         {
267             super();
268         }
269 
270         @Override
271         protected boolean baseEndTag( XmlPullParser parser, Sink sink )
272         {
273             boolean visited = super.baseEndTag( parser, sink );
274             if ( !visited )
275             {
276                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
277                 {
278                     handleUnknown( parser, sink, TAG_TYPE_END );
279                     visited = true;
280                 }
281             }
282             return visited;
283         }
284 
285         @Override
286         protected boolean baseStartTag( XmlPullParser parser, Sink sink )
287         {
288             boolean visited = super.baseStartTag( parser, sink );
289             if ( !visited )
290             {
291                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
292                 {
293                     handleUnknown( parser, sink, TAG_TYPE_START );
294                     visited = true;
295                 }
296             }
297             return visited;
298         }
299     }
300 }