View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.doxia.module.markdown;
20  
21  import javax.inject.Inject;
22  import javax.inject.Named;
23  import javax.inject.Singleton;
24  
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.util.Arrays;
28  import java.util.Collections;
29  import java.util.LinkedHashMap;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.Map.Entry;
33  import java.util.regex.Matcher;
34  import java.util.regex.Pattern;
35  import java.util.stream.Collectors;
36  
37  import com.vladsch.flexmark.ast.Heading;
38  import com.vladsch.flexmark.ast.HtmlCommentBlock;
39  import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
40  import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
41  import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
42  import com.vladsch.flexmark.ext.definition.DefinitionExtension;
43  import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
44  import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
45  import com.vladsch.flexmark.ext.tables.TablesExtension;
46  import com.vladsch.flexmark.ext.typographic.TypographicExtension;
47  import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
48  import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension;
49  import com.vladsch.flexmark.html.HtmlRenderer;
50  import com.vladsch.flexmark.util.ast.Node;
51  import com.vladsch.flexmark.util.data.MutableDataSet;
52  import org.apache.commons.io.IOUtils;
53  import org.apache.maven.doxia.markup.HtmlMarkup;
54  import org.apache.maven.doxia.markup.TextMarkup;
55  import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser;
56  import org.apache.maven.doxia.parser.AbstractTextParser;
57  import org.apache.maven.doxia.parser.ParseException;
58  import org.apache.maven.doxia.sink.Sink;
59  import org.apache.maven.doxia.util.HtmlTools;
60  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
61  
62  /**
63   * <p>
64   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
65   * </p>
66   * <p>
67   * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
68   * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml5 parser.
69   * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
70   * </p>
71   *
72   * @author Vladimir Schneider
73   * @author Julien Nicoulaud
74   * @since 1.3
75   */
76  @Singleton
77  @Named("markdown")
78  public class MarkdownParser extends AbstractTextParser implements TextMarkup {
79  
80      /**
81       * Regex that identifies a multimarkdown-style metadata section at the start of the document
82       *
83       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
84       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
85       * ignored.
86       * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
87       */
88      private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
89              "\\A^"
90                      + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
91                      + "[ \\t]*:[\\S\\s]+?^[ \\t]*$",
92              Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
93  
94      /**
95       * Regex that captures the key and value of a multimarkdown-style metadata entry.
96       * Group 1 captures the key, group 2 captures the value. Multivalues are not supported in the syntax!
97       * Multiline values need to be normalized
98       * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
99       *
100      */
101     private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
102             "^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE);
103 
104     /**
105      * The parser of the HTML produced by Flexmark, that we will
106      * use to convert this HTML to Sink events
107      */
108     @Inject
109     private MarkdownHtmlParser parser;
110 
111     /**
112      * Flexmark's Markdown parser (one static instance fits all)
113      */
114     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
115 
116     /**
117      * Flexmark's Markdown Metadata parser
118      */
119     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER;
120 
121     /**
122      * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
123      */
124     private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
125 
126     // Initialize the Flexmark parser and renderer, once and for all
127     static {
128         MutableDataSet flexmarkOptions = new MutableDataSet();
129 
130         // Enable the extensions that we used to have in Pegdown
131         flexmarkOptions.set(
132                 com.vladsch.flexmark.parser.Parser.EXTENSIONS,
133                 Arrays.asList(
134                         EscapedCharacterExtension.create(),
135                         AbbreviationExtension.create(),
136                         AutolinkExtension.create(),
137                         DefinitionExtension.create(),
138                         TypographicExtension.create(),
139                         TablesExtension.create(),
140                         WikiLinkExtension.create(),
141                         StrikethroughExtension.create()));
142 
143         // Disable wrong apostrophe replacement
144         flexmarkOptions.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;");
145 
146         // Additional options on the HTML rendering
147         flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false);
148         flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false);
149         flexmarkOptions.set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1);
150 
151         // Build the Markdown parser
152         FLEXMARK_PARSER =
153                 com.vladsch.flexmark.parser.Parser.builder(flexmarkOptions).build();
154 
155         MutableDataSet flexmarkMetadataOptions = new MutableDataSet();
156         flexmarkMetadataOptions.set(
157                 com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create()));
158         FLEXMARK_METADATA_PARSER = com.vladsch.flexmark.parser.Parser.builder(flexmarkMetadataOptions)
159                 .build();
160 
161         // Build the HTML renderer
162         FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(flexmarkOptions)
163                 .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory())
164                 .build();
165     }
166 
167     /** {@inheritDoc} */
168     @Override
169     public void parse(Reader source, Sink sink, String reference) throws ParseException {
170         try {
171             // Markdown to HTML (using flexmark-java library)
172             String html = toHtml(source);
173 
174             // then HTML to Sink API
175             parser.parse(html, getWrappedSink(sink));
176         } catch (IOException e) {
177             throw new ParseException("Failed reading Markdown source document", e);
178         }
179     }
180 
181     private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) {
182         final Map<String, List<String>> metadata;
183         final int endOffset; // end of metadata within source
184         // support two types of metadata:
185         if (source.toString().startsWith("---")) {
186             // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter)
187             Node documentRoot = FLEXMARK_METADATA_PARSER.parse(source.toString());
188             YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor();
189             visitor.visit(documentRoot);
190             metadata = visitor.getData();
191             endOffset = visitor.getEndOffset();
192         } else {
193             // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported
194             // by Flexmark (https://github.com/vsch/flexmark-java/issues/550)
195             metadata = new LinkedHashMap<>();
196             Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher(source);
197             if (metadataMatcher.find()) {
198                 String entry = metadataMatcher.group(0) + EOL;
199                 Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher(entry);
200                 while (entryMatcher.find()) {
201                     String key = entryMatcher.group(1);
202                     String value = normalizeMultilineValue(entryMatcher.group(2));
203                     metadata.put(key, Collections.singletonList(value));
204                 }
205                 endOffset = metadataMatcher.end(0);
206             } else {
207                 endOffset = 0;
208             }
209         }
210         if (endOffset > 0) {
211             // Trim the metadata from the source
212             source.delete(0, endOffset);
213         }
214         return writeHtmlMetadata(html, metadata);
215     }
216 
217     static String normalizeMultilineValue(String value) {
218         return value.trim().replaceAll("[ \\t]*[\\r\\n]+[ \\t]*", " ");
219     }
220 
221     private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) {
222         boolean containsTitle = false;
223         for (Entry<String, List<String>> entry : data.entrySet()) {
224             if (writeHtmlMetadata(html, entry.getKey(), entry.getValue())) {
225                 containsTitle = true;
226             }
227         }
228         return containsTitle;
229     }
230 
231     private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) {
232         if ("title".equalsIgnoreCase(key)) {
233             html.append("<title>");
234             html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(", ")), false));
235             html.append("</title>");
236             return true;
237         } else {
238             if (key.equalsIgnoreCase("author") && values.size() > 1) {
239                 // for multiple authors emit multiple meta tags
240                 for (String value : values) {
241                     writeHtmlMetadata(html, key, Collections.singletonList(value));
242                 }
243             } else {
244                 // every other multi-value should just be concatenated and emitted in a single meta tag
245                 final String separator;
246                 if (key.equalsIgnoreCase("keywords")) {
247                     separator = ",";
248                 } else {
249                     separator = EOL;
250                 }
251                 html.append("<meta name='");
252                 html.append(HtmlTools.escapeHTML(key));
253                 html.append("' content='");
254                 html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(separator))));
255                 html.append("' />");
256             }
257             return false;
258         }
259     }
260 
261     /**
262      * uses flexmark-java library to parse content and generate HTML output.
263      *
264      * @param source the Markdown source
265      * @return HTML content generated by flexmark-java
266      * @throws IOException passed through
267      */
268     String toHtml(Reader source) throws IOException {
269         // Read the source
270         StringBuilder markdownText = new StringBuilder(IOUtils.toString(source));
271 
272         // Now, build the HTML document
273         StringBuilder html = new StringBuilder(1000);
274         html.append("<html>");
275         html.append("<head>");
276 
277         boolean haveTitle = processMetadataForHtml(html, markdownText);
278 
279         // Now is the time to parse the Markdown document
280         // (after we've trimmed out the metadatas, and before we check for its headings)
281         Node documentRoot = FLEXMARK_PARSER.parse(markdownText.toString());
282 
283         // Special trick: if there is no title specified as a metadata in the header, we will use the first
284         // heading as the document title
285         if (!haveTitle && documentRoot.hasChildren()) {
286             // Skip the comment nodes
287             Node firstNode = documentRoot.getFirstChild();
288             while (firstNode != null && firstNode instanceof HtmlCommentBlock) {
289                 firstNode = firstNode.getNext();
290             }
291 
292             // If this first non-comment node is a heading, we use it as the document title
293             if (firstNode != null && firstNode instanceof Heading) {
294                 html.append("<title>");
295                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
296                 String headingText = collectingVisitor.collectAndGetText(firstNode);
297                 html.append(HtmlTools.escapeHTML(headingText, false));
298                 html.append("</title>");
299             }
300         }
301         html.append("</head>");
302         html.append("<body>");
303 
304         // Convert our Markdown document to HTML and append it to our HTML
305         FLEXMARK_HTML_RENDERER.render(documentRoot, html);
306 
307         html.append("</body>");
308         html.append("</html>");
309 
310         return html.toString();
311     }
312 
313     /**
314      * Internal parser for HTML generated by the Markdown library.
315      *
316      * 2 special things:
317      * <ul>
318      * <li> DIV elements are translated as Unknown Sink events
319      * </ul>
320      * PRE elements need to be "source" because the Xhtml5Sink will surround the
321      * corresponding verbatim() Sink event with a DIV element with class="source",
322      * which is how most Maven Skin (incl. Fluido) recognize a block of code, which
323      * needs to be highlighted accordingly.
324      */
325     @Named
326     public static class MarkdownHtmlParser extends Xhtml5Parser {
327         public MarkdownHtmlParser() {
328             super();
329         }
330 
331         @Override
332         protected void init() {
333             super.init();
334         }
335 
336         @Override
337         protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
338             boolean visited = super.baseEndTag(parser, sink);
339             if (!visited) {
340                 if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
341                     handleUnknown(parser, sink, TAG_TYPE_END);
342                     visited = true;
343                 }
344             }
345             return visited;
346         }
347 
348         @Override
349         protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
350             boolean visited = super.baseStartTag(parser, sink);
351             if (!visited) {
352                 if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
353                     handleUnknown(parser, sink, TAG_TYPE_START);
354                     visited = true;
355                 }
356             }
357             return visited;
358         }
359     }
360 }