View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.doxia.module.markdown;
20  
21  import javax.inject.Inject;
22  import javax.inject.Named;
23  import javax.inject.Singleton;
24  
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.util.Arrays;
28  import java.util.Collections;
29  import java.util.LinkedHashMap;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.Map.Entry;
33  import java.util.regex.Matcher;
34  import java.util.regex.Pattern;
35  import java.util.stream.Collectors;
36  
37  import com.vladsch.flexmark.ast.Heading;
38  import com.vladsch.flexmark.ast.HtmlCommentBlock;
39  import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
40  import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
41  import com.vladsch.flexmark.ext.definition.DefinitionExtension;
42  import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
43  import com.vladsch.flexmark.ext.footnotes.FootnoteExtension;
44  import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
45  import com.vladsch.flexmark.ext.tables.TablesExtension;
46  import com.vladsch.flexmark.ext.typographic.TypographicExtension;
47  import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
48  import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension;
49  import com.vladsch.flexmark.html.HtmlRenderer;
50  import com.vladsch.flexmark.util.ast.Node;
51  import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
52  import com.vladsch.flexmark.util.data.MutableDataSet;
53  import org.apache.commons.io.IOUtils;
54  import org.apache.maven.doxia.markup.HtmlMarkup;
55  import org.apache.maven.doxia.markup.TextMarkup;
56  import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser;
57  import org.apache.maven.doxia.parser.AbstractTextParser;
58  import org.apache.maven.doxia.parser.ParseException;
59  import org.apache.maven.doxia.sink.Sink;
60  import org.apache.maven.doxia.util.HtmlTools;
61  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
62  
63  /**
64   * <p>
65   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
66   * </p>
67   * <p>
68   * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
69   * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml5 parser.
70   * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
71   * </p>
72   *
73   * @author Vladimir Schneider
74   * @author Julien Nicoulaud
75   * @since 1.3
76   */
77  @Singleton
78  @Named("markdown")
79  public class MarkdownParser extends AbstractTextParser implements TextMarkup {
80  
81      /**
82       * Regex that identifies a multimarkdown-style metadata section at the start of the document
83       *
84       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
85       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
86       * ignored.
87       * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
88       */
89      private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
90              "\\A^"
91                      + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
92                      + "[ \\t]*:[\\S\\s]+?^[ \\t]*$",
93              Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
94  
95      /**
96       * Regex that captures the key and value of a multimarkdown-style metadata entry.
97       * Group 1 captures the key, group 2 captures the value. Multivalues are not supported in the syntax!
98       * Multiline values need to be normalized
99       * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
100      *
101      */
102     private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
103             "^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE);
104 
105     /**
106      * The parser of the HTML produced by Flexmark, that we will
107      * use to convert this HTML to Sink events
108      */
109     @Inject
110     private MarkdownHtmlParser parser;
111 
112     /**
113      * Flexmark's Markdown parser (one static instance fits all)
114      */
115     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
116 
117     /**
118      * Flexmark's Markdown Metadata parser
119      */
120     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER;
121 
122     /**
123      * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
124      */
125     private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
126 
127     // Initialize the Flexmark parser and renderer, once and for all
128     static {
129         MutableDataSet flexmarkOptions = new MutableDataSet();
130 
131         // Enable the extensions that we used to have in Pegdown
132         flexmarkOptions.set(
133                 com.vladsch.flexmark.parser.Parser.EXTENSIONS,
134                 Arrays.asList(
135                         EscapedCharacterExtension.create(),
136                         AbbreviationExtension.create(),
137                         AutolinkExtension.create(),
138                         DefinitionExtension.create(),
139                         TypographicExtension.create(),
140                         TablesExtension.create(),
141                         WikiLinkExtension.create(),
142                         FootnoteExtension.create(),
143                         StrikethroughExtension.create()));
144 
145         // Disable wrong apostrophe replacement
146         flexmarkOptions.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;");
147 
148         // Additional options on the HTML rendering
149         flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false);
150         flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false);
151         flexmarkOptions.set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1);
152         flexmarkOptions.set(HtmlRenderer.FENCED_CODE_NO_LANGUAGE_CLASS, "nohighlight nocode");
153 
154         // Build the Markdown parser
155         FLEXMARK_PARSER =
156                 com.vladsch.flexmark.parser.Parser.builder(flexmarkOptions).build();
157 
158         MutableDataSet flexmarkMetadataOptions = new MutableDataSet();
159         flexmarkMetadataOptions.set(
160                 com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create()));
161         FLEXMARK_METADATA_PARSER = com.vladsch.flexmark.parser.Parser.builder(flexmarkMetadataOptions)
162                 .build();
163 
164         // Build the HTML renderer
165         FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(flexmarkOptions)
166                 .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory())
167                 .build();
168     }
169 
170     /** {@inheritDoc} */
171     @Override
172     public void parse(Reader source, Sink sink, String reference) throws ParseException {
173         try {
174             // Markdown to HTML (using flexmark-java library)
175             String html = toHtml(source);
176 
177             // TODO: add locator for the markdown source (not the intermediate HTML format)
178             // this requires writing a custom renderer not leveraging the XHTML parser
179 
180             // then HTML to Sink API
181             parser.parse(html, getWrappedSink(sink), "Intermediate HTML from " + reference);
182         } catch (IOException e) {
183             throw new ParseException("Failed reading Markdown source document", e);
184         }
185     }
186 
187     private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) {
188         final Map<String, List<String>> metadata;
189         final int endOffset; // end of metadata within source
190         // support two types of metadata:
191         if (source.toString().startsWith("---")) {
192             // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter)
193             Node documentRoot = FLEXMARK_METADATA_PARSER.parse(source.toString());
194             YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor();
195             visitor.visit(documentRoot);
196             metadata = visitor.getData();
197             endOffset = visitor.getEndOffset();
198         } else {
199             // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported
200             // by Flexmark (https://github.com/vsch/flexmark-java/issues/550)
201             metadata = new LinkedHashMap<>();
202             Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher(source);
203             if (metadataMatcher.find()) {
204                 String entry = metadataMatcher.group(0) + EOL;
205                 Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher(entry);
206                 while (entryMatcher.find()) {
207                     String key = entryMatcher.group(1);
208                     String value = normalizeMultilineValue(entryMatcher.group(2));
209                     metadata.put(key, Collections.singletonList(value));
210                 }
211                 endOffset = metadataMatcher.end(0);
212             } else {
213                 endOffset = 0;
214             }
215         }
216         if (endOffset > 0) {
217             // Trim the metadata from the source
218             source.delete(0, endOffset);
219         }
220         return writeHtmlMetadata(html, metadata);
221     }
222 
223     static String normalizeMultilineValue(String value) {
224         return value.trim().replaceAll("[ \\t]*[\\r\\n]+[ \\t]*", " ");
225     }
226 
227     private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) {
228         boolean containsTitle = false;
229         for (Entry<String, List<String>> entry : data.entrySet()) {
230             if (writeHtmlMetadata(html, entry.getKey(), entry.getValue())) {
231                 containsTitle = true;
232             }
233         }
234         return containsTitle;
235     }
236 
237     private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) {
238         if ("title".equalsIgnoreCase(key)) {
239             html.append("<title>");
240             html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(", ")), false));
241             html.append("</title>");
242             return true;
243         } else {
244             if (key.equalsIgnoreCase("author") && values.size() > 1) {
245                 // for multiple authors emit multiple meta tags
246                 for (String value : values) {
247                     writeHtmlMetadata(html, key, Collections.singletonList(value));
248                 }
249             } else {
250                 // every other multi-value should just be concatenated and emitted in a single meta tag
251                 final String separator;
252                 if (key.equalsIgnoreCase("keywords")) {
253                     separator = ",";
254                 } else {
255                     separator = EOL;
256                 }
257                 html.append("<meta name='");
258                 html.append(HtmlTools.escapeHTML(key));
259                 html.append("' content='");
260                 html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(separator))));
261                 html.append("' />");
262             }
263             return false;
264         }
265     }
266 
267     /**
268      * uses flexmark-java library to parse content and generate HTML output.
269      *
270      * @param source the Markdown source
271      * @return HTML content generated by flexmark-java
272      * @throws IOException passed through
273      */
274     String toHtml(Reader source) throws IOException {
275         // Read the source
276         StringBuilder markdownText = new StringBuilder(IOUtils.toString(source));
277 
278         // Now, build the HTML document
279         StringBuilder html = new StringBuilder(1000);
280         html.append("<html>");
281         html.append("<head>");
282 
283         boolean haveTitle = processMetadataForHtml(html, markdownText);
284 
285         // Now is the time to parse the Markdown document
286         // (after we've trimmed out the metadatas, and before we check for its headings)
287         Node documentRoot = FLEXMARK_PARSER.parse(markdownText.toString());
288 
289         // Special trick: if there is no title specified as a metadata in the header, we will use the first
290         // heading as the document title
291         if (!haveTitle && documentRoot.hasChildren()) {
292             // Skip the comment nodes
293             Node firstNode = documentRoot.getFirstChild();
294             while (firstNode != null && firstNode instanceof HtmlCommentBlock) {
295                 firstNode = firstNode.getNext();
296             }
297 
298             // If this first non-comment node is a heading, we use it as the document title
299             if (firstNode != null && firstNode instanceof Heading) {
300                 html.append("<title>");
301                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
302                 String headingText = collectingVisitor.collectAndGetText(firstNode);
303                 html.append(HtmlTools.escapeHTML(headingText, false));
304                 html.append("</title>");
305             }
306         }
307         html.append("</head>");
308         html.append("<body>");
309 
310         // Convert our Markdown document to HTML and append it to our HTML
311         FLEXMARK_HTML_RENDERER.render(documentRoot, html);
312 
313         html.append("</body>");
314         html.append("</html>");
315 
316         return html.toString();
317     }
318 
319     /**
320      * Internal parser for HTML generated by the Markdown library.
321      *
322      * 2 special things:
323      * <ul>
324      * <li> DIV elements are translated as Unknown Sink events
325      * </ul>
326      * PRE elements need to be "source" because the Xhtml5Sink will surround the
327      * corresponding verbatim() Sink event with a DIV element with class="source",
328      * which is how most Maven Skin (incl. Fluido) recognize a block of code, which
329      * needs to be highlighted accordingly.
330      */
331     @Named
332     public static class MarkdownHtmlParser extends Xhtml5Parser {
333         public MarkdownHtmlParser() {
334             super();
335         }
336 
337         @Override
338         protected void init() {
339             super.init();
340         }
341 
342         @Override
343         protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
344             boolean visited = super.baseEndTag(parser, sink);
345             if (!visited) {
346                 if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
347                     handleUnknown(parser, sink, TAG_TYPE_END);
348                     visited = true;
349                 }
350             }
351             return visited;
352         }
353 
354         @Override
355         protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
356             boolean visited = super.baseStartTag(parser, sink);
357             if (!visited) {
358                 if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
359                     handleUnknown(parser, sink, TAG_TYPE_START);
360                     visited = true;
361                 }
362             }
363             return visited;
364         }
365     }
366 }