View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.doxia.module.markdown;
20  
21  import javax.inject.Inject;
22  import javax.inject.Named;
23  import javax.inject.Singleton;
24  
25  import java.io.IOException;
26  import java.io.Reader;
27  import java.util.Arrays;
28  import java.util.Collections;
29  import java.util.LinkedHashMap;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.Map.Entry;
33  import java.util.regex.Matcher;
34  import java.util.regex.Pattern;
35  import java.util.stream.Collectors;
36  
37  import com.vladsch.flexmark.ast.Heading;
38  import com.vladsch.flexmark.ast.HtmlCommentBlock;
39  import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
40  import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
41  import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
42  import com.vladsch.flexmark.ext.definition.DefinitionExtension;
43  import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
44  import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
45  import com.vladsch.flexmark.ext.tables.TablesExtension;
46  import com.vladsch.flexmark.ext.typographic.TypographicExtension;
47  import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
48  import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension;
49  import com.vladsch.flexmark.html.HtmlRenderer;
50  import com.vladsch.flexmark.util.ast.Node;
51  import com.vladsch.flexmark.util.data.MutableDataSet;
52  import org.apache.commons.io.IOUtils;
53  import org.apache.maven.doxia.markup.HtmlMarkup;
54  import org.apache.maven.doxia.markup.TextMarkup;
55  import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser;
56  import org.apache.maven.doxia.parser.AbstractTextParser;
57  import org.apache.maven.doxia.parser.ParseException;
58  import org.apache.maven.doxia.sink.Sink;
59  import org.apache.maven.doxia.util.HtmlTools;
60  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
61  
62  /**
63   * <p>
64   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
65   * </p>
66   * <p>
67   * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
68   * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml5 parser.
69   * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
70   * </p>
71   *
72   * @author Vladimir Schneider
73   * @author Julien Nicoulaud
74   * @since 1.3
75   */
76  @Singleton
77  @Named("markdown")
78  public class MarkdownParser extends AbstractTextParser implements TextMarkup {
79  
80      /**
81       * Regex that identifies a multimarkdown-style metadata section at the start of the document
82       *
83       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
84       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
85       * ignored.
86       * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
87       */
88      private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
89              "\\A^"
90                      + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
91                      + "[ \\t]*:[\\S\\s]+?^[ \\t]*$",
92              Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
93  
94      /**
95       * Regex that captures the key and value of a multimarkdown-style metadata entry.
96       * Group 1 captures the key, group 2 captures the value. Multivalues are not supported in the syntax!
97       * Multiline values need to be normalized
98       * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
99       *
100      */
101     private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
102             "^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE);
103 
104     /**
105      * The parser of the HTML produced by Flexmark, that we will
106      * use to convert this HTML to Sink events
107      */
108     @Inject
109     private MarkdownHtmlParser parser;
110 
111     /**
112      * Flexmark's Markdown parser (one static instance fits all)
113      */
114     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
115 
116     /**
117      * Flexmark's Markdown Metadata parser
118      */
119     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER;
120 
121     /**
122      * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
123      */
124     private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
125 
126     // Initialize the Flexmark parser and renderer, once and for all
127     static {
128         MutableDataSet flexmarkOptions = new MutableDataSet();
129 
130         // Enable the extensions that we used to have in Pegdown
131         flexmarkOptions.set(
132                 com.vladsch.flexmark.parser.Parser.EXTENSIONS,
133                 Arrays.asList(
134                         EscapedCharacterExtension.create(),
135                         AbbreviationExtension.create(),
136                         AutolinkExtension.create(),
137                         DefinitionExtension.create(),
138                         TypographicExtension.create(),
139                         TablesExtension.create(),
140                         WikiLinkExtension.create(),
141                         StrikethroughExtension.create()));
142 
143         // Disable wrong apostrophe replacement
144         flexmarkOptions.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;");
145 
146         // Additional options on the HTML rendering
147         flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false);
148         flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false);
149         flexmarkOptions.set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1);
150 
151         // Build the Markdown parser
152         FLEXMARK_PARSER =
153                 com.vladsch.flexmark.parser.Parser.builder(flexmarkOptions).build();
154 
155         MutableDataSet flexmarkMetadataOptions = new MutableDataSet();
156         flexmarkMetadataOptions.set(
157                 com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create()));
158         FLEXMARK_METADATA_PARSER = com.vladsch.flexmark.parser.Parser.builder(flexmarkMetadataOptions)
159                 .build();
160 
161         // Build the HTML renderer
162         FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(flexmarkOptions)
163                 .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory())
164                 .build();
165     }
166 
167     /** {@inheritDoc} */
168     @Override
169     public void parse(Reader source, Sink sink, String reference) throws ParseException {
170         try {
171             // Markdown to HTML (using flexmark-java library)
172             String html = toHtml(source);
173 
174             // TODO: add locator for the markdown source (not the intermediate HTML format)
175             // this requires writing a custom renderer not leveraging the XHTML parser
176 
177             // then HTML to Sink API
178             parser.parse(html, getWrappedSink(sink), "Intermediate HTML from " + reference);
179         } catch (IOException e) {
180             throw new ParseException("Failed reading Markdown source document", e);
181         }
182     }
183 
184     private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) {
185         final Map<String, List<String>> metadata;
186         final int endOffset; // end of metadata within source
187         // support two types of metadata:
188         if (source.toString().startsWith("---")) {
189             // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter)
190             Node documentRoot = FLEXMARK_METADATA_PARSER.parse(source.toString());
191             YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor();
192             visitor.visit(documentRoot);
193             metadata = visitor.getData();
194             endOffset = visitor.getEndOffset();
195         } else {
196             // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported
197             // by Flexmark (https://github.com/vsch/flexmark-java/issues/550)
198             metadata = new LinkedHashMap<>();
199             Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher(source);
200             if (metadataMatcher.find()) {
201                 String entry = metadataMatcher.group(0) + EOL;
202                 Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher(entry);
203                 while (entryMatcher.find()) {
204                     String key = entryMatcher.group(1);
205                     String value = normalizeMultilineValue(entryMatcher.group(2));
206                     metadata.put(key, Collections.singletonList(value));
207                 }
208                 endOffset = metadataMatcher.end(0);
209             } else {
210                 endOffset = 0;
211             }
212         }
213         if (endOffset > 0) {
214             // Trim the metadata from the source
215             source.delete(0, endOffset);
216         }
217         return writeHtmlMetadata(html, metadata);
218     }
219 
220     static String normalizeMultilineValue(String value) {
221         return value.trim().replaceAll("[ \\t]*[\\r\\n]+[ \\t]*", " ");
222     }
223 
224     private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) {
225         boolean containsTitle = false;
226         for (Entry<String, List<String>> entry : data.entrySet()) {
227             if (writeHtmlMetadata(html, entry.getKey(), entry.getValue())) {
228                 containsTitle = true;
229             }
230         }
231         return containsTitle;
232     }
233 
234     private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) {
235         if ("title".equalsIgnoreCase(key)) {
236             html.append("<title>");
237             html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(", ")), false));
238             html.append("</title>");
239             return true;
240         } else {
241             if (key.equalsIgnoreCase("author") && values.size() > 1) {
242                 // for multiple authors emit multiple meta tags
243                 for (String value : values) {
244                     writeHtmlMetadata(html, key, Collections.singletonList(value));
245                 }
246             } else {
247                 // every other multi-value should just be concatenated and emitted in a single meta tag
248                 final String separator;
249                 if (key.equalsIgnoreCase("keywords")) {
250                     separator = ",";
251                 } else {
252                     separator = EOL;
253                 }
254                 html.append("<meta name='");
255                 html.append(HtmlTools.escapeHTML(key));
256                 html.append("' content='");
257                 html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(separator))));
258                 html.append("' />");
259             }
260             return false;
261         }
262     }
263 
264     /**
265      * uses flexmark-java library to parse content and generate HTML output.
266      *
267      * @param source the Markdown source
268      * @return HTML content generated by flexmark-java
269      * @throws IOException passed through
270      */
271     String toHtml(Reader source) throws IOException {
272         // Read the source
273         StringBuilder markdownText = new StringBuilder(IOUtils.toString(source));
274 
275         // Now, build the HTML document
276         StringBuilder html = new StringBuilder(1000);
277         html.append("<html>");
278         html.append("<head>");
279 
280         boolean haveTitle = processMetadataForHtml(html, markdownText);
281 
282         // Now is the time to parse the Markdown document
283         // (after we've trimmed out the metadatas, and before we check for its headings)
284         Node documentRoot = FLEXMARK_PARSER.parse(markdownText.toString());
285 
286         // Special trick: if there is no title specified as a metadata in the header, we will use the first
287         // heading as the document title
288         if (!haveTitle && documentRoot.hasChildren()) {
289             // Skip the comment nodes
290             Node firstNode = documentRoot.getFirstChild();
291             while (firstNode != null && firstNode instanceof HtmlCommentBlock) {
292                 firstNode = firstNode.getNext();
293             }
294 
295             // If this first non-comment node is a heading, we use it as the document title
296             if (firstNode != null && firstNode instanceof Heading) {
297                 html.append("<title>");
298                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
299                 String headingText = collectingVisitor.collectAndGetText(firstNode);
300                 html.append(HtmlTools.escapeHTML(headingText, false));
301                 html.append("</title>");
302             }
303         }
304         html.append("</head>");
305         html.append("<body>");
306 
307         // Convert our Markdown document to HTML and append it to our HTML
308         FLEXMARK_HTML_RENDERER.render(documentRoot, html);
309 
310         html.append("</body>");
311         html.append("</html>");
312 
313         return html.toString();
314     }
315 
316     /**
317      * Internal parser for HTML generated by the Markdown library.
318      *
319      * 2 special things:
320      * <ul>
321      * <li> DIV elements are translated as Unknown Sink events
322      * </ul>
323      * PRE elements need to be "source" because the Xhtml5Sink will surround the
324      * corresponding verbatim() Sink event with a DIV element with class="source",
325      * which is how most Maven Skin (incl. Fluido) recognize a block of code, which
326      * needs to be highlighted accordingly.
327      */
328     @Named
329     public static class MarkdownHtmlParser extends Xhtml5Parser {
330         public MarkdownHtmlParser() {
331             super();
332         }
333 
334         @Override
335         protected void init() {
336             super.init();
337         }
338 
339         @Override
340         protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
341             boolean visited = super.baseEndTag(parser, sink);
342             if (!visited) {
343                 if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
344                     handleUnknown(parser, sink, TAG_TYPE_END);
345                     visited = true;
346                 }
347             }
348             return visited;
349         }
350 
351         @Override
352         protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
353             boolean visited = super.baseStartTag(parser, sink);
354             if (!visited) {
355                 if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
356                     handleUnknown(parser, sink, TAG_TYPE_START);
357                     visited = true;
358                 }
359             }
360             return visited;
361         }
362     }
363 }