001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.maven.doxia.module.markdown;
020
021import javax.inject.Inject;
022import javax.inject.Named;
023import javax.inject.Singleton;
024
025import java.io.IOException;
026import java.io.Reader;
027import java.util.Arrays;
028import java.util.Collections;
029import java.util.LinkedHashMap;
030import java.util.List;
031import java.util.Map;
032import java.util.Map.Entry;
033import java.util.regex.Matcher;
034import java.util.regex.Pattern;
035import java.util.stream.Collectors;
036
037import com.vladsch.flexmark.ast.Heading;
038import com.vladsch.flexmark.ast.HtmlCommentBlock;
039import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
040import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
041import com.vladsch.flexmark.ext.definition.DefinitionExtension;
042import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
043import com.vladsch.flexmark.ext.footnotes.FootnoteExtension;
044import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
045import com.vladsch.flexmark.ext.tables.TablesExtension;
046import com.vladsch.flexmark.ext.typographic.TypographicExtension;
047import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
048import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension;
049import com.vladsch.flexmark.html.HtmlRenderer;
050import com.vladsch.flexmark.util.ast.Node;
051import com.vladsch.flexmark.util.ast.TextCollectingVisitor;
052import com.vladsch.flexmark.util.data.MutableDataSet;
053import org.apache.commons.io.IOUtils;
054import org.apache.maven.doxia.markup.HtmlMarkup;
055import org.apache.maven.doxia.markup.TextMarkup;
056import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser;
057import org.apache.maven.doxia.parser.AbstractTextParser;
058import org.apache.maven.doxia.parser.ParseException;
059import org.apache.maven.doxia.sink.Sink;
060import org.apache.maven.doxia.util.HtmlTools;
061import org.codehaus.plexus.util.xml.pull.XmlPullParser;
062
063/**
064 * <p>
065 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
066 * </p>
067 * <p>
068 * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
069 * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml5 parser.
070 * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
071 * </p>
072 *
073 * @author Vladimir Schneider
074 * @author Julien Nicoulaud
075 * @since 1.3
076 */
077@Singleton
078@Named("markdown")
079public class MarkdownParser extends AbstractTextParser implements TextMarkup {
080
    /**
     * Regex that identifies a multimarkdown-style metadata section at the start of the document.
     * <p>
     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
     * first key in the metadata section must be one of the standard keys listed in the pattern (compared
     * case-insensitively), or else the entire metadata section is ignored.
     * </p>
     * <p>
     * The match is anchored at the very beginning of the input and lazily extends up to the first following
     * line that is empty or consists only of spaces and tabs.
     * </p>
     *
     * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
     */
    private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
            "\\A^"
                    + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
                    + "[ \\t]*:[\\S\\s]+?^[ \\t]*$",
            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
094
    /**
     * Regex that captures the key and value of a multimarkdown-style metadata entry.
     * <p>
     * Group 1 captures the key (everything on the line before the colon, without the blanks directly
     * preceding the colon), group 2 captures the value. A value runs until the next line that looks like
     * a new {@code key:} entry or until a line that is empty or contains only blanks, so it may span
     * several lines; such multiline values still contain their line breaks and need to be normalized
     * by the caller.
     * </p>
     * <p>
     * Multivalues are not supported in the syntax!
     * </p>
     *
     * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
     */
    private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
            "^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE);
104
    /**
     * The parser of the intermediate HTML produced by Flexmark; it is used
     * to convert this HTML to Sink events. The instance is supplied through injection.
     */
    @Inject
    private MarkdownHtmlParser parser;
111
112    /**
113     * Flexmark's Markdown parser (one static instance fits all)
114     */
115    private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
116
117    /**
118     * Flexmark's Markdown Metadata parser
119     */
120    private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER;
121
122    /**
123     * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
124     */
125    private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
126
127    // Initialize the Flexmark parser and renderer, once and for all
128    static {
129        MutableDataSet flexmarkOptions = new MutableDataSet();
130
131        // Enable the extensions that we used to have in Pegdown
132        flexmarkOptions.set(
133                com.vladsch.flexmark.parser.Parser.EXTENSIONS,
134                Arrays.asList(
135                        EscapedCharacterExtension.create(),
136                        AbbreviationExtension.create(),
137                        AutolinkExtension.create(),
138                        DefinitionExtension.create(),
139                        TypographicExtension.create(),
140                        TablesExtension.create(),
141                        WikiLinkExtension.create(),
142                        FootnoteExtension.create(),
143                        StrikethroughExtension.create()));
144
145        // Disable wrong apostrophe replacement
146        flexmarkOptions.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;");
147
148        // Additional options on the HTML rendering
149        flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false);
150        flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false);
151        flexmarkOptions.set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1);
152        flexmarkOptions.set(HtmlRenderer.FENCED_CODE_NO_LANGUAGE_CLASS, "nohighlight nocode");
153
154        // Build the Markdown parser
155        FLEXMARK_PARSER =
156                com.vladsch.flexmark.parser.Parser.builder(flexmarkOptions).build();
157
158        MutableDataSet flexmarkMetadataOptions = new MutableDataSet();
159        flexmarkMetadataOptions.set(
160                com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create()));
161        FLEXMARK_METADATA_PARSER = com.vladsch.flexmark.parser.Parser.builder(flexmarkMetadataOptions)
162                .build();
163
164        // Build the HTML renderer
165        FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(flexmarkOptions)
166                .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory())
167                .build();
168    }
169
170    /** {@inheritDoc} */
171    @Override
172    public void parse(Reader source, Sink sink, String reference) throws ParseException {
173        try {
174            // Markdown to HTML (using flexmark-java library)
175            String html = toHtml(source);
176
177            // TODO: add locator for the markdown source (not the intermediate HTML format)
178            // this requires writing a custom renderer not leveraging the XHTML parser
179
180            // then HTML to Sink API
181            parser.parse(html, getWrappedSink(sink), "Intermediate HTML from " + reference);
182        } catch (IOException e) {
183            throw new ParseException("Failed reading Markdown source document", e);
184        }
185    }
186
187    private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) {
188        final Map<String, List<String>> metadata;
189        final int endOffset; // end of metadata within source
190        // support two types of metadata:
191        if (source.toString().startsWith("---")) {
192            // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter)
193            Node documentRoot = FLEXMARK_METADATA_PARSER.parse(source.toString());
194            YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor();
195            visitor.visit(documentRoot);
196            metadata = visitor.getData();
197            endOffset = visitor.getEndOffset();
198        } else {
199            // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported
200            // by Flexmark (https://github.com/vsch/flexmark-java/issues/550)
201            metadata = new LinkedHashMap<>();
202            Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher(source);
203            if (metadataMatcher.find()) {
204                String entry = metadataMatcher.group(0) + EOL;
205                Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher(entry);
206                while (entryMatcher.find()) {
207                    String key = entryMatcher.group(1);
208                    String value = normalizeMultilineValue(entryMatcher.group(2));
209                    metadata.put(key, Collections.singletonList(value));
210                }
211                endOffset = metadataMatcher.end(0);
212            } else {
213                endOffset = 0;
214            }
215        }
216        if (endOffset > 0) {
217            // Trim the metadata from the source
218            source.delete(0, endOffset);
219        }
220        return writeHtmlMetadata(html, metadata);
221    }
222
223    static String normalizeMultilineValue(String value) {
224        return value.trim().replaceAll("[ \\t]*[\\r\\n]+[ \\t]*", " ");
225    }
226
227    private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) {
228        boolean containsTitle = false;
229        for (Entry<String, List<String>> entry : data.entrySet()) {
230            if (writeHtmlMetadata(html, entry.getKey(), entry.getValue())) {
231                containsTitle = true;
232            }
233        }
234        return containsTitle;
235    }
236
237    private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) {
238        if ("title".equalsIgnoreCase(key)) {
239            html.append("<title>");
240            html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(", ")), false));
241            html.append("</title>");
242            return true;
243        } else {
244            if (key.equalsIgnoreCase("author") && values.size() > 1) {
245                // for multiple authors emit multiple meta tags
246                for (String value : values) {
247                    writeHtmlMetadata(html, key, Collections.singletonList(value));
248                }
249            } else {
250                // every other multi-value should just be concatenated and emitted in a single meta tag
251                final String separator;
252                if (key.equalsIgnoreCase("keywords")) {
253                    separator = ",";
254                } else {
255                    separator = EOL;
256                }
257                html.append("<meta name='");
258                html.append(HtmlTools.escapeHTML(key));
259                html.append("' content='");
260                html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(separator))));
261                html.append("' />");
262            }
263            return false;
264        }
265    }
266
267    /**
268     * uses flexmark-java library to parse content and generate HTML output.
269     *
270     * @param source the Markdown source
271     * @return HTML content generated by flexmark-java
272     * @throws IOException passed through
273     */
274    String toHtml(Reader source) throws IOException {
275        // Read the source
276        StringBuilder markdownText = new StringBuilder(IOUtils.toString(source));
277
278        // Now, build the HTML document
279        StringBuilder html = new StringBuilder(1000);
280        html.append("<html>");
281        html.append("<head>");
282
283        boolean haveTitle = processMetadataForHtml(html, markdownText);
284
285        // Now is the time to parse the Markdown document
286        // (after we've trimmed out the metadatas, and before we check for its headings)
287        Node documentRoot = FLEXMARK_PARSER.parse(markdownText.toString());
288
289        // Special trick: if there is no title specified as a metadata in the header, we will use the first
290        // heading as the document title
291        if (!haveTitle && documentRoot.hasChildren()) {
292            // Skip the comment nodes
293            Node firstNode = documentRoot.getFirstChild();
294            while (firstNode != null && firstNode instanceof HtmlCommentBlock) {
295                firstNode = firstNode.getNext();
296            }
297
298            // If this first non-comment node is a heading, we use it as the document title
299            if (firstNode != null && firstNode instanceof Heading) {
300                html.append("<title>");
301                TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
302                String headingText = collectingVisitor.collectAndGetText(firstNode);
303                html.append(HtmlTools.escapeHTML(headingText, false));
304                html.append("</title>");
305            }
306        }
307        html.append("</head>");
308        html.append("<body>");
309
310        // Convert our Markdown document to HTML and append it to our HTML
311        FLEXMARK_HTML_RENDERER.render(documentRoot, html);
312
313        html.append("</body>");
314        html.append("</html>");
315
316        return html.toString();
317    }
318
319    /**
320     * Internal parser for HTML generated by the Markdown library.
321     *
322     * 2 special things:
323     * <ul>
324     * <li> DIV elements are translated as Unknown Sink events
325     * </ul>
326     * PRE elements need to be "source" because the Xhtml5Sink will surround the
327     * corresponding verbatim() Sink event with a DIV element with class="source",
328     * which is how most Maven Skin (incl. Fluido) recognize a block of code, which
329     * needs to be highlighted accordingly.
330     */
331    @Named
332    public static class MarkdownHtmlParser extends Xhtml5Parser {
333        public MarkdownHtmlParser() {
334            super();
335        }
336
337        @Override
338        protected void init() {
339            super.init();
340        }
341
342        @Override
343        protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
344            boolean visited = super.baseEndTag(parser, sink);
345            if (!visited) {
346                if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
347                    handleUnknown(parser, sink, TAG_TYPE_END);
348                    visited = true;
349                }
350            }
351            return visited;
352        }
353
354        @Override
355        protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
356            boolean visited = super.baseStartTag(parser, sink);
357            if (!visited) {
358                if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
359                    handleUnknown(parser, sink, TAG_TYPE_START);
360                    visited = true;
361                }
362            }
363            return visited;
364        }
365    }
366}