001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.maven.doxia.module.markdown;
020
021import javax.inject.Inject;
022import javax.inject.Named;
023import javax.inject.Singleton;
024
025import java.io.IOException;
026import java.io.Reader;
027import java.util.Arrays;
028import java.util.Collections;
029import java.util.LinkedHashMap;
030import java.util.List;
031import java.util.Map;
032import java.util.Map.Entry;
033import java.util.regex.Matcher;
034import java.util.regex.Pattern;
035import java.util.stream.Collectors;
036
037import com.vladsch.flexmark.ast.Heading;
038import com.vladsch.flexmark.ast.HtmlCommentBlock;
039import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
040import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
041import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
042import com.vladsch.flexmark.ext.definition.DefinitionExtension;
043import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
044import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
045import com.vladsch.flexmark.ext.tables.TablesExtension;
046import com.vladsch.flexmark.ext.typographic.TypographicExtension;
047import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
048import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension;
049import com.vladsch.flexmark.html.HtmlRenderer;
050import com.vladsch.flexmark.util.ast.Node;
051import com.vladsch.flexmark.util.data.MutableDataSet;
052import org.apache.commons.io.IOUtils;
053import org.apache.maven.doxia.markup.HtmlMarkup;
054import org.apache.maven.doxia.markup.TextMarkup;
055import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser;
056import org.apache.maven.doxia.parser.AbstractTextParser;
057import org.apache.maven.doxia.parser.ParseException;
058import org.apache.maven.doxia.sink.Sink;
059import org.apache.maven.doxia.util.HtmlTools;
060import org.codehaus.plexus.util.xml.pull.XmlPullParser;
061
062/**
063 * <p>
064 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
065 * </p>
066 * <p>
067 * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
068 * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml5 parser.
069 * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
070 * </p>
071 *
072 * @author Vladimir Schneider
073 * @author Julien Nicoulaud
074 * @since 1.3
075 */
076@Singleton
077@Named("markdown")
078public class MarkdownParser extends AbstractTextParser implements TextMarkup {
079
080    /**
081     * Regex that identifies a multimarkdown-style metadata section at the start of the document
082     *
083     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
084     * first key in the metadata section must be one of these standard keys or else the entire metadata section is
085     * ignored.
086     * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
087     */
088    private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
089            "\\A^"
090                    + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
091                    + "[ \\t]*:[\\S\\s]+?^[ \\t]*$",
092            Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);
093
094    /**
095     * Regex that captures the key and value of a multimarkdown-style metadata entry.
096     * Group 1 captures the key, group 2 captures the value. Multivalues are not supported in the syntax!
097     * Multiline values need to be normalized
098     * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a>
099     *
100     */
101    private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
102            "^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE);
103
104    /**
105     * The parser of the HTML produced by Flexmark, that we will
106     * use to convert this HTML to Sink events
107     */
108    @Inject
109    private MarkdownHtmlParser parser;
110
111    /**
112     * Flexmark's Markdown parser (one static instance fits all)
113     */
114    private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
115
116    /**
117     * Flexmark's Markdown Metadata parser
118     */
119    private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER;
120
121    /**
122     * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
123     */
124    private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
125
126    // Initialize the Flexmark parser and renderer, once and for all
127    static {
128        MutableDataSet flexmarkOptions = new MutableDataSet();
129
130        // Enable the extensions that we used to have in Pegdown
131        flexmarkOptions.set(
132                com.vladsch.flexmark.parser.Parser.EXTENSIONS,
133                Arrays.asList(
134                        EscapedCharacterExtension.create(),
135                        AbbreviationExtension.create(),
136                        AutolinkExtension.create(),
137                        DefinitionExtension.create(),
138                        TypographicExtension.create(),
139                        TablesExtension.create(),
140                        WikiLinkExtension.create(),
141                        StrikethroughExtension.create()));
142
143        // Disable wrong apostrophe replacement
144        flexmarkOptions.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;");
145
146        // Additional options on the HTML rendering
147        flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false);
148        flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false);
149        flexmarkOptions.set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1);
150
151        // Build the Markdown parser
152        FLEXMARK_PARSER =
153                com.vladsch.flexmark.parser.Parser.builder(flexmarkOptions).build();
154
155        MutableDataSet flexmarkMetadataOptions = new MutableDataSet();
156        flexmarkMetadataOptions.set(
157                com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create()));
158        FLEXMARK_METADATA_PARSER = com.vladsch.flexmark.parser.Parser.builder(flexmarkMetadataOptions)
159                .build();
160
161        // Build the HTML renderer
162        FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(flexmarkOptions)
163                .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory())
164                .build();
165    }
166
167    /** {@inheritDoc} */
168    @Override
169    public void parse(Reader source, Sink sink, String reference) throws ParseException {
170        try {
171            // Markdown to HTML (using flexmark-java library)
172            String html = toHtml(source);
173
174            // TODO: add locator for the markdown source (not the intermediate HTML format)
175            // this requires writing a custom renderer not leveraging the XHTML parser
176
177            // then HTML to Sink API
178            parser.parse(html, getWrappedSink(sink), "Intermediate HTML from " + reference);
179        } catch (IOException e) {
180            throw new ParseException("Failed reading Markdown source document", e);
181        }
182    }
183
184    private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) {
185        final Map<String, List<String>> metadata;
186        final int endOffset; // end of metadata within source
187        // support two types of metadata:
188        if (source.toString().startsWith("---")) {
189            // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter)
190            Node documentRoot = FLEXMARK_METADATA_PARSER.parse(source.toString());
191            YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor();
192            visitor.visit(documentRoot);
193            metadata = visitor.getData();
194            endOffset = visitor.getEndOffset();
195        } else {
196            // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported
197            // by Flexmark (https://github.com/vsch/flexmark-java/issues/550)
198            metadata = new LinkedHashMap<>();
199            Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher(source);
200            if (metadataMatcher.find()) {
201                String entry = metadataMatcher.group(0) + EOL;
202                Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher(entry);
203                while (entryMatcher.find()) {
204                    String key = entryMatcher.group(1);
205                    String value = normalizeMultilineValue(entryMatcher.group(2));
206                    metadata.put(key, Collections.singletonList(value));
207                }
208                endOffset = metadataMatcher.end(0);
209            } else {
210                endOffset = 0;
211            }
212        }
213        if (endOffset > 0) {
214            // Trim the metadata from the source
215            source.delete(0, endOffset);
216        }
217        return writeHtmlMetadata(html, metadata);
218    }
219
220    static String normalizeMultilineValue(String value) {
221        return value.trim().replaceAll("[ \\t]*[\\r\\n]+[ \\t]*", " ");
222    }
223
224    private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) {
225        boolean containsTitle = false;
226        for (Entry<String, List<String>> entry : data.entrySet()) {
227            if (writeHtmlMetadata(html, entry.getKey(), entry.getValue())) {
228                containsTitle = true;
229            }
230        }
231        return containsTitle;
232    }
233
234    private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) {
235        if ("title".equalsIgnoreCase(key)) {
236            html.append("<title>");
237            html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(", ")), false));
238            html.append("</title>");
239            return true;
240        } else {
241            if (key.equalsIgnoreCase("author") && values.size() > 1) {
242                // for multiple authors emit multiple meta tags
243                for (String value : values) {
244                    writeHtmlMetadata(html, key, Collections.singletonList(value));
245                }
246            } else {
247                // every other multi-value should just be concatenated and emitted in a single meta tag
248                final String separator;
249                if (key.equalsIgnoreCase("keywords")) {
250                    separator = ",";
251                } else {
252                    separator = EOL;
253                }
254                html.append("<meta name='");
255                html.append(HtmlTools.escapeHTML(key));
256                html.append("' content='");
257                html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(separator))));
258                html.append("' />");
259            }
260            return false;
261        }
262    }
263
264    /**
265     * uses flexmark-java library to parse content and generate HTML output.
266     *
267     * @param source the Markdown source
268     * @return HTML content generated by flexmark-java
269     * @throws IOException passed through
270     */
271    String toHtml(Reader source) throws IOException {
272        // Read the source
273        StringBuilder markdownText = new StringBuilder(IOUtils.toString(source));
274
275        // Now, build the HTML document
276        StringBuilder html = new StringBuilder(1000);
277        html.append("<html>");
278        html.append("<head>");
279
280        boolean haveTitle = processMetadataForHtml(html, markdownText);
281
282        // Now is the time to parse the Markdown document
283        // (after we've trimmed out the metadatas, and before we check for its headings)
284        Node documentRoot = FLEXMARK_PARSER.parse(markdownText.toString());
285
286        // Special trick: if there is no title specified as a metadata in the header, we will use the first
287        // heading as the document title
288        if (!haveTitle && documentRoot.hasChildren()) {
289            // Skip the comment nodes
290            Node firstNode = documentRoot.getFirstChild();
291            while (firstNode != null && firstNode instanceof HtmlCommentBlock) {
292                firstNode = firstNode.getNext();
293            }
294
295            // If this first non-comment node is a heading, we use it as the document title
296            if (firstNode != null && firstNode instanceof Heading) {
297                html.append("<title>");
298                TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
299                String headingText = collectingVisitor.collectAndGetText(firstNode);
300                html.append(HtmlTools.escapeHTML(headingText, false));
301                html.append("</title>");
302            }
303        }
304        html.append("</head>");
305        html.append("<body>");
306
307        // Convert our Markdown document to HTML and append it to our HTML
308        FLEXMARK_HTML_RENDERER.render(documentRoot, html);
309
310        html.append("</body>");
311        html.append("</html>");
312
313        return html.toString();
314    }
315
316    /**
317     * Internal parser for HTML generated by the Markdown library.
318     *
319     * 2 special things:
320     * <ul>
321     * <li> DIV elements are translated as Unknown Sink events
322     * </ul>
323     * PRE elements need to be "source" because the Xhtml5Sink will surround the
324     * corresponding verbatim() Sink event with a DIV element with class="source",
325     * which is how most Maven Skin (incl. Fluido) recognize a block of code, which
326     * needs to be highlighted accordingly.
327     */
328    @Named
329    public static class MarkdownHtmlParser extends Xhtml5Parser {
330        public MarkdownHtmlParser() {
331            super();
332        }
333
334        @Override
335        protected void init() {
336            super.init();
337        }
338
339        @Override
340        protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
341            boolean visited = super.baseEndTag(parser, sink);
342            if (!visited) {
343                if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
344                    handleUnknown(parser, sink, TAG_TYPE_END);
345                    visited = true;
346                }
347            }
348            return visited;
349        }
350
351        @Override
352        protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
353            boolean visited = super.baseStartTag(parser, sink);
354            if (!visited) {
355                if (parser.getName().equals(HtmlMarkup.DIV.toString())) {
356                    handleUnknown(parser, sink, TAG_TYPE_START);
357                    visited = true;
358                }
359            }
360            return visited;
361        }
362    }
363}