001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.maven.doxia.module.markdown; 020 021import javax.inject.Inject; 022import javax.inject.Named; 023import javax.inject.Singleton; 024 025import java.io.IOException; 026import java.io.Reader; 027import java.util.Arrays; 028import java.util.Collections; 029import java.util.LinkedHashMap; 030import java.util.List; 031import java.util.Map; 032import java.util.Map.Entry; 033import java.util.regex.Matcher; 034import java.util.regex.Pattern; 035import java.util.stream.Collectors; 036 037import com.vladsch.flexmark.ast.Heading; 038import com.vladsch.flexmark.ast.HtmlCommentBlock; 039import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension; 040import com.vladsch.flexmark.ext.autolink.AutolinkExtension; 041import com.vladsch.flexmark.ext.definition.DefinitionExtension; 042import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension; 043import com.vladsch.flexmark.ext.footnotes.FootnoteExtension; 044import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension; 045import com.vladsch.flexmark.ext.tables.TablesExtension; 046import com.vladsch.flexmark.ext.typographic.TypographicExtension; 047import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension; 048import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension; 049import com.vladsch.flexmark.html.HtmlRenderer; 050import com.vladsch.flexmark.util.ast.Node; 051import com.vladsch.flexmark.util.ast.TextCollectingVisitor; 052import com.vladsch.flexmark.util.data.MutableDataSet; 053import org.apache.commons.io.IOUtils; 054import org.apache.maven.doxia.markup.HtmlMarkup; 055import org.apache.maven.doxia.markup.TextMarkup; 056import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser; 057import org.apache.maven.doxia.parser.AbstractTextParser; 058import org.apache.maven.doxia.parser.ParseException; 059import org.apache.maven.doxia.sink.Sink; 060import org.apache.maven.doxia.util.HtmlTools; 061import org.codehaus.plexus.util.xml.pull.XmlPullParser; 062 063/** 064 * <p> 065 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents. 066 * </p> 067 * <p> 068 * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>, 069 * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml5 parser. 070 * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used) 071 * </p> 072 * 073 * @author Vladimir Schneider 074 * @author Julien Nicoulaud 075 * @since 1.3 076 */ 077@Singleton 078@Named("markdown") 079public class MarkdownParser extends AbstractTextParser implements TextMarkup { 080 081 /** 082 * Regex that identifies a multimarkdown-style metadata section at the start of the document 083 * 084 * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the 085 * first key in the metadata section must be one of these standard keys or else the entire metadata section is 086 * ignored. 087 * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a> 088 */ 089 private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile( 090 "\\A^" 091 + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)" 092 + "[ \\t]*:[\\S\\s]+?^[ \\t]*$", 093 Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); 094 095 /** 096 * Regex that captures the key and value of a multimarkdown-style metadata entry. 097 * Group 1 captures the key, group 2 captures the value. Multivalues are not supported in the syntax! 098 * Multiline values need to be normalized 099 * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a> 100 * 101 */ 102 private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile( 103 "^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE); 104 105 /** 106 * The parser of the HTML produced by Flexmark, that we will 107 * use to convert this HTML to Sink events 108 */ 109 @Inject 110 private MarkdownHtmlParser parser; 111 112 /** 113 * Flexmark's Markdown parser (one static instance fits all) 114 */ 115 private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER; 116 117 /** 118 * Flexmark's Markdown Metadata parser 119 */ 120 private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER; 121 122 /** 123 * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events) 124 */ 125 private static final HtmlRenderer FLEXMARK_HTML_RENDERER; 126 127 // Initialize the Flexmark parser and renderer, once and for all 128 static { 129 MutableDataSet flexmarkOptions = new MutableDataSet(); 130 131 // Enable the extensions that we used to have in Pegdown 132 flexmarkOptions.set( 133 com.vladsch.flexmark.parser.Parser.EXTENSIONS, 134 Arrays.asList( 135 EscapedCharacterExtension.create(), 136 AbbreviationExtension.create(), 137 AutolinkExtension.create(), 138 DefinitionExtension.create(), 139 TypographicExtension.create(), 140 TablesExtension.create(), 141 WikiLinkExtension.create(), 142 FootnoteExtension.create(), 143 StrikethroughExtension.create())); 144 145 // Disable wrong apostrophe replacement 146 flexmarkOptions.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "'"); 147 148 // Additional options on the HTML rendering 149 flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false); 150 flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false); 151 flexmarkOptions.set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1); 152 flexmarkOptions.set(HtmlRenderer.FENCED_CODE_NO_LANGUAGE_CLASS, "nohighlight nocode"); 153 154 // Build the Markdown parser 155 FLEXMARK_PARSER = 156 com.vladsch.flexmark.parser.Parser.builder(flexmarkOptions).build(); 157 158 MutableDataSet flexmarkMetadataOptions = new MutableDataSet(); 159 flexmarkMetadataOptions.set( 160 com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create())); 161 FLEXMARK_METADATA_PARSER = com.vladsch.flexmark.parser.Parser.builder(flexmarkMetadataOptions) 162 .build(); 163 164 // Build the HTML renderer 165 FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(flexmarkOptions) 166 .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory()) 167 .build(); 168 } 169 170 /** {@inheritDoc} */ 171 @Override 172 public void parse(Reader source, Sink sink, String reference) throws ParseException { 173 try { 174 // Markdown to HTML (using flexmark-java library) 175 String html = toHtml(source); 176 177 // TODO: add locator for the markdown source (not the intermediate HTML format) 178 // this requires writing a custom renderer not leveraging the XHTML parser 179 180 // then HTML to Sink API 181 parser.parse(html, getWrappedSink(sink), "Intermediate HTML from " + reference); 182 } catch (IOException e) { 183 throw new ParseException("Failed reading Markdown source document", e); 184 } 185 } 186 187 private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) { 188 final Map<String, List<String>> metadata; 189 final int endOffset; // end of metadata within source 190 // support two types of metadata: 191 if (source.toString().startsWith("---")) { 192 // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter) 193 Node documentRoot = FLEXMARK_METADATA_PARSER.parse(source.toString()); 194 YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor(); 195 visitor.visit(documentRoot); 196 metadata = visitor.getData(); 197 endOffset = visitor.getEndOffset(); 198 } else { 199 // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported 200 // by Flexmark (https://github.com/vsch/flexmark-java/issues/550) 201 metadata = new LinkedHashMap<>(); 202 Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher(source); 203 if (metadataMatcher.find()) { 204 String entry = metadataMatcher.group(0) + EOL; 205 Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher(entry); 206 while (entryMatcher.find()) { 207 String key = entryMatcher.group(1); 208 String value = normalizeMultilineValue(entryMatcher.group(2)); 209 metadata.put(key, Collections.singletonList(value)); 210 } 211 endOffset = metadataMatcher.end(0); 212 } else { 213 endOffset = 0; 214 } 215 } 216 if (endOffset > 0) { 217 // Trim the metadata from the source 218 source.delete(0, endOffset); 219 } 220 return writeHtmlMetadata(html, metadata); 221 } 222 223 static String normalizeMultilineValue(String value) { 224 return value.trim().replaceAll("[ \\t]*[\\r\\n]+[ \\t]*", " "); 225 } 226 227 private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) { 228 boolean containsTitle = false; 229 for (Entry<String, List<String>> entry : data.entrySet()) { 230 if (writeHtmlMetadata(html, entry.getKey(), entry.getValue())) { 231 containsTitle = true; 232 } 233 } 234 return containsTitle; 235 } 236 237 private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) { 238 if ("title".equalsIgnoreCase(key)) { 239 html.append("<title>"); 240 html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(", ")), false)); 241 html.append("</title>"); 242 return true; 243 } else { 244 if (key.equalsIgnoreCase("author") && values.size() > 1) { 245 // for multiple authors emit multiple meta tags 246 for (String value : values) { 247 writeHtmlMetadata(html, key, Collections.singletonList(value)); 248 } 249 } else { 250 // every other multi-value should just be concatenated and emitted in a single meta tag 251 final String separator; 252 if (key.equalsIgnoreCase("keywords")) { 253 separator = ","; 254 } else { 255 separator = EOL; 256 } 257 html.append("<meta name='"); 258 html.append(HtmlTools.escapeHTML(key)); 259 html.append("' content='"); 260 html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(separator)))); 261 html.append("' />"); 262 } 263 return false; 264 } 265 } 266 267 /** 268 * uses flexmark-java library to parse content and generate HTML output. 269 * 270 * @param source the Markdown source 271 * @return HTML content generated by flexmark-java 272 * @throws IOException passed through 273 */ 274 String toHtml(Reader source) throws IOException { 275 // Read the source 276 StringBuilder markdownText = new StringBuilder(IOUtils.toString(source)); 277 278 // Now, build the HTML document 279 StringBuilder html = new StringBuilder(1000); 280 html.append("<html>"); 281 html.append("<head>"); 282 283 boolean haveTitle = processMetadataForHtml(html, markdownText); 284 285 // Now is the time to parse the Markdown document 286 // (after we've trimmed out the metadatas, and before we check for its headings) 287 Node documentRoot = FLEXMARK_PARSER.parse(markdownText.toString()); 288 289 // Special trick: if there is no title specified as a metadata in the header, we will use the first 290 // heading as the document title 291 if (!haveTitle && documentRoot.hasChildren()) { 292 // Skip the comment nodes 293 Node firstNode = documentRoot.getFirstChild(); 294 while (firstNode != null && firstNode instanceof HtmlCommentBlock) { 295 firstNode = firstNode.getNext(); 296 } 297 298 // If this first non-comment node is a heading, we use it as the document title 299 if (firstNode != null && firstNode instanceof Heading) { 300 html.append("<title>"); 301 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor(); 302 String headingText = collectingVisitor.collectAndGetText(firstNode); 303 html.append(HtmlTools.escapeHTML(headingText, false)); 304 html.append("</title>"); 305 } 306 } 307 html.append("</head>"); 308 html.append("<body>"); 309 310 // Convert our Markdown document to HTML and append it to our HTML 311 FLEXMARK_HTML_RENDERER.render(documentRoot, html); 312 313 html.append("</body>"); 314 html.append("</html>"); 315 316 return html.toString(); 317 } 318 319 /** 320 * Internal parser for HTML generated by the Markdown library. 321 * 322 * 2 special things: 323 * <ul> 324 * <li> DIV elements are translated as Unknown Sink events 325 * </ul> 326 * PRE elements need to be "source" because the Xhtml5Sink will surround the 327 * corresponding verbatim() Sink event with a DIV element with class="source", 328 * which is how most Maven Skin (incl. Fluido) recognize a block of code, which 329 * needs to be highlighted accordingly. 330 */ 331 @Named 332 public static class MarkdownHtmlParser extends Xhtml5Parser { 333 public MarkdownHtmlParser() { 334 super(); 335 } 336 337 @Override 338 protected void init() { 339 super.init(); 340 } 341 342 @Override 343 protected boolean baseEndTag(XmlPullParser parser, Sink sink) { 344 boolean visited = super.baseEndTag(parser, sink); 345 if (!visited) { 346 if (parser.getName().equals(HtmlMarkup.DIV.toString())) { 347 handleUnknown(parser, sink, TAG_TYPE_END); 348 visited = true; 349 } 350 } 351 return visited; 352 } 353 354 @Override 355 protected boolean baseStartTag(XmlPullParser parser, Sink sink) { 356 boolean visited = super.baseStartTag(parser, sink); 357 if (!visited) { 358 if (parser.getName().equals(HtmlMarkup.DIV.toString())) { 359 handleUnknown(parser, sink, TAG_TYPE_START); 360 visited = true; 361 } 362 } 363 return visited; 364 } 365 } 366}