001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.maven.doxia.module.markdown; 020 021import javax.inject.Inject; 022import javax.inject.Named; 023import javax.inject.Singleton; 024 025import java.io.IOException; 026import java.io.Reader; 027import java.util.Arrays; 028import java.util.Collections; 029import java.util.LinkedHashMap; 030import java.util.List; 031import java.util.Map; 032import java.util.Map.Entry; 033import java.util.regex.Matcher; 034import java.util.regex.Pattern; 035import java.util.stream.Collectors; 036 037import com.vladsch.flexmark.ast.Heading; 038import com.vladsch.flexmark.ast.HtmlCommentBlock; 039import com.vladsch.flexmark.ast.util.TextCollectingVisitor; 040import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension; 041import com.vladsch.flexmark.ext.autolink.AutolinkExtension; 042import com.vladsch.flexmark.ext.definition.DefinitionExtension; 043import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension; 044import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension; 045import com.vladsch.flexmark.ext.tables.TablesExtension; 046import com.vladsch.flexmark.ext.typographic.TypographicExtension; 047import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension; 048import com.vladsch.flexmark.ext.yaml.front.matter.YamlFrontMatterExtension; 049import com.vladsch.flexmark.html.HtmlRenderer; 050import com.vladsch.flexmark.util.ast.Node; 051import com.vladsch.flexmark.util.data.MutableDataSet; 052import org.apache.commons.io.IOUtils; 053import org.apache.maven.doxia.markup.HtmlMarkup; 054import org.apache.maven.doxia.markup.TextMarkup; 055import org.apache.maven.doxia.module.xhtml5.Xhtml5Parser; 056import org.apache.maven.doxia.parser.AbstractTextParser; 057import org.apache.maven.doxia.parser.ParseException; 058import org.apache.maven.doxia.sink.Sink; 059import org.apache.maven.doxia.util.HtmlTools; 060import org.codehaus.plexus.util.xml.pull.XmlPullParser; 061 062/** 063 * <p> 064 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents. 065 * </p> 066 * <p> 067 * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>, 068 * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml5 parser. 069 * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used) 070 * </p> 071 * 072 * @author Vladimir Schneider 073 * @author Julien Nicoulaud 074 * @since 1.3 075 */ 076@Singleton 077@Named("markdown") 078public class MarkdownParser extends AbstractTextParser implements TextMarkup { 079 080 /** 081 * Regex that identifies a multimarkdown-style metadata section at the start of the document 082 * 083 * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the 084 * first key in the metadata section must be one of these standard keys or else the entire metadata section is 085 * ignored. 086 * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a> 087 */ 088 private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile( 089 "\\A^" 090 + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)" 091 + "[ \\t]*:[\\S\\s]+?^[ \\t]*$", 092 Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); 093 094 /** 095 * Regex that captures the key and value of a multimarkdown-style metadata entry. 096 * Group 1 captures the key, group 2 captures the value. Multivalues are not supported in the syntax! 097 * Multiline values need to be normalized 098 * @see <a href="https://fletcher.github.io/MultiMarkdown-5/metadata.html">Multimarkdown Metadata</a> 099 * 100 */ 101 private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile( 102 "^([^:\\r\\n]+?)[ \\t]*:([\\S\\s]+?)(?=(?:^(?:[^:\\r\\n]+?)[ \\t]*:)|^[ \\t]*$)", Pattern.MULTILINE); 103 104 /** 105 * The parser of the HTML produced by Flexmark, that we will 106 * use to convert this HTML to Sink events 107 */ 108 @Inject 109 private MarkdownHtmlParser parser; 110 111 /** 112 * Flexmark's Markdown parser (one static instance fits all) 113 */ 114 private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER; 115 116 /** 117 * Flexmark's Markdown Metadata parser 118 */ 119 private static final com.vladsch.flexmark.parser.Parser FLEXMARK_METADATA_PARSER; 120 121 /** 122 * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events) 123 */ 124 private static final HtmlRenderer FLEXMARK_HTML_RENDERER; 125 126 // Initialize the Flexmark parser and renderer, once and for all 127 static { 128 MutableDataSet flexmarkOptions = new MutableDataSet(); 129 130 // Enable the extensions that we used to have in Pegdown 131 flexmarkOptions.set( 132 com.vladsch.flexmark.parser.Parser.EXTENSIONS, 133 Arrays.asList( 134 EscapedCharacterExtension.create(), 135 AbbreviationExtension.create(), 136 AutolinkExtension.create(), 137 DefinitionExtension.create(), 138 TypographicExtension.create(), 139 TablesExtension.create(), 140 WikiLinkExtension.create(), 141 StrikethroughExtension.create())); 142 143 // Disable wrong apostrophe replacement 144 flexmarkOptions.set(TypographicExtension.SINGLE_QUOTE_UNMATCHED, "'"); 145 146 // Additional options on the HTML rendering 147 flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false); 148 flexmarkOptions.set(HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false); 149 flexmarkOptions.set(HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1); 150 151 // Build the Markdown parser 152 FLEXMARK_PARSER = 153 com.vladsch.flexmark.parser.Parser.builder(flexmarkOptions).build(); 154 155 MutableDataSet flexmarkMetadataOptions = new MutableDataSet(); 156 flexmarkMetadataOptions.set( 157 com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(YamlFrontMatterExtension.create())); 158 FLEXMARK_METADATA_PARSER = com.vladsch.flexmark.parser.Parser.builder(flexmarkMetadataOptions) 159 .build(); 160 161 // Build the HTML renderer 162 FLEXMARK_HTML_RENDERER = HtmlRenderer.builder(flexmarkOptions) 163 .linkResolverFactory(new FlexmarkDoxiaLinkResolver.Factory()) 164 .build(); 165 } 166 167 /** {@inheritDoc} */ 168 @Override 169 public void parse(Reader source, Sink sink, String reference) throws ParseException { 170 try { 171 // Markdown to HTML (using flexmark-java library) 172 String html = toHtml(source); 173 174 // TODO: add locator for the markdown source (not the intermediate HTML format) 175 // this requires writing a custom renderer not leveraging the XHTML parser 176 177 // then HTML to Sink API 178 parser.parse(html, getWrappedSink(sink), "Intermediate HTML from " + reference); 179 } catch (IOException e) { 180 throw new ParseException("Failed reading Markdown source document", e); 181 } 182 } 183 184 private boolean processMetadataForHtml(StringBuilder html, StringBuilder source) { 185 final Map<String, List<String>> metadata; 186 final int endOffset; // end of metadata within source 187 // support two types of metadata: 188 if (source.toString().startsWith("---")) { 189 // 1. YAML front matter (https://github.com/vsch/flexmark-java/wiki/Extensions#yaml-front-matter) 190 Node documentRoot = FLEXMARK_METADATA_PARSER.parse(source.toString()); 191 YamlFrontMatterVisitor visitor = new YamlFrontMatterVisitor(); 192 visitor.visit(documentRoot); 193 metadata = visitor.getData(); 194 endOffset = visitor.getEndOffset(); 195 } else { 196 // 2. Multimarkdown metadata (https://fletcher.github.io/MultiMarkdown-5/metadata.html), not yet supported 197 // by Flexmark (https://github.com/vsch/flexmark-java/issues/550) 198 metadata = new LinkedHashMap<>(); 199 Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher(source); 200 if (metadataMatcher.find()) { 201 String entry = metadataMatcher.group(0) + EOL; 202 Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher(entry); 203 while (entryMatcher.find()) { 204 String key = entryMatcher.group(1); 205 String value = normalizeMultilineValue(entryMatcher.group(2)); 206 metadata.put(key, Collections.singletonList(value)); 207 } 208 endOffset = metadataMatcher.end(0); 209 } else { 210 endOffset = 0; 211 } 212 } 213 if (endOffset > 0) { 214 // Trim the metadata from the source 215 source.delete(0, endOffset); 216 } 217 return writeHtmlMetadata(html, metadata); 218 } 219 220 static String normalizeMultilineValue(String value) { 221 return value.trim().replaceAll("[ \\t]*[\\r\\n]+[ \\t]*", " "); 222 } 223 224 private boolean writeHtmlMetadata(StringBuilder html, Map<String, List<String>> data) { 225 boolean containsTitle = false; 226 for (Entry<String, List<String>> entry : data.entrySet()) { 227 if (writeHtmlMetadata(html, entry.getKey(), entry.getValue())) { 228 containsTitle = true; 229 } 230 } 231 return containsTitle; 232 } 233 234 private boolean writeHtmlMetadata(StringBuilder html, String key, List<String> values) { 235 if ("title".equalsIgnoreCase(key)) { 236 html.append("<title>"); 237 html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(", ")), false)); 238 html.append("</title>"); 239 return true; 240 } else { 241 if (key.equalsIgnoreCase("author") && values.size() > 1) { 242 // for multiple authors emit multiple meta tags 243 for (String value : values) { 244 writeHtmlMetadata(html, key, Collections.singletonList(value)); 245 } 246 } else { 247 // every other multi-value should just be concatenated and emitted in a single meta tag 248 final String separator; 249 if (key.equalsIgnoreCase("keywords")) { 250 separator = ","; 251 } else { 252 separator = EOL; 253 } 254 html.append("<meta name='"); 255 html.append(HtmlTools.escapeHTML(key)); 256 html.append("' content='"); 257 html.append(HtmlTools.escapeHTML(values.stream().collect(Collectors.joining(separator)))); 258 html.append("' />"); 259 } 260 return false; 261 } 262 } 263 264 /** 265 * uses flexmark-java library to parse content and generate HTML output. 266 * 267 * @param source the Markdown source 268 * @return HTML content generated by flexmark-java 269 * @throws IOException passed through 270 */ 271 String toHtml(Reader source) throws IOException { 272 // Read the source 273 StringBuilder markdownText = new StringBuilder(IOUtils.toString(source)); 274 275 // Now, build the HTML document 276 StringBuilder html = new StringBuilder(1000); 277 html.append("<html>"); 278 html.append("<head>"); 279 280 boolean haveTitle = processMetadataForHtml(html, markdownText); 281 282 // Now is the time to parse the Markdown document 283 // (after we've trimmed out the metadatas, and before we check for its headings) 284 Node documentRoot = FLEXMARK_PARSER.parse(markdownText.toString()); 285 286 // Special trick: if there is no title specified as a metadata in the header, we will use the first 287 // heading as the document title 288 if (!haveTitle && documentRoot.hasChildren()) { 289 // Skip the comment nodes 290 Node firstNode = documentRoot.getFirstChild(); 291 while (firstNode != null && firstNode instanceof HtmlCommentBlock) { 292 firstNode = firstNode.getNext(); 293 } 294 295 // If this first non-comment node is a heading, we use it as the document title 296 if (firstNode != null && firstNode instanceof Heading) { 297 html.append("<title>"); 298 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor(); 299 String headingText = collectingVisitor.collectAndGetText(firstNode); 300 html.append(HtmlTools.escapeHTML(headingText, false)); 301 html.append("</title>"); 302 } 303 } 304 html.append("</head>"); 305 html.append("<body>"); 306 307 // Convert our Markdown document to HTML and append it to our HTML 308 FLEXMARK_HTML_RENDERER.render(documentRoot, html); 309 310 html.append("</body>"); 311 html.append("</html>"); 312 313 return html.toString(); 314 } 315 316 /** 317 * Internal parser for HTML generated by the Markdown library. 318 * 319 * 2 special things: 320 * <ul> 321 * <li> DIV elements are translated as Unknown Sink events 322 * </ul> 323 * PRE elements need to be "source" because the Xhtml5Sink will surround the 324 * corresponding verbatim() Sink event with a DIV element with class="source", 325 * which is how most Maven Skin (incl. Fluido) recognize a block of code, which 326 * needs to be highlighted accordingly. 327 */ 328 @Named 329 public static class MarkdownHtmlParser extends Xhtml5Parser { 330 public MarkdownHtmlParser() { 331 super(); 332 } 333 334 @Override 335 protected void init() { 336 super.init(); 337 } 338 339 @Override 340 protected boolean baseEndTag(XmlPullParser parser, Sink sink) { 341 boolean visited = super.baseEndTag(parser, sink); 342 if (!visited) { 343 if (parser.getName().equals(HtmlMarkup.DIV.toString())) { 344 handleUnknown(parser, sink, TAG_TYPE_END); 345 visited = true; 346 } 347 } 348 return visited; 349 } 350 351 @Override 352 protected boolean baseStartTag(XmlPullParser parser, Sink sink) { 353 boolean visited = super.baseStartTag(parser, sink); 354 if (!visited) { 355 if (parser.getName().equals(HtmlMarkup.DIV.toString())) { 356 handleUnknown(parser, sink, TAG_TYPE_START); 357 visited = true; 358 } 359 } 360 return visited; 361 } 362 } 363}