package org.apache.maven.doxia.module.markdown;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.module.xhtml.XhtmlParser;
import org.apache.maven.doxia.parser.AbstractParser;
import org.apache.maven.doxia.parser.ParseException;
import org.apache.maven.doxia.parser.Parser;
import org.apache.maven.doxia.sink.Sink;
import org.codehaus.plexus.component.annotations.Component;
import org.codehaus.plexus.component.annotations.Requirement;
import org.codehaus.plexus.util.IOUtil;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.pegdown.Extensions;
import org.pegdown.PegDownProcessor;
import org.pegdown.ast.HeaderNode;
import org.pegdown.ast.HtmlBlockNode;
import org.pegdown.ast.Node;
import org.pegdown.ast.RootNode;
import org.pegdown.ast.SuperNode;
import org.pegdown.ast.TextNode;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
 * <p/>
 * Defers effective parsing to the <a href="http://pegdown.org">PegDown library</a>, which generates HTML content
 * then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
 * <p/>
 * An optional MultiMarkdown-style metadata section at the very start of the document is converted to
 * <code>&lt;title&gt;</code> / <code>&lt;meta&gt;</code> elements of the generated HTML head and removed from the
 * Markdown body before it is handed to PegDown.
 *
 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
 * @since 1.3
 * @see MarkdownToDoxiaHtmlSerializer
 */
@Component( role = Parser.class, hint = "markdown" )
public class MarkdownParser
    extends AbstractParser
{

    /**
     * The role hint for the {@link MarkdownParser} Plexus component.
     */
    public static final String ROLE_HINT = "markdown";

    /**
     * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
     * <p/>
     * NOTE(review): this single instance is shared by every parser instance; the PegDown documentation states that
     * a processor must not be accessed concurrently - confirm that callers never parse in parallel.
     */
    protected static final PegDownProcessor PEGDOWN_PROCESSOR =
        new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );

    /**
     * Regex that identifies a multimarkdown-style metadata section at the start of the document
     */
    private static final String MULTI_MARKDOWN_METADATA_SECTION =
        "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";

    /**
     * Regex that captures the key and value of a multimarkdown-style metadata entry.
     */
    private static final String MULTI_MARKDOWN_METADATA_ENTRY =
        "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";

    /**
     * Compiled form of {@link #MULTI_MARKDOWN_METADATA_SECTION}, built once instead of on every parse.
     */
    private static final Pattern METADATA_SECTION_PATTERN =
        Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );

    /**
     * Compiled form of {@link #MULTI_MARKDOWN_METADATA_ENTRY}, built once instead of on every parse.
     */
    private static final Pattern METADATA_ENTRY_PATTERN =
        Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );

    /**
     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
     * first key in the metadata section must be one of these standard keys or else the entire metadata section is
     * ignored.
     */
    private static final String[] STANDARD_METADATA_KEYS =
        { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
            "subtitle" };

    /**
     * The HTML parser that feeds the HTML generated by PegDown into the Doxia {@link Sink}.
     */
    @Requirement
    private PegDownHtmlParser parser;

    /**
     * Returns the type of this parser.
     *
     * @return always {@link #TXT_TYPE}
     */
    public int getType()
    {
        return TXT_TYPE;
    }

    /**
     * Parses a Markdown document: the source is first converted to HTML, then that HTML is parsed into the sink.
     *
     * @param source the Markdown source; it is read completely but not closed by this method
     * @param sink the sink that receives the parsing events
     * @throws ParseException if the source cannot be read, or if the generated HTML cannot be parsed
     */
    public void parse( Reader source, Sink sink )
        throws ParseException
    {
        try
        {
            // Markdown to HTML (using Pegdown library)
            String html = toHtml( source );
            // then HTML to Sink API
            parser.parse( new StringReader( html ), sink );
        }
        catch ( IOException e )
        {
            throw new ParseException( "Failed reading Markdown source document", e );
        }
    }

    /**
     * Uses PegDown library to parse content and generate HTML output.
     * <p/>
     * The head of the generated document contains the entries of the leading metadata section, if there is one
     * whose first key is one of {@link #STANDARD_METADATA_KEYS}. When the metadata provide no title, the text of
     * the first node that is not an HTML comment is used as title, provided that node is a heading.
     *
     * @param source the Markdown source
     * @return HTML content generated by PegDown
     * @throws IOException if the source cannot be read
     * @see MarkdownToDoxiaHtmlSerializer
     */
    private String toHtml( Reader source )
        throws IOException
    {
        String text = IOUtil.toString( source );
        StringBuilder html = new StringBuilder( text.length() * 2 );
        html.append( "<html>" );
        html.append( "<head>" );

        boolean haveTitle = false;
        Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher( text );
        // lookingAt() anchors the match at offset 0. With find() and MULTILINE, '^' also matches after any line
        // terminator, so a metadata-looking paragraph further down (e.g. after a leading indented code block)
        // would be accepted and everything in front of it dropped by the substring() call below.
        if ( metadataMatcher.lookingAt() )
        {
            Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher( metadataMatcher.group( 1 ) );
            boolean accepted = false;
            while ( entryMatcher.find() )
            {
                String key = StringUtils.trimToEmpty( entryMatcher.group( 1 ) );
                if ( !accepted )
                {
                    // only the first key decides whether the whole section is treated as metadata
                    if ( !isStandardMetadataKey( key ) )
                    {
                        break;
                    }
                    accepted = true;
                }

                String value = StringUtils.trimToEmpty( entryMatcher.group( 2 ) );
                if ( "title".equalsIgnoreCase( key ) )
                {
                    haveTitle = true;
                    appendTitle( html, value );
                }
                else if ( "author".equalsIgnoreCase( key ) )
                {
                    // well-known names are emitted in lower case, whatever the case used in the document
                    appendMeta( html, "author", value );
                }
                else if ( "date".equalsIgnoreCase( key ) )
                {
                    appendMeta( html, "date", value );
                }
                else
                {
                    appendMeta( html, key, value );
                }
            }
            if ( accepted )
            {
                // the metadata section has been consumed: keep it out of the Markdown body
                text = text.substring( metadataMatcher.end() );
            }
        }

        RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
        if ( !haveTitle )
        {
            appendTitleFromFirstHeading( html, rootNode );
        }
        html.append( "</head>" );
        html.append( "<body>" );
        html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
        html.append( "</body>" );
        html.append( "</html>" );

        return html.toString();
    }

    /**
     * Tells whether a metadata key is one of the {@link #STANDARD_METADATA_KEYS}, ignoring case.
     *
     * @param key the trimmed metadata key
     * @return <code>true</code> if the key is a standard one
     */
    private static boolean isStandardMetadataKey( String key )
    {
        for ( String standardKey : STANDARD_METADATA_KEYS )
        {
            if ( standardKey.equalsIgnoreCase( key ) )
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends a <code>&lt;title&gt;</code> element with XML-escaped content.
     *
     * @param html the buffer to append to
     * @param title the raw title text
     */
    private static void appendTitle( StringBuilder html, String title )
    {
        html.append( "<title>" );
        html.append( StringEscapeUtils.escapeXml( title ) );
        html.append( "</title>" );
    }

    /**
     * Appends a <code>&lt;meta&gt;</code> element with XML-escaped name and content.
     *
     * @param html the buffer to append to
     * @param name the raw value of the <code>name</code> attribute
     * @param content the raw value of the <code>content</code> attribute
     */
    private static void appendMeta( StringBuilder html, String name, String content )
    {
        html.append( "<meta name='" );
        html.append( StringEscapeUtils.escapeXml( name ) );
        html.append( "' content='" );
        html.append( StringEscapeUtils.escapeXml( content ) );
        html.append( "' />" );
    }

    /**
     * Appends a <code>&lt;title&gt;</code> element built from the first node that is not an HTML comment, but only
     * if that node is a heading. Appends nothing otherwise.
     *
     * @param html the buffer to append to
     * @param rootNode the root of the parsed Markdown document
     */
    private static void appendTitleFromFirstHeading( StringBuilder html, RootNode rootNode )
    {
        for ( Node child : rootNode.getChildren() )
        {
            if ( isHtmlComment( child ) )
            {
                continue;
            }
            if ( child instanceof HeaderNode )
            {
                appendTitle( html, nodeText( child ) );
            }
            // only the first non-comment node is considered
            break;
        }
    }

    /**
     * Tells whether a node is an HTML block whose text starts with <code>&lt;!--</code>.
     *
     * @param node the node to check
     * @return <code>true</code> if the node is an {@link HtmlBlockNode} starting with an HTML comment opening
     */
    public static boolean isHtmlComment( Node node )
    {
        if ( node instanceof HtmlBlockNode )
        {
            HtmlBlockNode blockNode = (HtmlBlockNode) node;
            return blockNode.getText().startsWith( "<!--" );
        }
        return false;
    }

    /**
     * Collects the text of a node: the text of the node itself if it is a {@link TextNode}, otherwise the
     * concatenated text of its {@link TextNode} children and, recursively, of its {@link SuperNode} children.
     * Children of any other type are ignored.
     *
     * @param node the node to extract text from
     * @return the collected text, possibly empty
     */
    public static String nodeText( Node node )
    {
        if ( node instanceof TextNode )
        {
            return ( (TextNode) node ).getText();
        }

        StringBuilder builder = new StringBuilder();
        for ( Node child : node.getChildren() )
        {
            if ( child instanceof TextNode )
            {
                builder.append( ( (TextNode) child ).getText() );
            }
            else if ( child instanceof SuperNode )
            {
                builder.append( nodeText( child ) );
            }
        }
        return builder.toString();
    }

    /**
     * Internal parser for HTML generated by PegDown library.
     * <p/>
     * It differs from {@link XhtmlParser} in that <code>div</code> start and end tags not handled by the parent
     * class are passed to <code>handleUnknown</code> and reported as handled.
     */
    @Component( role = PegDownHtmlParser.class )
    public static class PegDownHtmlParser
        extends XhtmlParser
    {
        public PegDownHtmlParser()
        {
            super();
        }

        @Override
        protected boolean baseEndTag( XmlPullParser parser, Sink sink )
        {
            if ( super.baseEndTag( parser, sink ) )
            {
                return true;
            }
            if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
            {
                handleUnknown( parser, sink, TAG_TYPE_END );
                return true;
            }
            return false;
        }

        @Override
        protected boolean baseStartTag( XmlPullParser parser, Sink sink )
        {
            if ( super.baseStartTag( parser, sink ) )
            {
                return true;
            }
            if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
            {
                handleUnknown( parser, sink, TAG_TYPE_START );
                return true;
            }
            return false;
        }
    }
}