001package org.apache.maven.doxia.module.markdown; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import org.apache.commons.lang.StringEscapeUtils; 023import org.apache.commons.lang.StringUtils; 024import org.apache.maven.doxia.macro.MacroExecutionException; 025import org.apache.maven.doxia.markup.HtmlMarkup; 026import org.apache.maven.doxia.module.xhtml.XhtmlParser; 027import org.apache.maven.doxia.parser.ParseException; 028import org.apache.maven.doxia.parser.Parser; 029import org.apache.maven.doxia.sink.Sink; 030import org.codehaus.plexus.component.annotations.Component; 031import org.codehaus.plexus.util.IOUtil; 032import org.codehaus.plexus.util.xml.pull.XmlPullParser; 033import org.codehaus.plexus.util.xml.pull.XmlPullParserException; 034import org.pegdown.Extensions; 035import org.pegdown.PegDownProcessor; 036import org.pegdown.ast.HeaderNode; 037import org.pegdown.ast.HtmlBlockNode; 038import org.pegdown.ast.Node; 039import org.pegdown.ast.RootNode; 040import org.pegdown.ast.SuperNode; 041import org.pegdown.ast.TextNode; 042 043import java.io.IOException; 044import java.io.Reader; 045import java.io.StringReader; 046import java.util.regex.Matcher; 047import java.util.regex.Pattern; 048 049/** 050 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents. 051 * <p/> 052 * Defers parsing to the <a href="http://pegdown.org">PegDown library</a>. 053 * 054 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com> 055 * @since 1.3 056 */ 057@Component( role = Parser.class, hint = "markdown" ) 058public class MarkdownParser 059 extends XhtmlParser 060{ 061 062 /** 063 * The role hint for the {@link MarkdownParser} Plexus component. 064 */ 065 public static final String ROLE_HINT = "markdown"; 066 067 /** 068 * The {@link PegDownProcessor} used to convert Pegdown documents to HTML. 069 */ 070 protected static final PegDownProcessor PEGDOWN_PROCESSOR = 071 new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE ); 072 073 /** 074 * Regex that identifies a multimarkdown-style metadata section at the start of the document 075 */ 076 private static final String MULTI_MARKDOWN_METADATA_SECTION = 077 "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)"; 078 079 /** 080 * Regex that captures the key and value of a multimarkdown-style metadata entry. 081 */ 082 private static final String MULTI_MARKDOWN_METADATA_ENTRY = 083 "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n"; 084 085 /** 086 * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the 087 * first key in the metadata section must be one of these standard keys or else the entire metadata section is 088 * ignored. 089 */ 090 private static final String[] STANDARD_METADATA_KEYS = 091 { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone", 092 "subtitle" }; 093 094 095 /** 096 * {@inheritDoc} 097 */ 098 @Override 099 public void parse( Reader source, Sink sink ) 100 throws ParseException 101 { 102 try 103 { 104 String text = IOUtil.toString( source ); 105 StringBuilder html = new StringBuilder( text.length() * 2 ); 106 html.append( "<html>" ); 107 html.append( "<head>" ); 108 Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE ); 109 Matcher metadataMatcher = metadataPattern.matcher( text ); 110 boolean haveTitle = false; 111 if ( metadataMatcher.find() ) 112 { 113 metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE ); 114 Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) ); 115 boolean first = true; 116 while ( lineMatcher.find() ) 117 { 118 String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) ); 119 if ( first ) 120 { 121 boolean found = false; 122 for ( String k : STANDARD_METADATA_KEYS ) 123 { 124 if ( k.equalsIgnoreCase( key ) ) 125 { 126 found = true; 127 break; 128 } 129 } 130 if ( !found ) 131 { 132 break; 133 } 134 first = false; 135 } 136 String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) ); 137 if ( "title".equalsIgnoreCase( key ) ) 138 { 139 haveTitle = true; 140 html.append( "<title>" ); 141 html.append( StringEscapeUtils.escapeXml( value ) ); 142 html.append( "</title>" ); 143 } 144 else if ( "author".equalsIgnoreCase( key ) ) 145 { 146 html.append( "<meta name=\'author\' content=\'" ); 147 html.append( StringEscapeUtils.escapeXml( value ) ); 148 html.append( "\' />" ); 149 } 150 else if ( "date".equalsIgnoreCase( key ) ) 151 { 152 html.append( "<meta name=\'date\' content=\'" ); 153 html.append( StringEscapeUtils.escapeXml( value ) ); 154 html.append( "\' />" ); 155 } 156 else 157 { 158 html.append( "<meta name=\'" ); 159 html.append( StringEscapeUtils.escapeXml( key ) ); 160 html.append( "\' content=\'" ); 161 html.append( StringEscapeUtils.escapeXml( value ) ); 162 html.append( "\' />" ); 163 } 164 } 165 if ( !first ) 166 { 167 text = text.substring( metadataMatcher.end() ); 168 } 169 } 170 RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() ); 171 if ( !haveTitle && rootNode.getChildren().size() > 0 ) 172 { 173 // use the first (non-comment) node only if it is a heading 174 int i = 0; 175 Node firstNode = null; 176 while ( i < rootNode.getChildren().size() && isHtmlComment( 177 ( firstNode = rootNode.getChildren().get( i ) ) ) ) 178 { 179 i++; 180 } 181 if ( firstNode instanceof HeaderNode ) 182 { 183 html.append( "<title>" ); 184 html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) ); 185 html.append( "</title>" ); 186 } 187 } 188 html.append( "</head>" ); 189 html.append( "<body>" ); 190 html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) ); 191 html.append( "</body>" ); 192 html.append( "</html>" ); 193 super.parse( new StringReader( html.toString() ), sink ); 194 } 195 catch ( IOException e ) 196 { 197 throw new ParseException( "Failed reading Markdown source document", e ); 198 } 199 } 200 201 public static boolean isHtmlComment( Node node ) { 202 if (node instanceof HtmlBlockNode) { 203 HtmlBlockNode blockNode = (HtmlBlockNode) node; 204 return blockNode.getText().startsWith( "<!--" ); 205 } 206 return false; 207 } 208 209 public static String nodeText( Node node ) 210 { 211 StringBuilder builder = new StringBuilder(); 212 if ( node instanceof TextNode ) 213 { 214 builder.append( TextNode.class.cast( node ).getText() ); 215 } 216 else 217 { 218 for ( Node n : node.getChildren() ) 219 { 220 if ( n instanceof TextNode ) 221 { 222 builder.append( TextNode.class.cast( n ).getText() ); 223 } 224 else if ( n instanceof SuperNode ) 225 { 226 builder.append( nodeText( n ) ); 227 } 228 } 229 } 230 return builder.toString(); 231 } 232 233 @Override 234 protected boolean baseEndTag( XmlPullParser parser, Sink sink ) 235 { 236 boolean visited = super.baseEndTag( parser, sink ); 237 if ( !visited ) 238 { 239 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) 240 { 241 handleUnknown( parser, sink, TAG_TYPE_END ); 242 visited = true; 243 } 244 } 245 return visited; 246 } 247 248 @Override 249 protected boolean baseStartTag(XmlPullParser parser, Sink sink) { 250 boolean visited = super.baseStartTag( parser, sink ); 251 if ( !visited ) 252 { 253 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) 254 { 255 handleUnknown( parser, sink, TAG_TYPE_START ); 256 visited = true; 257 } 258 } 259 return visited; 260 } 261}