View Javadoc
1   package org.apache.maven.doxia.module.markdown;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.apache.commons.lang.StringEscapeUtils;
23  import org.apache.commons.lang.StringUtils;
24  import org.apache.maven.doxia.macro.MacroExecutionException;
25  import org.apache.maven.doxia.markup.HtmlMarkup;
26  import org.apache.maven.doxia.module.xhtml.XhtmlParser;
27  import org.apache.maven.doxia.parser.ParseException;
28  import org.apache.maven.doxia.parser.Parser;
29  import org.apache.maven.doxia.sink.Sink;
30  import org.codehaus.plexus.component.annotations.Component;
31  import org.codehaus.plexus.util.IOUtil;
32  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
33  import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
34  import org.pegdown.Extensions;
35  import org.pegdown.PegDownProcessor;
36  import org.pegdown.ast.HeaderNode;
37  import org.pegdown.ast.HtmlBlockNode;
38  import org.pegdown.ast.Node;
39  import org.pegdown.ast.RootNode;
40  import org.pegdown.ast.SuperNode;
41  import org.pegdown.ast.TextNode;
42  
43  import java.io.IOException;
44  import java.io.Reader;
45  import java.io.StringReader;
46  import java.util.regex.Matcher;
47  import java.util.regex.Pattern;
48  
49  /**
50   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
51   * <p/>
52   * Defers parsing to the <a href="http://pegdown.org">PegDown library</a>.
53   *
54   * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
55   * @since 1.3
56   */
57  @Component( role = Parser.class, hint = "markdown" )
58  public class MarkdownParser
59      extends XhtmlParser
60  {
61  
62      /**
63       * The role hint for the {@link MarkdownParser} Plexus component.
64       */
65      public static final String ROLE_HINT = "markdown";
66  
67      /**
68       * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
69       */
70      protected static final PegDownProcessor PEGDOWN_PROCESSOR =
71          new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );
72  
73      /**
74       * Regex that identifies a multimarkdown-style metadata section at the start of the document
75       */
76      private static final String MULTI_MARKDOWN_METADATA_SECTION =
77          "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
78  
79      /**
80       * Regex that captures the key and value of a multimarkdown-style metadata entry.
81       */
82      private static final String MULTI_MARKDOWN_METADATA_ENTRY =
83          "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
84  
85      /**
86       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
87       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
88       * ignored.
89       */
90      private static final String[] STANDARD_METADATA_KEYS =
91          { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
92              "subtitle" };
93  
94  
95      /**
96       * {@inheritDoc}
97       */
98      @Override
99      public void parse( Reader source, Sink sink )
100         throws ParseException
101     {
102         try
103         {
104             String text = IOUtil.toString( source );
105             StringBuilder html = new StringBuilder( text.length() * 2 );
106             html.append( "<html>" );
107             html.append( "<head>" );
108             Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
109             Matcher metadataMatcher = metadataPattern.matcher( text );
110             boolean haveTitle = false;
111             if ( metadataMatcher.find() )
112             {
113                 metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
114                 Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
115                 boolean first = true;
116                 while ( lineMatcher.find() )
117                 {
118                     String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
119                     if ( first )
120                     {
121                         boolean found = false;
122                         for ( String k : STANDARD_METADATA_KEYS )
123                         {
124                             if ( k.equalsIgnoreCase( key ) )
125                             {
126                                 found = true;
127                                 break;
128                             }
129                         }
130                         if ( !found )
131                         {
132                             break;
133                         }
134                         first = false;
135                     }
136                     String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
137                     if ( "title".equalsIgnoreCase( key ) )
138                     {
139                         haveTitle = true;
140                         html.append( "<title>" );
141                         html.append( StringEscapeUtils.escapeXml( value ) );
142                         html.append( "</title>" );
143                     }
144                     else if ( "author".equalsIgnoreCase( key ) )
145                     {
146                         html.append( "<meta name=\'author\' content=\'" );
147                         html.append( StringEscapeUtils.escapeXml( value ) );
148                         html.append( "\' />" );
149                     }
150                     else if ( "date".equalsIgnoreCase( key ) )
151                     {
152                         html.append( "<meta name=\'date\' content=\'" );
153                         html.append( StringEscapeUtils.escapeXml( value ) );
154                         html.append( "\' />" );
155                     }
156                     else
157                     {
158                         html.append( "<meta name=\'" );
159                         html.append( StringEscapeUtils.escapeXml( key ) );
160                         html.append( "\' content=\'" );
161                         html.append( StringEscapeUtils.escapeXml( value ) );
162                         html.append( "\' />" );
163                     }
164                 }
165                 if ( !first )
166                 {
167                     text = text.substring( metadataMatcher.end() );
168                 }
169             }
170             RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
171             if ( !haveTitle && rootNode.getChildren().size() > 0 )
172             {
173                 // use the first (non-comment) node only if it is a heading
174                 int i = 0;
175                 Node firstNode = null;
176                 while ( i < rootNode.getChildren().size() && isHtmlComment(
177                     ( firstNode = rootNode.getChildren().get( i ) ) ) )
178                 {
179                     i++;
180                 }
181                 if ( firstNode instanceof HeaderNode )
182                 {
183                     html.append( "<title>" );
184                     html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
185                     html.append( "</title>" );
186                 }
187             }
188             html.append( "</head>" );
189             html.append( "<body>" );
190             html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
191             html.append( "</body>" );
192             html.append( "</html>" );
193             super.parse( new StringReader( html.toString() ), sink );
194         }
195         catch ( IOException e )
196         {
197             throw new ParseException( "Failed reading Markdown source document", e );
198         }
199     }
200 
201     public static boolean isHtmlComment( Node node ) {
202         if (node instanceof HtmlBlockNode) {
203             HtmlBlockNode blockNode = (HtmlBlockNode) node;
204             return blockNode.getText().startsWith( "<!--" );
205         }
206         return false;
207     }
208 
209     public static String nodeText( Node node )
210     {
211         StringBuilder builder = new StringBuilder();
212         if ( node instanceof TextNode )
213         {
214             builder.append( TextNode.class.cast( node ).getText() );
215         }
216         else
217         {
218             for ( Node n : node.getChildren() )
219             {
220                 if ( n instanceof TextNode )
221                 {
222                     builder.append( TextNode.class.cast( n ).getText() );
223                 }
224                 else if ( n instanceof SuperNode )
225                 {
226                     builder.append( nodeText( n ) );
227                 }
228             }
229         }
230         return builder.toString();
231     }
232 
233     @Override
234     protected boolean baseEndTag( XmlPullParser parser, Sink sink )
235     {
236         boolean visited = super.baseEndTag( parser, sink );
237         if ( !visited )
238         {
239             if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
240             {
241                 handleUnknown( parser, sink, TAG_TYPE_END );
242                 visited = true;
243             }
244         }
245         return visited;
246     }
247 
248     @Override
249     protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
250         boolean visited = super.baseStartTag( parser, sink );
251         if ( !visited )
252         {
253             if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
254             {
255                 handleUnknown( parser, sink, TAG_TYPE_START );
256                 visited = true;
257             }
258         }
259         return visited;
260     }
261 }