View Javadoc

1   package org.apache.maven.doxia.module.markdown;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.apache.commons.lang.StringEscapeUtils;
23  import org.apache.commons.lang.StringUtils;
24  import org.apache.maven.doxia.module.xhtml.XhtmlParser;
25  import org.apache.maven.doxia.parser.ParseException;
26  import org.apache.maven.doxia.parser.Parser;
27  import org.apache.maven.doxia.sink.Sink;
28  import org.codehaus.plexus.component.annotations.Component;
29  import org.codehaus.plexus.util.IOUtil;
30  import org.pegdown.Extensions;
31  import org.pegdown.PegDownProcessor;
32  import org.pegdown.ast.HeaderNode;
33  import org.pegdown.ast.HtmlBlockNode;
34  import org.pegdown.ast.Node;
35  import org.pegdown.ast.RootNode;
36  import org.pegdown.ast.SuperNode;
37  import org.pegdown.ast.TextNode;
38  
39  import java.io.IOException;
40  import java.io.Reader;
41  import java.io.StringReader;
42  import java.util.regex.Matcher;
43  import java.util.regex.Pattern;
44  
45  /**
46   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
47   * <p/>
48   * Defers parsing to the <a href="http://pegdown.org">PegDown library</a>.
49   *
50   * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
51   * @since 1.3
52   */
53  @Component( role = Parser.class, hint = "markdown" )
54  public class MarkdownParser
55      extends XhtmlParser
56  {
57  
58      /**
59       * The role hint for the {@link MarkdownParser} Plexus component.
60       */
61      public static final String ROLE_HINT = "markdown";
62  
63      /**
64       * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
65       */
66      protected static final PegDownProcessor PEGDOWN_PROCESSOR =
67          new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );
68  
69      /**
70       * Regex that identifies a multimarkdown-style metadata section at the start of the document
71       */
72      private static final String MULTI_MARKDOWN_METADATA_SECTION =
73          "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\s[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
74  
75      /**
76       * Regex that captures the key and value of a multimarkdown-style metadata entry.
77       */
78      private static final String MULTI_MARKDOWN_METADATA_ENTRY = "([^\\s:][^:]*):(.*(?:\r?\n\\s[^\\s].*)*)\r?\n";
79  
80      /**
81       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
82       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
83       * ignored.
84       */
85      private static final String[] STANDARD_METADATA_KEYS =
86          { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
87              "subtitle" };
88  
89  
90      /**
91       * {@inheritDoc}
92       */
93      @Override
94      public void parse( Reader source, Sink sink )
95          throws ParseException
96      {
97          try
98          {
99              String text = IOUtil.toString( source );
100             StringBuilder html = new StringBuilder( text.length() * 2 );
101             html.append( "<html>" );
102             html.append( "<head>" );
103             Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
104             Matcher metadataMatcher = metadataPattern.matcher( text );
105             boolean haveTitle = false;
106             if ( metadataMatcher.find() )
107             {
108                 metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
109                 Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
110                 boolean first = true;
111                 while ( lineMatcher.find() )
112                 {
113                     String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
114                     if ( first )
115                     {
116                         boolean found = false;
117                         for ( String k : STANDARD_METADATA_KEYS )
118                         {
119                             if ( k.equalsIgnoreCase( key ) )
120                             {
121                                 found = true;
122                                 break;
123                             }
124                         }
125                         if ( !found )
126                         {
127                             break;
128                         }
129                         first = false;
130                     }
131                     String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
132                     if ( "title".equalsIgnoreCase( key ) )
133                     {
134                         haveTitle = true;
135                         html.append( "<title>" );
136                         html.append( StringEscapeUtils.escapeXml( value ) );
137                         html.append( "</title>" );
138                     }
139                     else if ( "author".equalsIgnoreCase( key ) )
140                     {
141                         html.append( "<meta name=\'author\' content=\'" );
142                         html.append( StringEscapeUtils.escapeXml( value ) );
143                         html.append( "\' />" );
144                     }
145                     else if ( "date".equalsIgnoreCase( key ) )
146                     {
147                         html.append( "<meta name=\'date\' content=\'" );
148                         html.append( StringEscapeUtils.escapeXml( value ) );
149                         html.append( "\' />" );
150                     }
151                     else
152                     {
153                         html.append( "<meta name=\'" );
154                         html.append( StringEscapeUtils.escapeXml( key ) );
155                         html.append( "\' content=\'" );
156                         html.append( StringEscapeUtils.escapeXml( value ) );
157                         html.append( "\' />" );
158                     }
159                 }
160                 if ( !first )
161                 {
162                     text = text.substring( metadataMatcher.end() );
163                 }
164             }
165             RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
166             if ( !haveTitle && rootNode.getChildren().size() > 0 )
167             {
168                 // use the first (non-comment) node only if it is a heading
169                 int i = 0;
170                 Node firstNode = null;
171                 while ( i < rootNode.getChildren().size() && isHtmlComment(
172                     ( firstNode = rootNode.getChildren().get( i ) ) ) )
173                 {
174                     i++;
175                 }
176                 if ( firstNode instanceof HeaderNode )
177                 {
178                     html.append( "<title>" );
179                     html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
180                     html.append( "</title>" );
181                 }
182             }
183             html.append( "</head>" );
184             html.append( "<body>" );
185             html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
186             html.append( "</body>" );
187             html.append( "</html>" );
188             super.parse( new StringReader( html.toString() ), sink );
189         }
190         catch ( IOException e )
191         {
192             throw new ParseException( "Failed reading Markdown source document", e );
193         }
194     }
195 
196     public static boolean isHtmlComment( Node node ) {
197         if (node instanceof HtmlBlockNode) {
198             HtmlBlockNode blockNode = (HtmlBlockNode) node;
199             return blockNode.getText().startsWith( "<!--" );
200         }
201         return false;
202     }
203 
204     public static String nodeText( Node node )
205     {
206         StringBuilder builder = new StringBuilder();
207         if ( node instanceof TextNode )
208         {
209             builder.append( TextNode.class.cast( node ).getText() );
210         }
211         else
212         {
213             for ( Node n : node.getChildren() )
214             {
215                 if ( n instanceof TextNode )
216                 {
217                     builder.append( TextNode.class.cast( n ).getText() );
218                 }
219                 else if ( n instanceof SuperNode )
220                 {
221                     builder.append( nodeText( n ) );
222                 }
223             }
224         }
225         return builder.toString();
226     }
227 
228 }