001package org.apache.maven.doxia.module.markdown;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import org.apache.commons.lang.StringEscapeUtils;
023import org.apache.commons.lang.StringUtils;
024import org.apache.maven.doxia.markup.HtmlMarkup;
025import org.apache.maven.doxia.module.xhtml.XhtmlParser;
026import org.apache.maven.doxia.parser.AbstractParser;
027import org.apache.maven.doxia.parser.ParseException;
028import org.apache.maven.doxia.parser.Parser;
029import org.apache.maven.doxia.sink.Sink;
030import org.codehaus.plexus.component.annotations.Component;
031import org.codehaus.plexus.component.annotations.Requirement;
032import org.codehaus.plexus.util.IOUtil;
033import org.codehaus.plexus.util.xml.pull.XmlPullParser;
034import org.pegdown.Extensions;
035import org.pegdown.PegDownProcessor;
036import org.pegdown.ast.HeaderNode;
037import org.pegdown.ast.HtmlBlockNode;
038import org.pegdown.ast.Node;
039import org.pegdown.ast.RootNode;
040import org.pegdown.ast.SuperNode;
041import org.pegdown.ast.TextNode;
042
043import java.io.IOException;
044import java.io.Reader;
045import java.io.StringReader;
046import java.util.regex.Matcher;
047import java.util.regex.Pattern;
048
049/**
050 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
051 * <p/>
052 * Defers effective parsing to the <a href="http://pegdown.org">PegDown library</a>, which generates HTML content
053 * then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
054 *
055 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
056 * @since 1.3
057 * @see MarkdownToDoxiaHtmlSerializer
058 */
059@Component( role = Parser.class, hint = "markdown" )
060public class MarkdownParser
061    extends AbstractParser
062{
063
064    /**
065     * The role hint for the {@link MarkdownParser} Plexus component.
066     */
067    public static final String ROLE_HINT = "markdown";
068
069    /**
070     * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
071     */
072    protected static final PegDownProcessor PEGDOWN_PROCESSOR =
073        new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );
074
075    /**
076     * Regex that identifies a multimarkdown-style metadata section at the start of the document
077     */
078    private static final String MULTI_MARKDOWN_METADATA_SECTION =
079        "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
080
081    /**
082     * Regex that captures the key and value of a multimarkdown-style metadata entry.
083     */
084    private static final String MULTI_MARKDOWN_METADATA_ENTRY =
085        "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
086
087    /**
088     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
089     * first key in the metadata section must be one of these standard keys or else the entire metadata section is
090     * ignored.
091     */
092    private static final String[] STANDARD_METADATA_KEYS =
093        { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
094            "subtitle" };
095
096    public int getType()
097    {
098        return TXT_TYPE;
099    }
100
101    @Requirement
102    private PegDownHtmlParser parser;
103
104    public void parse( Reader source, Sink sink )
105        throws ParseException
106    {
107        try
108        {
109            // Markdown to HTML (using Pegdown library)
110            String html = toHtml( source );
111            // then HTML to Sink API
112            parser.parse( new StringReader( html ), sink );
113        }
114        catch ( IOException e )
115        {
116            throw new ParseException( "Failed reading Markdown source document", e );
117        }
118    }
119
120    /**
121     * uses PegDown library to parse content and generate HTML output.
122     * 
123     * @param source the Markdown source
124     * @return HTML content generated by PegDown 
125     * @throws IOException
126     * @see MarkdownToDoxiaHtmlSerializer
127     */
128    private String toHtml( Reader source )
129        throws IOException
130    {
131        String text = IOUtil.toString( source );
132        StringBuilder html = new StringBuilder( text.length() * 2 );
133        html.append( "<html>" );
134        html.append( "<head>" );
135        Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
136        Matcher metadataMatcher = metadataPattern.matcher( text );
137        boolean haveTitle = false;
138        if ( metadataMatcher.find() )
139        {
140            metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
141            Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
142            boolean first = true;
143            while ( lineMatcher.find() )
144            {
145                String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
146                if ( first )
147                {
148                    boolean found = false;
149                    for ( String k : STANDARD_METADATA_KEYS )
150                    {
151                        if ( k.equalsIgnoreCase( key ) )
152                        {
153                            found = true;
154                            break;
155                        }
156                    }
157                    if ( !found )
158                    {
159                        break;
160                    }
161                    first = false;
162                }
163                String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
164                if ( "title".equalsIgnoreCase( key ) )
165                {
166                    haveTitle = true;
167                    html.append( "<title>" );
168                    html.append( StringEscapeUtils.escapeXml( value ) );
169                    html.append( "</title>" );
170                }
171                else if ( "author".equalsIgnoreCase( key ) )
172                {
173                    html.append( "<meta name=\'author\' content=\'" );
174                    html.append( StringEscapeUtils.escapeXml( value ) );
175                    html.append( "\' />" );
176                }
177                else if ( "date".equalsIgnoreCase( key ) )
178                {
179                    html.append( "<meta name=\'date\' content=\'" );
180                    html.append( StringEscapeUtils.escapeXml( value ) );
181                    html.append( "\' />" );
182                }
183                else
184                {
185                    html.append( "<meta name=\'" );
186                    html.append( StringEscapeUtils.escapeXml( key ) );
187                    html.append( "\' content=\'" );
188                    html.append( StringEscapeUtils.escapeXml( value ) );
189                    html.append( "\' />" );
190                }
191            }
192            if ( !first )
193            {
194                text = text.substring( metadataMatcher.end() );
195            }
196        }
197        RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
198        if ( !haveTitle && rootNode.getChildren().size() > 0 )
199        {
200            // use the first (non-comment) node only if it is a heading
201            int i = 0;
202            Node firstNode = null;
203            while ( i < rootNode.getChildren().size() && isHtmlComment(
204                ( firstNode = rootNode.getChildren().get( i ) ) ) )
205            {
206                i++;
207            }
208            if ( firstNode instanceof HeaderNode )
209            {
210                html.append( "<title>" );
211                html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
212                html.append( "</title>" );
213            }
214        }
215        html.append( "</head>" );
216        html.append( "<body>" );
217        html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
218        html.append( "</body>" );
219        html.append( "</html>" );
220
221        return html.toString();
222    }
223
224    public static boolean isHtmlComment( Node node )
225    {
226        if ( node instanceof HtmlBlockNode )
227        {
228            HtmlBlockNode blockNode = (HtmlBlockNode) node;
229            return blockNode.getText().startsWith( "<!--" );
230        }
231        return false;
232    }
233
234    public static String nodeText( Node node )
235    {
236        StringBuilder builder = new StringBuilder();
237        if ( node instanceof TextNode )
238        {
239            builder.append( TextNode.class.cast( node ).getText() );
240        }
241        else
242        {
243            for ( Node n : node.getChildren() )
244            {
245                if ( n instanceof TextNode )
246                {
247                    builder.append( TextNode.class.cast( n ).getText() );
248                }
249                else if ( n instanceof SuperNode )
250                {
251                    builder.append( nodeText( n ) );
252                }
253            }
254        }
255        return builder.toString();
256    }
257
258    /**
259     * Internal parser for HTML generated by PegDown library.
260     */
261    @Component( role = PegDownHtmlParser.class )
262    public static class PegDownHtmlParser
263        extends XhtmlParser
264    {
265        public PegDownHtmlParser()
266        {
267            super();
268        }
269
270        @Override
271        protected boolean baseEndTag( XmlPullParser parser, Sink sink )
272        {
273            boolean visited = super.baseEndTag( parser, sink );
274            if ( !visited )
275            {
276                if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
277                {
278                    handleUnknown( parser, sink, TAG_TYPE_END );
279                    visited = true;
280                }
281            }
282            return visited;
283        }
284    
285        @Override
286        protected boolean baseStartTag( XmlPullParser parser, Sink sink )
287        {
288            boolean visited = super.baseStartTag( parser, sink );
289            if ( !visited )
290            {
291                if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
292                {
293                    handleUnknown( parser, sink, TAG_TYPE_START );
294                    visited = true;
295                }
296            }
297            return visited;
298        }
299    }
300}