MarkdownParser

package org.apache.maven.doxia.module.markdown;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.module.xhtml.XhtmlParser;
import org.apache.maven.doxia.parser.AbstractParser;
import org.apache.maven.doxia.parser.ParseException;
import org.apache.maven.doxia.parser.Parser;
import org.apache.maven.doxia.sink.Sink;
import org.codehaus.plexus.component.annotations.Component;
import org.codehaus.plexus.component.annotations.Requirement;
import org.codehaus.plexus.util.IOUtil;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.pegdown.Extensions;
import org.pegdown.PegDownProcessor;
import org.pegdown.ast.HeaderNode;
import org.pegdown.ast.HtmlBlockNode;
import org.pegdown.ast.Node;
import org.pegdown.ast.RootNode;
import org.pegdown.ast.SuperNode;
import org.pegdown.ast.TextNode;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
 * <p>
 * Defers effective parsing to the <a href="http://pegdown.org">PegDown library</a>, which generates HTML content
 * then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
 * <p>
 * An optional multimarkdown-style metadata section at the start of the document is converted to
 * <code>&lt;title&gt;</code> / <code>&lt;meta&gt;</code> elements of the generated HTML head. When no title
 * metadata is present, the first non-comment node of the document is used as title if it is a heading.
 *
 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
 * @since 1.3
 * @see MarkdownToDoxiaHtmlSerializer
 */
@Component( role = Parser.class, hint = "markdown" )
public class MarkdownParser
    extends AbstractParser
{

    /**
     * The role hint for the {@link MarkdownParser} Plexus component.
     */
    public static final String ROLE_HINT = "markdown";

    /**
     * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
     * <p>
     * Configured with every extension except {@link Extensions#HARDWRAPS} and with <code>Long.MAX_VALUE</code>
     * as second constructor argument (maximum parsing time).
     * <p>
     * NOTE(review): this single instance is shared by every parser instance; confirm against the PegDown
     * documentation whether a processor may be used from several threads concurrently.
     */
    protected static final PegDownProcessor PEGDOWN_PROCESSOR =
        new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );

    /**
     * Regex that identifies a multimarkdown-style metadata section at the start of the document
     */
    private static final String MULTI_MARKDOWN_METADATA_SECTION =
        "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";

    /**
     * Regex that captures the key and value of a multimarkdown-style metadata entry.
     */
    private static final String MULTI_MARKDOWN_METADATA_ENTRY =
        "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";

    /**
     * Compiled form of {@link #MULTI_MARKDOWN_METADATA_SECTION}, built once instead of on every parse.
     */
    private static final Pattern METADATA_SECTION_PATTERN =
        Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );

    /**
     * Compiled form of {@link #MULTI_MARKDOWN_METADATA_ENTRY}, built once instead of on every parse.
     */
    private static final Pattern METADATA_ENTRY_PATTERN =
        Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );

    /**
     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
     * first key in the metadata section must be one of these standard keys or else the entire metadata section is
     * ignored.
     */
    private static final String[] STANDARD_METADATA_KEYS =
        { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
            "subtitle" };

    /**
     * The parser used to feed the HTML generated by PegDown into the Sink API.
     */
    @Requirement
    private PegDownHtmlParser parser;

    /**
     * Returns the type of this parser.
     *
     * @return always <code>TXT_TYPE</code>
     */
    public int getType()
    {
        return TXT_TYPE;
    }

    /**
     * Parses a Markdown document: the source is first converted to HTML, then that HTML is parsed into the
     * given sink by the internal {@link PegDownHtmlParser}.
     *
     * @param source the Markdown source; it is read completely
     * @param sink the sink receiving the parsing events
     * @throws ParseException if reading the source fails, or if the delegate parser fails
     */
    public void parse( Reader source, Sink sink )
        throws ParseException
    {
        try
        {
            // Markdown to HTML (using Pegdown library)
            String html = toHtml( source );
            // then HTML to Sink API
            parser.parse( new StringReader( html ), sink );
        }
        catch ( IOException e )
        {
            throw new ParseException( "Failed reading Markdown source document", e );
        }
    }

    /**
     * uses PegDown library to parse content and generate HTML output.
     * <p>
     * The result is a complete <code>&lt;html&gt;</code> document: the head carries the title and meta elements
     * derived from the metadata section (or from the first heading), the body carries the serialized Markdown.
     *
     * @param source the Markdown source
     * @return HTML content generated by PegDown
     * @throws IOException if the source cannot be read
     * @see MarkdownToDoxiaHtmlSerializer
     */
    private String toHtml( Reader source )
        throws IOException
    {
        String text = IOUtil.toString( source );
        StringBuilder html = new StringBuilder( text.length() * 2 );
        html.append( "<html>" );
        html.append( "<head>" );

        boolean haveTitle = false;
        Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher( text );
        // In MULTILINE mode '^' matches at the start of any line, so find() may locate a "section" in the middle
        // of the document. Only accept it when nothing but whitespace precedes it: otherwise the substring()
        // below would silently throw away all the content located before the match.
        if ( metadataMatcher.find() && StringUtils.isBlank( text.substring( 0, metadataMatcher.start() ) ) )
        {
            Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher( metadataMatcher.group( 1 ) );
            boolean sectionAccepted = false;
            while ( entryMatcher.find() )
            {
                String key = StringUtils.trimToEmpty( entryMatcher.group( 1 ) );
                if ( !sectionAccepted )
                {
                    // the first key decides whether the whole section is metadata or ordinary text
                    if ( !isStandardMetadataKey( key ) )
                    {
                        break;
                    }
                    sectionAccepted = true;
                }
                String value = StringUtils.trimToEmpty( entryMatcher.group( 2 ) );
                if ( "title".equalsIgnoreCase( key ) )
                {
                    haveTitle = true;
                    appendTitle( html, value );
                }
                else if ( "author".equalsIgnoreCase( key ) )
                {
                    // well-known keys are emitted with a normalized lower-case name
                    appendMeta( html, "author", value );
                }
                else if ( "date".equalsIgnoreCase( key ) )
                {
                    appendMeta( html, "date", value );
                }
                else
                {
                    appendMeta( html, key, value );
                }
            }
            if ( sectionAccepted )
            {
                // the metadata section must not be rendered as part of the document body
                text = text.substring( metadataMatcher.end() );
            }
        }

        RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
        if ( !haveTitle )
        {
            // use the first (non-comment) node only if it is a heading
            Node firstNode = firstNonCommentNode( rootNode );
            if ( firstNode instanceof HeaderNode )
            {
                appendTitle( html, nodeText( firstNode ) );
            }
        }
        html.append( "</head>" );
        html.append( "<body>" );
        html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
        html.append( "</body>" );
        html.append( "</html>" );

        return html.toString();
    }

    /**
     * Tells whether the given key is one of the {@link #STANDARD_METADATA_KEYS}, ignoring case.
     *
     * @param key the trimmed metadata key
     * @return <code>true</code> if the key is a standard metadata key
     */
    private static boolean isStandardMetadataKey( String key )
    {
        for ( String standardKey : STANDARD_METADATA_KEYS )
        {
            if ( standardKey.equalsIgnoreCase( key ) )
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends a <code>&lt;title&gt;</code> element with the XML-escaped title.
     *
     * @param html the buffer to append to
     * @param title the raw (unescaped) title
     */
    private static void appendTitle( StringBuilder html, String title )
    {
        html.append( "<title>" );
        html.append( StringEscapeUtils.escapeXml( title ) );
        html.append( "</title>" );
    }

    /**
     * Appends a <code>&lt;meta name='...' content='...' /&gt;</code> element, XML-escaping name and content.
     *
     * @param html the buffer to append to
     * @param name the raw (unescaped) meta name
     * @param content the raw (unescaped) meta content
     */
    private static void appendMeta( StringBuilder html, String name, String content )
    {
        html.append( "<meta name='" );
        html.append( StringEscapeUtils.escapeXml( name ) );
        html.append( "' content='" );
        html.append( StringEscapeUtils.escapeXml( content ) );
        html.append( "' />" );
    }

    /**
     * Finds the first top-level node of the document that is not an HTML comment.
     *
     * @param rootNode the root of the parsed Markdown document
     * @return the first child that is not an HTML comment, or <code>null</code> if there is none
     */
    private static Node firstNonCommentNode( RootNode rootNode )
    {
        for ( Node child : rootNode.getChildren() )
        {
            if ( !isHtmlComment( child ) )
            {
                return child;
            }
        }
        return null;
    }

    /**
     * Tells whether a node is an HTML block whose text starts with <code>&lt;!--</code>.
     *
     * @param node the node to check
     * @return <code>true</code> if the node is an {@link HtmlBlockNode} starting with an HTML comment opening
     */
    public static boolean isHtmlComment( Node node )
    {
        return node instanceof HtmlBlockNode && ( (HtmlBlockNode) node ).getText().startsWith( "<!--" );
    }

    /**
     * Collects the text of a node: the node's own text if it is a {@link TextNode}, otherwise the concatenated
     * text of its {@link TextNode} children and (recursively) of its {@link SuperNode} children. Children of any
     * other type are ignored.
     *
     * @param node the node to extract text from
     * @return the collected text, possibly empty
     */
    public static String nodeText( Node node )
    {
        if ( node instanceof TextNode )
        {
            return ( (TextNode) node ).getText();
        }

        StringBuilder builder = new StringBuilder();
        for ( Node child : node.getChildren() )
        {
            if ( child instanceof TextNode || child instanceof SuperNode )
            {
                builder.append( nodeText( child ) );
            }
        }
        return builder.toString();
    }

    /**
     * Internal parser for HTML generated by PegDown library.
     * <p>
     * Behaves like {@link XhtmlParser}, except that <code>div</code> start and end tags not handled by the
     * parent parser are passed to <code>handleUnknown</code> and reported as visited.
     */
    @Component( role = PegDownHtmlParser.class )
    public static class PegDownHtmlParser
        extends XhtmlParser
    {
        /**
         * Default constructor.
         */
        public PegDownHtmlParser()
        {
            super();
        }

        /**
         * Handles an end tag, additionally accepting <code>div</code> when the parent parser did not.
         *
         * @param parser the pull parser positioned on the end tag
         * @param sink the sink receiving the events
         * @return <code>true</code> if the tag was handled
         */
        @Override
        protected boolean baseEndTag( XmlPullParser parser, Sink sink )
        {
            boolean visited = super.baseEndTag( parser, sink );
            if ( !visited && HtmlMarkup.DIV.toString().equals( parser.getName() ) )
            {
                handleUnknown( parser, sink, TAG_TYPE_END );
                visited = true;
            }
            return visited;
        }

        /**
         * Handles a start tag, additionally accepting <code>div</code> when the parent parser did not.
         *
         * @param parser the pull parser positioned on the start tag
         * @param sink the sink receiving the events
         * @return <code>true</code> if the tag was handled
         */
        @Override
        protected boolean baseStartTag( XmlPullParser parser, Sink sink )
        {
            boolean visited = super.baseStartTag( parser, sink );
            if ( !visited && HtmlMarkup.DIV.toString().equals( parser.getName() ) )
            {
                handleUnknown( parser, sink, TAG_TYPE_START );
                visited = true;
            }
            return visited;
        }
    }
}