001package org.apache.maven.doxia.module.markdown;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import org.apache.commons.lang.StringEscapeUtils;
023import org.apache.commons.lang.StringUtils;
024import org.apache.maven.doxia.macro.MacroExecutionException;
025import org.apache.maven.doxia.markup.HtmlMarkup;
026import org.apache.maven.doxia.module.xhtml.XhtmlParser;
027import org.apache.maven.doxia.parser.ParseException;
028import org.apache.maven.doxia.parser.Parser;
029import org.apache.maven.doxia.sink.Sink;
030import org.codehaus.plexus.component.annotations.Component;
031import org.codehaus.plexus.util.IOUtil;
032import org.codehaus.plexus.util.xml.pull.XmlPullParser;
033import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
034import org.pegdown.Extensions;
035import org.pegdown.PegDownProcessor;
036import org.pegdown.ast.HeaderNode;
037import org.pegdown.ast.HtmlBlockNode;
038import org.pegdown.ast.Node;
039import org.pegdown.ast.RootNode;
040import org.pegdown.ast.SuperNode;
041import org.pegdown.ast.TextNode;
042
043import java.io.IOException;
044import java.io.Reader;
045import java.io.StringReader;
046import java.util.regex.Matcher;
047import java.util.regex.Pattern;
048
049/**
050 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
051 * <p/>
052 * Defers parsing to the <a href="http://pegdown.org">PegDown library</a>.
053 *
054 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
055 * @since 1.3
056 */
057@Component( role = Parser.class, hint = "markdown" )
058public class MarkdownParser
059    extends XhtmlParser
060{
061
062    /**
063     * The role hint for the {@link MarkdownParser} Plexus component.
064     */
065    public static final String ROLE_HINT = "markdown";
066
067    /**
068     * The {@link PegDownProcessor} used to convert Pegdown documents to HTML.
069     */
070    protected static final PegDownProcessor PEGDOWN_PROCESSOR =
071        new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );
072
073    /**
074     * Regex that identifies a multimarkdown-style metadata section at the start of the document
075     */
076    private static final String MULTI_MARKDOWN_METADATA_SECTION =
077        "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
078
079    /**
080     * Regex that captures the key and value of a multimarkdown-style metadata entry.
081     */
082    private static final String MULTI_MARKDOWN_METADATA_ENTRY =
083        "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
084
085    /**
086     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
087     * first key in the metadata section must be one of these standard keys or else the entire metadata section is
088     * ignored.
089     */
090    private static final String[] STANDARD_METADATA_KEYS =
091        { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
092            "subtitle" };
093
094
095    /**
096     * {@inheritDoc}
097     */
098    @Override
099    public void parse( Reader source, Sink sink )
100        throws ParseException
101    {
102        try
103        {
104            String text = IOUtil.toString( source );
105            StringBuilder html = new StringBuilder( text.length() * 2 );
106            html.append( "<html>" );
107            html.append( "<head>" );
108            Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
109            Matcher metadataMatcher = metadataPattern.matcher( text );
110            boolean haveTitle = false;
111            if ( metadataMatcher.find() )
112            {
113                metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
114                Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
115                boolean first = true;
116                while ( lineMatcher.find() )
117                {
118                    String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
119                    if ( first )
120                    {
121                        boolean found = false;
122                        for ( String k : STANDARD_METADATA_KEYS )
123                        {
124                            if ( k.equalsIgnoreCase( key ) )
125                            {
126                                found = true;
127                                break;
128                            }
129                        }
130                        if ( !found )
131                        {
132                            break;
133                        }
134                        first = false;
135                    }
136                    String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
137                    if ( "title".equalsIgnoreCase( key ) )
138                    {
139                        haveTitle = true;
140                        html.append( "<title>" );
141                        html.append( StringEscapeUtils.escapeXml( value ) );
142                        html.append( "</title>" );
143                    }
144                    else if ( "author".equalsIgnoreCase( key ) )
145                    {
146                        html.append( "<meta name=\'author\' content=\'" );
147                        html.append( StringEscapeUtils.escapeXml( value ) );
148                        html.append( "\' />" );
149                    }
150                    else if ( "date".equalsIgnoreCase( key ) )
151                    {
152                        html.append( "<meta name=\'date\' content=\'" );
153                        html.append( StringEscapeUtils.escapeXml( value ) );
154                        html.append( "\' />" );
155                    }
156                    else
157                    {
158                        html.append( "<meta name=\'" );
159                        html.append( StringEscapeUtils.escapeXml( key ) );
160                        html.append( "\' content=\'" );
161                        html.append( StringEscapeUtils.escapeXml( value ) );
162                        html.append( "\' />" );
163                    }
164                }
165                if ( !first )
166                {
167                    text = text.substring( metadataMatcher.end() );
168                }
169            }
170            RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
171            if ( !haveTitle && rootNode.getChildren().size() > 0 )
172            {
173                // use the first (non-comment) node only if it is a heading
174                int i = 0;
175                Node firstNode = null;
176                while ( i < rootNode.getChildren().size() && isHtmlComment(
177                    ( firstNode = rootNode.getChildren().get( i ) ) ) )
178                {
179                    i++;
180                }
181                if ( firstNode instanceof HeaderNode )
182                {
183                    html.append( "<title>" );
184                    html.append( StringEscapeUtils.escapeXml( nodeText( firstNode ) ) );
185                    html.append( "</title>" );
186                }
187            }
188            html.append( "</head>" );
189            html.append( "<body>" );
190            html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
191            html.append( "</body>" );
192            html.append( "</html>" );
193            super.parse( new StringReader( html.toString() ), sink );
194        }
195        catch ( IOException e )
196        {
197            throw new ParseException( "Failed reading Markdown source document", e );
198        }
199    }
200
201    public static boolean isHtmlComment( Node node ) {
202        if (node instanceof HtmlBlockNode) {
203            HtmlBlockNode blockNode = (HtmlBlockNode) node;
204            return blockNode.getText().startsWith( "<!--" );
205        }
206        return false;
207    }
208
209    public static String nodeText( Node node )
210    {
211        StringBuilder builder = new StringBuilder();
212        if ( node instanceof TextNode )
213        {
214            builder.append( TextNode.class.cast( node ).getText() );
215        }
216        else
217        {
218            for ( Node n : node.getChildren() )
219            {
220                if ( n instanceof TextNode )
221                {
222                    builder.append( TextNode.class.cast( n ).getText() );
223                }
224                else if ( n instanceof SuperNode )
225                {
226                    builder.append( nodeText( n ) );
227                }
228            }
229        }
230        return builder.toString();
231    }
232
233    @Override
234    protected boolean baseEndTag( XmlPullParser parser, Sink sink )
235    {
236        boolean visited = super.baseEndTag( parser, sink );
237        if ( !visited )
238        {
239            if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
240            {
241                handleUnknown( parser, sink, TAG_TYPE_END );
242                visited = true;
243            }
244        }
245        return visited;
246    }
247
248    @Override
249    protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
250        boolean visited = super.baseStartTag( parser, sink );
251        if ( !visited )
252        {
253            if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
254            {
255                handleUnknown( parser, sink, TAG_TYPE_START );
256                visited = true;
257            }
258        }
259        return visited;
260    }
261}