View Javadoc
1   package org.apache.maven.doxia.parser;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.Reader;
23  import java.util.Stack;
24  
25  import javax.swing.text.html.HTML.Attribute;
26  
27  import org.apache.maven.doxia.macro.MacroExecutionException;
28  import org.apache.maven.doxia.markup.HtmlMarkup;
29  import org.apache.maven.doxia.sink.Sink;
30  import org.apache.maven.doxia.sink.SinkEventAttributes;
31  import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
32  import org.apache.maven.doxia.util.DoxiaUtils;
33  
34  import org.codehaus.plexus.util.StringUtils;
35  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
36  import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
37  import org.slf4j.Logger;
38  import org.slf4j.LoggerFactory;
39  
40  /**
41   * Common base parser for xhtml5 events.
42   */
43  public class Xhtml5BaseParser
44      extends AbstractXmlParser
45          implements HtmlMarkup
46  {
47      private static final Logger LOGGER = LoggerFactory.getLogger( Xhtml5BaseParser.class );
48  
49      /**
50       * True if a <script></script> or <style></style> block is read. CDATA sections within are
51       * handled as rawText.
52       */
53      private boolean scriptBlock;
54  
55      /** Used to distinguish <a href=""> from <a name="">. */
56      private boolean isLink;
57  
58      /** Used to distinguish <a href=""> from <a name="">. */
59      private boolean isAnchor;
60  
61      /** Used for nested lists. */
62      private int orderedListDepth = 0;
63  
64      /** Counts section level. */
65      private int sectionLevel;
66  
67      /** Counts heading level. */
68      private int headingLevel;
69  
70      /** Verbatim flag, true whenever we are inside a <pre> tag. */
71      private boolean inVerbatim;
72  
73      /** Used to keep track of closing tags for content events */
74      private Stack<String> divStack = new Stack<>();
75  
76      /** Used to wrap the definedTerm with its definition, even when one is omitted */
77      boolean hasDefinitionListItem = false;
78  
79      /** {@inheritDoc} */
80      @Override
81      public void parse( Reader source, Sink sink, String reference )
82          throws ParseException
83      {
84          init();
85  
86          try
87          {
88              super.parse( source, sink, reference );
89          }
90          finally
91          {
92              setSecondParsing( false );
93              init();
94          }
95      }
96  
97      /**
98       * {@inheritDoc}
99       *
100      * Adds all XHTML (HTML 5.2) entities to the parser so that they can be recognized and resolved
101      * without additional DTD.
102      */
103     @Override
104     protected void initXmlParser( XmlPullParser parser )
105         throws XmlPullParserException
106     {
107         super.initXmlParser( parser );
108     }
109 
110     /**
111      * <p>
112      *   Goes through a common list of possible html5 start tags. These include only tags that can go into
113      *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
114      * </p>
115      * <p>
116      *   The currently handled tags are:
117      * </p>
118      * <p>
119      *   <code>
120      *      &lt;article&gt;, &lt;nav&gt;, &lt;aside&gt;, &lt;section&gt;, &lt;h2&gt;, &lt;h3&gt;, &lt;h4&gt;,
121      *      &lt;h5&gt;, &lt;h6&gt;, &lt;header&gt;, &lt;main&gt;, &lt;footer&gt;, &lt;em&gt;, &lt;strong&gt;,
122      *      &lt;small&gt;, &lt;s&gt;, &lt;cite&gt;, &lt;q&gt;, &lt;dfn&gt;, &lt;abbr&gt;, &lt;i&gt;,
123      *      &lt;b&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;kbd&gt;, &lt;sub&gt;, &lt;sup&gt;, &lt;u&gt;,
124      *      &lt;mark&gt;, &lt;ruby&gt;, &lt;rb&gt;, &lt;rt&gt;, &lt;rtc&gt;, &lt;rp&gt;, &lt;bdi&gt;,
125      *      &lt;bdo&gt;, &lt;span&gt;, &lt;ins&gt;, &lt;del&gt;, &lt;p&gt;, &lt;pre&gt;, &lt;ul&gt;,
126      *      &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;a&gt;, &lt;table&gt;,
127      *      &lt;tr&gt;, &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;wbr/&gt;, &lt;hr/&gt;,
128      *      &lt;img/&gt;.
129      *   </code>
130      * </p>
131      *
132      * @param parser A parser.
133      * @param sink the sink to receive the events.
134      * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
135      */
136     protected boolean baseStartTag( XmlPullParser parser, Sink sink )
137     {
138         boolean visited = true;
139 
140         SinkEventAttributeSet attribs = getAttributesFromParser( parser );
141 
142         if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) )
143         {
144             sink.article( attribs );
145         }
146         else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) )
147         {
148             sink.navigation( attribs );
149         }
150         else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) )
151         {
152             sink.sidebar( attribs );
153         }
154         else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) )
155         {
156             handleSectionStart( sink, attribs );
157         }
158         else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
159         {
160             handleHeadingStart( sink, Sink.SECTION_LEVEL_1, attribs );
161         }
162         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
163         {
164             handleHeadingStart( sink, Sink.SECTION_LEVEL_2, attribs );
165         }
166         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
167         {
168             handleHeadingStart( sink, Sink.SECTION_LEVEL_3, attribs );
169         }
170         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
171         {
172             handleHeadingStart( sink, Sink.SECTION_LEVEL_4, attribs );
173         }
174         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
175         {
176             handleHeadingStart( sink, Sink.SECTION_LEVEL_5, attribs );
177         }
178         else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) )
179         {
180             sink.header( attribs );
181         }
182         else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) )
183         {
184             sink.content( attribs );
185         }
186         else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) )
187         {
188             sink.footer( attribs );
189         }
190         else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) )
191         {
192             attribs.addAttributes( SinkEventAttributeSet.Semantics.EMPHASIS );
193             sink.inline( attribs );
194         }
195         else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) )
196         {
197             attribs.addAttributes( SinkEventAttributeSet.Semantics.STRONG );
198             sink.inline( attribs );
199         }
200         else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) )
201         {
202             attribs.addAttributes( SinkEventAttributeSet.Semantics.SMALL );
203             sink.inline( attribs );
204         }
205         else if ( parser.getName().equals( HtmlMarkup.S.toString() ) )
206         {
207             attribs.addAttributes( SinkEventAttributeSet.Semantics.LINE_THROUGH );
208             sink.inline( attribs );
209             /* deprecated line-through support */
210         }
211         else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) )
212         {
213             attribs.addAttributes( SinkEventAttributeSet.Semantics.CITATION );
214             sink.inline( attribs );
215         }
216         else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) )
217         {
218             attribs.addAttributes( SinkEventAttributeSet.Semantics.QUOTE );
219             sink.inline( attribs );
220         }
221         else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) )
222         {
223             attribs.addAttributes( SinkEventAttributeSet.Semantics.DEFINITION );
224             sink.inline( attribs );
225         }
226         else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) )
227         {
228             attribs.addAttributes( SinkEventAttributeSet.Semantics.ABBREVIATION );
229             sink.inline( attribs );
230         }
231         else if ( parser.getName().equals( HtmlMarkup.I.toString() ) )
232         {
233             attribs.addAttributes( SinkEventAttributeSet.Semantics.ITALIC );
234             sink.inline( attribs );
235         }
236         else if ( parser.getName().equals( HtmlMarkup.B.toString() ) )
237         {
238             attribs.addAttributes( SinkEventAttributeSet.Semantics.BOLD );
239             sink.inline( attribs );
240         }
241         else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
242         {
243             attribs.addAttributes( SinkEventAttributeSet.Semantics.CODE );
244             sink.inline( attribs );
245         }
246         else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) )
247         {
248             attribs.addAttributes( SinkEventAttributeSet.Semantics.VARIABLE );
249             sink.inline( attribs );
250         }
251         else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
252         {
253             attribs.addAttributes( SinkEventAttributeSet.Semantics.SAMPLE );
254             sink.inline( attribs );
255         }
256         else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) )
257         {
258             attribs.addAttributes( SinkEventAttributeSet.Semantics.KEYBOARD );
259             sink.inline( attribs );
260         }
261         else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
262         {
263             attribs.addAttributes( SinkEventAttributeSet.Semantics.SUPERSCRIPT );
264             sink.inline( attribs );
265         }
266         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
267         {
268             attribs.addAttributes( SinkEventAttributeSet.Semantics.SUBSCRIPT );
269             sink.inline( attribs );
270         }
271         else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
272         {
273             attribs.addAttributes( SinkEventAttributeSet.Semantics.ANNOTATION );
274             sink.inline( attribs );
275         }
276         else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) )
277         {
278             attribs.addAttributes( SinkEventAttributeSet.Semantics.HIGHLIGHT );
279             sink.inline( attribs );
280         }
281         else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) )
282         {
283             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY );
284             sink.inline( attribs );
285         }
286         else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) )
287         {
288             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_BASE );
289             sink.inline( attribs );
290         }
291         else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) )
292         {
293             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT );
294             sink.inline( attribs );
295         }
296         else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) )
297         {
298             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER );
299             sink.inline( attribs );
300         }
301         else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) )
302         {
303             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_PARANTHESES );
304             sink.inline( attribs );
305         }
306         else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) )
307         {
308             attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION );
309             sink.inline( attribs );
310         }
311         else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) )
312         {
313             attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE );
314             sink.inline( attribs );
315         }
316         else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) )
317         {
318             attribs.addAttributes( SinkEventAttributeSet.Semantics.PHRASE );
319             sink.inline( attribs );
320         }
321         else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) )
322         {
323             attribs.addAttributes( SinkEventAttributeSet.Semantics.INSERT );
324             sink.inline( attribs );
325         }
326         else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) )
327         {
328             attribs.addAttributes( SinkEventAttributeSet.Semantics.DELETE );
329             sink.inline( attribs );
330         }
331         else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
332         {
333             handlePStart( sink, attribs );
334         }
335         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
336         {
337             handleDivStart( parser, attribs, sink );
338         }
339         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
340         {
341             handlePreStart( attribs, sink );
342         }
343         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
344         {
345             sink.list( attribs );
346         }
347         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
348         {
349             handleOLStart( parser, sink, attribs );
350         }
351         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
352         {
353             handleLIStart( sink, attribs );
354         }
355         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
356         {
357             sink.definitionList( attribs );
358         }
359         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
360         {
361             if ( hasDefinitionListItem )
362             {
363                 // close previous listItem
364                 sink.definitionListItem_();
365             }
366             sink.definitionListItem( attribs );
367             hasDefinitionListItem = true;
368             sink.definedTerm( attribs );
369         }
370         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
371         {
372             if ( !hasDefinitionListItem )
373             {
374                 sink.definitionListItem( attribs );
375             }
376             sink.definition( attribs );
377         }
378         else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) )
379         {
380             sink.figure( attribs );
381         }
382         else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) )
383         {
384             sink.figureCaption( attribs );
385         }
386         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
387         {
388             handleAStart( parser, sink, attribs );
389         }
390         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
391         {
392             handleTableStart( sink, attribs, parser );
393         }
394         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
395         {
396             sink.tableRow( attribs );
397         }
398         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
399         {
400             sink.tableHeaderCell( attribs );
401         }
402         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
403         {
404             sink.tableCell( attribs );
405         }
406         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
407         {
408             sink.tableCaption( attribs );
409         }
410         else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
411         {
412             sink.lineBreak( attribs );
413         }
414         else if ( parser.getName().equals( HtmlMarkup.WBR.toString() ) )
415         {
416             sink.lineBreakOpportunity( attribs );
417         }
418         else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
419         {
420             sink.horizontalRule( attribs );
421         }
422         else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
423         {
424             handleImgStart( parser, sink, attribs );
425         }
426         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
427             || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
428         {
429             handleUnknown( parser, sink, TAG_TYPE_START );
430             scriptBlock = true;
431         }
432         else
433         {
434             visited = false;
435         }
436 
437         return visited;
438     }
439 
440     /**
441      * <p>
442      *   Goes through a common list of possible html end tags.
443      *   These should be re-usable by different xhtml-based parsers.
444      *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
445      *   except for the empty elements ({@code <br/>, <hr/>, <img/>}).
446      * </p>
447      *
448      * @param parser A parser.
449      * @param sink the sink to receive the events.
450      * @return True if the event has been handled by this method, false otherwise.
451      */
452     protected boolean baseEndTag( XmlPullParser parser, Sink sink )
453     {
454         boolean visited = true;
455 
456         if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
457         {
458             sink.paragraph_();
459         }
460         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
461         {
462             handleDivEnd( sink );
463         }
464         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
465         {
466             verbatim_();
467 
468             sink.verbatim_();
469         }
470         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
471         {
472             sink.list_();
473         }
474         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
475         {
476             sink.numberedList_();
477             orderedListDepth--;
478         }
479         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
480         {
481             handleListItemEnd( sink );
482         }
483         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
484         {
485             if ( hasDefinitionListItem )
486             {
487                 sink.definitionListItem_();
488                 hasDefinitionListItem = false;
489             }
490             sink.definitionList_();
491         }
492         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
493         {
494             sink.definedTerm_();
495         }
496         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
497         {
498             sink.definition_();
499             sink.definitionListItem_();
500             hasDefinitionListItem = false;
501         }
502         else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) )
503         {
504             sink.figure_();
505         }
506         else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) )
507         {
508             sink.figureCaption_();
509         }
510         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
511         {
512             handleAEnd( sink );
513         }
514 
515         else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) )
516         {
517             sink.inline_();
518         }
519         else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) )
520         {
521             sink.inline_();
522         }
523         else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) )
524         {
525             sink.inline_();
526         }
527         else if ( parser.getName().equals( HtmlMarkup.S.toString() ) )
528         {
529             sink.inline_();
530         }
531         else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) )
532         {
533             sink.inline_();
534         }
535         else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) )
536         {
537             sink.inline_();
538         }
539         else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) )
540         {
541             sink.inline_();
542         }
543         else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) )
544         {
545             sink.inline_();
546         }
547         else if ( parser.getName().equals( HtmlMarkup.I.toString() ) )
548         {
549             sink.inline_();
550         }
551         else if ( parser.getName().equals( HtmlMarkup.B.toString() ) )
552         {
553             sink.inline_();
554         }
555         else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
556         {
557             sink.inline_();
558         }
559         else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) )
560         {
561             sink.inline_();
562         }
563         else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
564         {
565             sink.inline_();
566         }
567         else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) )
568         {
569             sink.inline_();
570         }
571         else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
572         {
573             sink.inline_();
574         }
575         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
576         {
577             sink.inline_();
578         }
579         else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
580         {
581             sink.inline_();
582         }
583         else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) )
584         {
585             sink.inline_();
586         }
587         else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) )
588         {
589             sink.inline_();
590         }
591         else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) )
592         {
593             sink.inline_();
594         }
595         else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) )
596         {
597             sink.inline_();
598         }
599         else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) )
600         {
601             sink.inline_();
602         }
603         else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) )
604         {
605             sink.inline_();
606         }
607         else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) )
608         {
609             sink.inline_();
610         }
611         else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) )
612         {
613             sink.inline_();
614         }
615         else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) )
616         {
617             sink.inline_();
618         }
619         else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) )
620         {
621             sink.inline_();
622         }
623         else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) )
624         {
625             sink.inline_();
626         }
627 
628         // ----------------------------------------------------------------------
629         // Tables
630         // ----------------------------------------------------------------------
631 
632         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
633         {
634             sink.tableRows_();
635 
636             sink.table_();
637         }
638         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
639         {
640             sink.tableRow_();
641         }
642         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
643         {
644             sink.tableHeaderCell_();
645         }
646         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
647         {
648             sink.tableCell_();
649         }
650         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
651         {
652             sink.tableCaption_();
653         }
654         else if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) )
655         {
656             sink.article_();
657         }
658         else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) )
659         {
660             sink.navigation_();
661         }
662         else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) )
663         {
664             sink.sidebar_();
665         }
666         else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) )
667         {
668             handleSectionEnd( sink );
669         }
670         else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
671         {
672             sink.sectionTitle1_();
673         }
674         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
675         {
676             sink.sectionTitle2_();
677         }
678         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
679         {
680             sink.sectionTitle3_();
681         }
682         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
683         {
684             sink.sectionTitle4_();
685         }
686         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
687         {
688             sink.sectionTitle5_();
689         }
690         else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) )
691         {
692             sink.header_();
693         }
694         else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) )
695         {
696             sink.content_();
697         }
698         else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) )
699         {
700             sink.footer_();
701         }
702         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
703             || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
704         {
705             handleUnknown( parser, sink, TAG_TYPE_END );
706 
707             scriptBlock = false;
708         }
709         else
710         {
711             visited = false;
712         }
713 
714         return visited;
715     }
716 
717     /**
718      * {@inheritDoc}
719      *
720      * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
721      * overridden by implementing parsers to include additional tags.
722      */
723     protected void handleStartTag( XmlPullParser parser, Sink sink )
724         throws XmlPullParserException, MacroExecutionException
725     {
726         if ( !baseStartTag( parser, sink ) )
727         {
728             LOGGER.warn( "Unrecognized xml tag <{}> at [{}:{}]", parser.getName(),
729                     parser.getLineNumber(), parser.getColumnNumber() );
730         }
731     }
732 
733     /**
734      * {@inheritDoc}
735      *
736      * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
737      * overridden by implementing parsers to include additional tags.
738      */
739     protected void handleEndTag( XmlPullParser parser, Sink sink )
740         throws XmlPullParserException, MacroExecutionException
741     {
742         if ( !baseEndTag( parser, sink ) )
743         {
744             // unrecognized tag is already logged in StartTag
745         }
746     }
747 
748     /** {@inheritDoc} */
749     @Override
750     protected void handleText( XmlPullParser parser, Sink sink )
751         throws XmlPullParserException
752     {
753         String text = getText( parser );
754 
755         /*
756          * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
757          * parser so any whitespace that makes it here is significant.
758          *
759          * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
760          */
761         if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
762         {
763             sink.text( text );
764         }
765     }
766 
767     /** {@inheritDoc} */
768     @Override
769     protected void handleComment( XmlPullParser parser, Sink sink )
770         throws XmlPullParserException
771     {
772         String text = getText( parser );
773 
774         if ( "PB".equals( text.trim() ) )
775         {
776             sink.pageBreak();
777         }
778         else
779         {
780             if ( isEmitComments() )
781             {
782                 sink.comment( text );
783             }
784         }
785     }
786 
787     /** {@inheritDoc} */
788     @Override
789     protected void handleCdsect( XmlPullParser parser, Sink sink )
790         throws XmlPullParserException
791     {
792         String text = getText( parser );
793 
794         if ( isScriptBlock() )
795         {
796             sink.unknown( CDATA, new Object[] { CDATA_TYPE, text }, null );
797         }
798         else
799         {
800             sink.text( text );
801         }
802     }
803 
804     /**
805      * Make sure sections are nested consecutively.
806      *
807      * <p>
808      * HTML5 heading tags H1 to H6 imply sections where they are not
809      * present, that means we have to open close any sections that
810      * are missing in between.
811      * </p>
812      *
813      * <p>
814      * For instance, if the following sequence is parsed:
815      * </p>
816      * <pre>
817      * &lt;h3&gt;&lt;/h3&gt;
818      * &lt;h6&gt;&lt;/h6&gt;
819      * </pre>
820      * <p>
821      * we have to insert two section starts before we open the <code>&lt;h6&gt;</code>.
822      * In the following sequence
823      * </p>
824      * <pre>
825      * &lt;h6&gt;&lt;/h6&gt;
826      * &lt;h3&gt;&lt;/h3&gt;
827      * </pre>
828      * <p>
829      * we have to close two sections before we open the <code>&lt;h3&gt;</code>.
830      * </p>
831      *
832      * <p>The current level is set to newLevel afterwards.</p>
833      *
834      * @param newLevel the new section level, all upper levels have to be closed.
835      * @param sink the sink to receive the events.
836      * @param attribs a {@link org.apache.maven.doxia.sink.impl.SinkEventAttributeSet} object.
837      */
838     protected void consecutiveSections( int newLevel, Sink sink, SinkEventAttributeSet attribs )
839     {
840         closeOpenSections( newLevel, sink );
841         openMissingSections( newLevel, sink );
842 
843         this.headingLevel = newLevel;
844     }
845 
846     /**
847      * Close open sections.
848      *
849      * @param newLevel the new section level, all upper levels have to be closed.
850      * @param sink the sink to receive the events.
851      */
852     private void closeOpenSections( int newLevel, Sink sink )
853     {
854         while ( this.headingLevel >= newLevel
855                 && this.sectionLevel < headingLevel )
856         {
857             if ( headingLevel == Sink.SECTION_LEVEL_5 )
858             {
859                 sink.section5_();
860             }
861             else if ( headingLevel == Sink.SECTION_LEVEL_4 )
862             {
863                 sink.section4_();
864             }
865             else if ( headingLevel == Sink.SECTION_LEVEL_3 )
866             {
867                 sink.section3_();
868             }
869             else if ( headingLevel == Sink.SECTION_LEVEL_2 )
870             {
871                 sink.section2_();
872             }
873             else if ( headingLevel == Sink.SECTION_LEVEL_1 )
874             {
875                 sink.section1_();
876             }
877 
878             this.headingLevel--;
879         }
880     }
881 
882     /**
883      * Open missing sections.
884      *
885      * @param newLevel the new section level, all lower levels have to be opened.
886      * @param sink the sink to receive the events.
887      */
888     private void openMissingSections( int newLevel, Sink sink )
889     {
890         while ( this.headingLevel < newLevel
891                 && this.sectionLevel < newLevel )
892         {
893             this.headingLevel++;
894 
895             if ( headingLevel == Sink.SECTION_LEVEL_5 )
896             {
897                 sink.section5();
898             }
899             else if ( headingLevel == Sink.SECTION_LEVEL_4 )
900             {
901                 sink.section4();
902             }
903             else if ( headingLevel == Sink.SECTION_LEVEL_3 )
904             {
905                 sink.section3();
906             }
907             else if ( headingLevel == Sink.SECTION_LEVEL_2 )
908             {
909                 sink.section2();
910             }
911             else if ( headingLevel == Sink.SECTION_LEVEL_1 )
912             {
913                 sink.section1();
914             }
915         }
916     }
917 
918     /**
919      * Return the current section level.
920      *
921      * @return the current section level.
922      */
923     protected int getSectionLevel()
924     {
925         return this.headingLevel;
926     }
927 
928     /**
929      * Set the current section level.
930      *
931      * @param newLevel the new section level.
932      */
933     protected void setSectionLevel( int newLevel )
934     {
935         this.headingLevel = newLevel;
936     }
937 
938     /**
939      * Stop verbatim mode.
940      */
941     protected void verbatim_()
942     {
943         this.inVerbatim = false;
944     }
945 
946     /**
947      * Start verbatim mode.
948      */
949     protected void verbatim()
950     {
951         this.inVerbatim = true;
952     }
953 
954     /**
955      * Checks if we are currently inside a &lt;pre&gt; tag.
956      *
957      * @return true if we are currently in verbatim mode.
958      */
959     protected boolean isVerbatim()
960     {
961         return this.inVerbatim;
962     }
963 
964     /**
965      * Checks if we are currently inside a &lt;script&gt; tag.
966      *
967      * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
968      * @since 1.1.1.
969      */
970     protected boolean isScriptBlock()
971     {
972         return this.scriptBlock;
973     }
974 
975     /**
976      * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
977      *
978      * @param id The id to validate.
979      * @return A transformed id or the original id if it was already valid.
980      * @see DoxiaUtils#encodeId(String)
981      */
982     protected String validAnchor( String id )
983     {
984         if ( !DoxiaUtils.isValidId( id ) )
985         {
986             String linkAnchor = DoxiaUtils.encodeId( id, true );
987 
988             LOGGER.debug( "Modified invalid link '{}' to '{}'", id, linkAnchor );
989 
990             return linkAnchor;
991         }
992 
993         return id;
994     }
995 
996     /** {@inheritDoc} */
997     @Override
998     protected void init()
999     {
1000         super.init();
1001 
1002         this.scriptBlock = false;
1003         this.isLink = false;
1004         this.isAnchor = false;
1005         this.orderedListDepth = 0;
1006         this.headingLevel = 0;
1007         this.inVerbatim = false;
1008     }
1009 
1010     private void handleAEnd( Sink sink )
1011     {
1012         if ( isLink )
1013         {
1014             sink.link_();
1015             isLink = false;
1016         }
1017         else if ( isAnchor )
1018         {
1019             sink.anchor_();
1020             isAnchor = false;
1021         }
1022     }
1023 
1024     private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1025     {
1026         String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
1027 
1028         if ( href != null )
1029         {
1030             int hashIndex = href.indexOf( '#' );
1031             if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
1032             {
1033                 String hash = href.substring( hashIndex + 1 );
1034 
1035                 if ( !DoxiaUtils.isValidId( hash ) )
1036                 {
1037                     href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
1038 
1039                     LOGGER.debug( "Modified invalid link '{}' to '{}'", hash, href );
1040                 }
1041             }
1042             sink.link( href, attribs );
1043             isLink = true;
1044         }
1045         else
1046         {
1047             String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
1048 
1049             if ( name != null )
1050             {
1051                 sink.anchor( validAnchor( name ), attribs );
1052                 isAnchor = true;
1053             }
1054             else
1055             {
1056                 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
1057                 if ( id != null )
1058                 {
1059                     sink.anchor( validAnchor( id ), attribs );
1060                     isAnchor = true;
1061                 }
1062             }
1063         }
1064     }
1065 
1066     private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
1067     {
1068         String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
1069 
1070         this.divStack.push( divclass );
1071 
1072         if ( "content".equals( divclass ) )
1073         {
1074             SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
1075             atts.removeAttribute( SinkEventAttributes.CLASS );
1076             sink.content( atts );
1077         }
1078         if ( "source".equals( divclass ) )
1079         {
1080             return false;
1081         }
1082         else
1083         {
1084             sink.division( attribs );
1085         }
1086 
1087         return true;
1088     }
1089 
1090     private boolean handleDivEnd( Sink sink )
1091     {
1092         String divclass = divStack.pop();
1093 
1094         if ( "content".equals( divclass ) )
1095         {
1096             sink.content_();
1097         }
1098         if ( "source".equals( divclass ) )
1099         {
1100             return false;
1101         }
1102         else
1103         {
1104             sink.division_();
1105         }
1106 
1107         return true;
1108     }
1109 
1110     private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1111     {
1112         String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
1113 
1114         if ( src != null )
1115         {
1116             sink.figureGraphics( src, attribs );
1117         }
1118     }
1119 
1120     private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
1121     {
1122         if ( orderedListDepth == 0 )
1123         {
1124             sink.listItem( attribs );
1125         }
1126         else
1127         {
1128             sink.numberedListItem( attribs );
1129         }
1130     }
1131 
1132     private void handleListItemEnd( Sink sink )
1133     {
1134         if ( orderedListDepth == 0 )
1135         {
1136             sink.listItem_();
1137         }
1138         else
1139         {
1140             sink.numberedListItem_();
1141         }
1142     }
1143 
1144     private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1145     {
1146         int numbering = Sink.NUMBERING_DECIMAL;
1147         // this will have to be generalized if we handle styles
1148         String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
1149 
1150         if ( style != null )
1151         {
1152             switch ( style )
1153             {
1154                 case "list-style-type: upper-alpha":
1155                     numbering = Sink.NUMBERING_UPPER_ALPHA;
1156                     break;
1157                 case "list-style-type: lower-alpha":
1158                     numbering = Sink.NUMBERING_LOWER_ALPHA;
1159                     break;
1160                 case "list-style-type: upper-roman":
1161                     numbering = Sink.NUMBERING_UPPER_ROMAN;
1162                     break;
1163                 case "list-style-type: lower-roman":
1164                     numbering = Sink.NUMBERING_LOWER_ROMAN;
1165                     break;
1166                 case "list-style-type: decimal":
1167                     numbering = Sink.NUMBERING_DECIMAL;
1168                     break;
1169                 default:
1170                     // ignore all other
1171             }
1172         }
1173 
1174         sink.numberedList( numbering, attribs );
1175         orderedListDepth++;
1176     }
1177 
1178     private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
1179     {
1180         sink.paragraph( attribs );
1181     }
1182 
1183     /*
1184      * The PRE element tells visual user agents that the enclosed text is
1185      * "preformatted". When handling preformatted text, visual user agents:
1186      * - May leave white space intact.
1187      * - May render text with a fixed-pitch font.
1188      * - May disable automatic word wrap.
1189      * - Must not disable bidirectional processing.
1190      * Non-visual user agents are not required to respect extra white space
1191      * in the content of a PRE element.
1192      */
1193     private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
1194     {
1195         verbatim();
1196         sink.verbatim( attribs );
1197     }
1198 
1199     private void handleSectionStart( Sink sink, SinkEventAttributeSet attribs )
1200     {
1201         sink.section( ++sectionLevel, attribs );
1202     }
1203 
1204     private void handleHeadingStart( Sink sink, int level, SinkEventAttributeSet attribs )
1205     {
1206         consecutiveSections( level, sink, attribs );
1207         sink.sectionTitle( level, attribs );
1208     }
1209 
1210     private void handleSectionEnd( Sink sink )
1211     {
1212         closeOpenSections( sectionLevel, sink );
1213         this.headingLevel = 0;
1214 
1215         sink.section_( sectionLevel-- );
1216     }
1217 
1218     private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
1219     {
1220         sink.table( attribs );
1221         String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
1222         boolean grid = true;
1223 
1224         if ( border == null || "0".equals( border ) )
1225         {
1226             grid = false;
1227         }
1228 
1229         String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
1230         int[] justif = {Sink.JUSTIFY_LEFT};
1231 
1232         if ( "center".equals( align ) )
1233         {
1234             justif[0] = Sink.JUSTIFY_CENTER;
1235         }
1236         else if ( "right".equals( align ) )
1237         {
1238             justif[0] = Sink.JUSTIFY_RIGHT;
1239         }
1240 
1241         sink.tableRows( justif, grid );
1242     }
1243 }