View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.doxia.parser;
20  
21  import javax.swing.text.html.HTML.Attribute;
22  
23  import java.io.Reader;
24  import java.util.HashSet;
25  import java.util.LinkedList;
26  import java.util.Set;
27  import java.util.Stack;
28  import java.util.regex.Pattern;
29  
30  import org.apache.maven.doxia.macro.MacroExecutionException;
31  import org.apache.maven.doxia.markup.HtmlMarkup;
32  import org.apache.maven.doxia.sink.Sink;
33  import org.apache.maven.doxia.sink.SinkEventAttributes;
34  import org.apache.maven.doxia.sink.impl.EventCapturingSinkProxy;
35  import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
36  import org.apache.maven.doxia.util.DoxiaUtils;
37  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
38  import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
39  import org.slf4j.Logger;
40  import org.slf4j.LoggerFactory;
41  
42  /**
43   * Common base parser for xhtml5 events.
44   */
45  public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup {
private static final Logger LOGGER = LoggerFactory.getLogger(Xhtml5BaseParser.class);

/** Matches a class attribute value that contains the class name {@code bodyTableBorder}. */
private static final Pattern BODYTABLEBORDER_CLASS_PATTERN =
        Pattern.compile("(?:.*\\s|^)bodyTableBorder(?:\\s.*|$)");

/**
 * Names of the elements that have no dedicated sink event and may carry content. Their start and
 * end tags are forwarded via {@code handleUnknown} as {@code TAG_TYPE_START} / {@code TAG_TYPE_END}.
 */
private static final Set<String> UNMATCHED_XHTML5_ELEMENTS = new HashSet<>();

/**
 * Names of the elements that have no dedicated sink event and are forwarded via
 * {@code handleUnknown} as {@code TAG_TYPE_SIMPLE} when they start.
 */
private static final Set<String> UNMATCHED_XHTML5_SIMPLE_ELEMENTS = new HashSet<>();

static {
    // Both tables keep the alphabetical-ish order in which the names were originally registered.
    final Object[] contentTags = {
        HtmlMarkup.AUDIO,
        HtmlMarkup.BUTTON,
        HtmlMarkup.CANVAS,
        HtmlMarkup.COLGROUP,
        HtmlMarkup.COMMAND,
        HtmlMarkup.DATA,
        HtmlMarkup.DATALIST,
        HtmlMarkup.DETAILS,
        HtmlMarkup.DIALOG,
        HtmlMarkup.FIELDSET,
        HtmlMarkup.FORM,
        HtmlMarkup.HGROUP,
        HtmlMarkup.IFRAME,
        HtmlMarkup.LABEL,
        HtmlMarkup.LEGEND,
        HtmlMarkup.MAP,
        HtmlMarkup.MENU,
        HtmlMarkup.METER,
        HtmlMarkup.NOSCRIPT,
        HtmlMarkup.OBJECT,
        HtmlMarkup.OPTGROUP,
        HtmlMarkup.OPTION,
        HtmlMarkup.OUTPUT,
        HtmlMarkup.PICTURE,
        HtmlMarkup.PROGRESS,
        HtmlMarkup.SELECT,
        HtmlMarkup.SUMMARY,
        HtmlMarkup.SVG,
        HtmlMarkup.TEMPLATE,
        HtmlMarkup.TEXTAREA,
        HtmlMarkup.TBODY,
        HtmlMarkup.THEAD,
        HtmlMarkup.TFOOT,
        HtmlMarkup.TIME,
        HtmlMarkup.VAR,
        HtmlMarkup.VIDEO
    };
    for (Object tag : contentTags) {
        UNMATCHED_XHTML5_ELEMENTS.add(tag.toString());
    }

    final Object[] simpleTags = {
        HtmlMarkup.AREA,
        HtmlMarkup.COL,
        HtmlMarkup.EMBED,
        HtmlMarkup.INPUT,
        HtmlMarkup.KEYGEN,
        HtmlMarkup.MENUITEM,
        HtmlMarkup.PARAM,
        HtmlMarkup.SOURCE,
        HtmlMarkup.TRACK
    };
    for (Object tag : simpleTags) {
        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(tag.toString());
    }
}
102 
103     /**
104      * True if a &lt;script&gt;&lt;/script&gt; or &lt;style&gt;&lt;/style&gt; block is read. CDATA sections within are
105      * handled as rawText.
106      */
107     private boolean scriptBlock;
108 
109     /** Used to distinguish &lt;a href=""&gt; from &lt;a name=""&gt;. */
110     private boolean isLink;
111 
112     /** Used to distinguish &lt;a href=""&gt; from &lt;a name=""&gt;. */
113     private boolean isAnchor;
114 
115     /** Used for nested lists. */
116     private int orderedListDepth = 0;
117 
118     /** Counts section nesting level of the sections manually set in the HTML document */
119     private int sectionLevel;
120 
121     /** Counts current heading level. This is either the {@link #sectionLevel} if no artificial sections are currently open
122      * for headings or a number higher or lower than {@link #sectionLevel} (for all section currently opened/closed for a preceding heading).
123      * The heading level only changes when a new heading starts, or a section starts or ends. */
124     private int headingLevel;
125 
126     /** Verbatim flag, true whenever we are inside a &lt;pre&gt; tag. */
127     private boolean inVerbatim;
128 
129     /** Used to keep track of closing tags for content events */
130     private Stack<String> divStack = new Stack<>();
131 
132     /** Used to wrap the definedTerm with its definition, even when one is omitted */
133     boolean hasDefinitionListItem = false;
134 
135     private LinkedList<String> capturedSinkEventNames;
136 
137     /** {@inheritDoc} */
138     @Override
139     public void parse(Reader source, Sink sink, String reference) throws ParseException {
140         init();
141 
142         try {
143             capturedSinkEventNames = new LinkedList<>();
144             Sink capturingSink = EventCapturingSinkProxy.newInstance(sink, capturedSinkEventNames);
145             super.parse(source, capturingSink, reference);
146         } finally {
147             setSecondParsing(false);
148             init();
149         }
150     }
151 
152     /**
153      * {@inheritDoc}
154      *
155      * Adds all XHTML (HTML 5.2) entities to the parser so that they can be recognized and resolved
156      * without additional DTD.
157      */
158     @Override
159     protected void initXmlParser(XmlPullParser parser) throws XmlPullParserException {
160         super.initXmlParser(parser);
161     }
162 
163     /**
164      * <p>
165      *   Goes through a common list of possible html5 start tags. These include only tags that can go into
166      *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
167      * </p>
168      * <p>
169      *   The currently handled tags are:
170      * </p>
171      * <p>
172      *   <code>
173      *      &lt;article&gt;, &lt;nav&gt;, &lt;aside&gt;, &lt;section&gt;, &lt;h1&gt;, &lt;h2&gt;, &lt;h3&gt;,
174      *      &lt;h4&gt;, &lt;h5&gt;, &lt;header&gt;, &lt;main&gt;, &lt;footer&gt;, &lt;em&gt;, &lt;strong&gt;,
175      *      &lt;small&gt;, &lt;s&gt;, &lt;cite&gt;, &lt;q&gt;, &lt;dfn&gt;, &lt;abbr&gt;, &lt;i&gt;,
176      *      &lt;b&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;kbd&gt;, &lt;sub&gt;, &lt;sup&gt;, &lt;u&gt;,
177      *      &lt;mark&gt;, &lt;ruby&gt;, &lt;rb&gt;, &lt;rt&gt;, &lt;rtc&gt;, &lt;rp&gt;, &lt;bdi&gt;,
178      *      &lt;bdo&gt;, &lt;span&gt;, &lt;ins&gt;, &lt;del&gt;, &lt;p&gt;, &lt;pre&gt;, &lt;ul&gt;,
179      *      &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;a&gt;, &lt;table&gt;,
180      *      &lt;tr&gt;, &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;wbr/&gt;, &lt;hr/&gt;,
181      *      &lt;img/&gt;.
182      *   </code>
183      * </p>
184      *
185      * @param parser A parser.
186      * @param sink the sink to receive the events.
187      * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
188      */
189     protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
190         SinkEventAttributeSet attribs = getAttributesFromParser(parser);
191         return baseStartTag(parser.getName(), attribs, sink);
192     }
193 
194     protected boolean baseStartTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
195         boolean visited = true;
196 
197         if (elementName.equals(HtmlMarkup.ARTICLE.toString())) {
198             sink.article(attribs);
199         } else if (elementName.equals(HtmlMarkup.NAV.toString())) {
200             sink.navigation(attribs);
201         } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) {
202             sink.sidebar(attribs);
203         } else if (elementName.equals(HtmlMarkup.SECTION.toString())) {
204             handleSectionStart(sink, attribs);
205         } else if (elementName.equals(HtmlMarkup.H1.toString())) {
206             handleHeadingStart(sink, Sink.SECTION_LEVEL_1, attribs);
207         } else if (elementName.equals(HtmlMarkup.H2.toString())) {
208             handleHeadingStart(sink, Sink.SECTION_LEVEL_2, attribs);
209         } else if (elementName.equals(HtmlMarkup.H3.toString())) {
210             handleHeadingStart(sink, Sink.SECTION_LEVEL_3, attribs);
211         } else if (elementName.equals(HtmlMarkup.H4.toString())) {
212             handleHeadingStart(sink, Sink.SECTION_LEVEL_4, attribs);
213         } else if (elementName.equals(HtmlMarkup.H5.toString())) {
214             handleHeadingStart(sink, Sink.SECTION_LEVEL_5, attribs);
215         } else if (elementName.equals(HtmlMarkup.H6.toString())) {
216             handleHeadingStart(sink, Sink.SECTION_LEVEL_6, attribs);
217         } else if (elementName.equals(HtmlMarkup.HEADER.toString())) {
218             sink.header(attribs);
219         } else if (elementName.equals(HtmlMarkup.MAIN.toString())) {
220             sink.content(attribs);
221         } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) {
222             sink.footer(attribs);
223         } else if (elementName.equals(HtmlMarkup.EM.toString())) {
224             attribs.addAttributes(SinkEventAttributeSet.Semantics.EMPHASIS);
225             sink.inline(attribs);
226         } else if (elementName.equals(HtmlMarkup.STRONG.toString())) {
227             attribs.addAttributes(SinkEventAttributeSet.Semantics.STRONG);
228             sink.inline(attribs);
229         } else if (elementName.equals(HtmlMarkup.SMALL.toString())) {
230             attribs.addAttributes(SinkEventAttributeSet.Semantics.SMALL);
231             sink.inline(attribs);
232         } else if (elementName.equals(HtmlMarkup.S.toString())) {
233             attribs.addAttributes(SinkEventAttributeSet.Semantics.LINE_THROUGH);
234             sink.inline(attribs);
235             /* deprecated line-through support */
236         } else if (elementName.equals(HtmlMarkup.CITE.toString())) {
237             attribs.addAttributes(SinkEventAttributeSet.Semantics.CITATION);
238             sink.inline(attribs);
239         } else if (elementName.equals(HtmlMarkup.Q.toString())) {
240             attribs.addAttributes(SinkEventAttributeSet.Semantics.QUOTE);
241             sink.inline(attribs);
242         } else if (elementName.equals(HtmlMarkup.DFN.toString())) {
243             attribs.addAttributes(SinkEventAttributeSet.Semantics.DEFINITION);
244             sink.inline(attribs);
245         } else if (elementName.equals(HtmlMarkup.ABBR.toString())) {
246             attribs.addAttributes(SinkEventAttributeSet.Semantics.ABBREVIATION);
247             sink.inline(attribs);
248         } else if (elementName.equals(HtmlMarkup.I.toString())) {
249             attribs.addAttributes(SinkEventAttributeSet.Semantics.ITALIC);
250             sink.inline(attribs);
251         } else if (elementName.equals(HtmlMarkup.B.toString())) {
252             attribs.addAttributes(SinkEventAttributeSet.Semantics.BOLD);
253             sink.inline(attribs);
254         } else if (elementName.equals(HtmlMarkup.CODE.toString())) {
255             attribs.addAttributes(SinkEventAttributeSet.Semantics.CODE);
256             sink.inline(attribs);
257         } else if (elementName.equals(HtmlMarkup.VAR.toString())) {
258             attribs.addAttributes(SinkEventAttributeSet.Semantics.VARIABLE);
259             sink.inline(attribs);
260         } else if (elementName.equals(HtmlMarkup.SAMP.toString())) {
261             attribs.addAttributes(SinkEventAttributeSet.Semantics.SAMPLE);
262             sink.inline(attribs);
263         } else if (elementName.equals(HtmlMarkup.KBD.toString())) {
264             attribs.addAttributes(SinkEventAttributeSet.Semantics.KEYBOARD);
265             sink.inline(attribs);
266         } else if (elementName.equals(HtmlMarkup.SUP.toString())) {
267             attribs.addAttributes(SinkEventAttributeSet.Semantics.SUPERSCRIPT);
268             sink.inline(attribs);
269         } else if (elementName.equals(HtmlMarkup.SUB.toString())) {
270             attribs.addAttributes(SinkEventAttributeSet.Semantics.SUBSCRIPT);
271             sink.inline(attribs);
272         } else if (elementName.equals(HtmlMarkup.U.toString())) {
273             attribs.addAttributes(SinkEventAttributeSet.Semantics.ANNOTATION);
274             sink.inline(attribs);
275         } else if (elementName.equals(HtmlMarkup.MARK.toString())) {
276             attribs.addAttributes(SinkEventAttributeSet.Semantics.HIGHLIGHT);
277             sink.inline(attribs);
278         } else if (elementName.equals(HtmlMarkup.RUBY.toString())) {
279             attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY);
280             sink.inline(attribs);
281         } else if (elementName.equals(HtmlMarkup.RB.toString())) {
282             attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_BASE);
283             sink.inline(attribs);
284         } else if (elementName.equals(HtmlMarkup.RT.toString())) {
285             attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT);
286             sink.inline(attribs);
287         } else if (elementName.equals(HtmlMarkup.RTC.toString())) {
288             attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER);
289             sink.inline(attribs);
290         } else if (elementName.equals(HtmlMarkup.RP.toString())) {
291             attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_PARANTHESES);
292             sink.inline(attribs);
293         } else if (elementName.equals(HtmlMarkup.BDI.toString())) {
294             attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION);
295             sink.inline(attribs);
296         } else if (elementName.equals(HtmlMarkup.BDO.toString())) {
297             attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE);
298             sink.inline(attribs);
299         } else if (elementName.equals(HtmlMarkup.SPAN.toString())) {
300             attribs.addAttributes(SinkEventAttributeSet.Semantics.PHRASE);
301             sink.inline(attribs);
302         } else if (elementName.equals(HtmlMarkup.INS.toString())) {
303             attribs.addAttributes(SinkEventAttributeSet.Semantics.INSERT);
304             sink.inline(attribs);
305         } else if (elementName.equals(HtmlMarkup.DEL.toString())) {
306             attribs.addAttributes(SinkEventAttributeSet.Semantics.DELETE);
307             sink.inline(attribs);
308         } else if (elementName.equals(HtmlMarkup.P.toString())) {
309             handlePStart(sink, attribs);
310         } else if (elementName.equals(HtmlMarkup.DIV.toString())) {
311             handleDivStart(attribs, sink);
312         } else if (elementName.equals(HtmlMarkup.PRE.toString())) {
313             handlePreStart(attribs, sink);
314         } else if (elementName.equals(HtmlMarkup.UL.toString())) {
315             sink.list(attribs);
316         } else if (elementName.equals(HtmlMarkup.OL.toString())) {
317             handleOLStart(sink, attribs);
318         } else if (elementName.equals(HtmlMarkup.LI.toString())) {
319             handleLIStart(sink, attribs);
320         } else if (elementName.equals(HtmlMarkup.DL.toString())) {
321             sink.definitionList(attribs);
322         } else if (elementName.equals(HtmlMarkup.DT.toString())) {
323             if (hasDefinitionListItem) {
324                 // close previous listItem
325                 sink.definitionListItem_();
326             }
327             sink.definitionListItem(attribs);
328             hasDefinitionListItem = true;
329             sink.definedTerm(attribs);
330         } else if (elementName.equals(HtmlMarkup.DD.toString())) {
331             if (!hasDefinitionListItem) {
332                 sink.definitionListItem(attribs);
333             }
334             sink.definition(attribs);
335         } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) {
336             sink.figure(attribs);
337         } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) {
338             sink.figureCaption(attribs);
339         } else if (elementName.equals(HtmlMarkup.A.toString())) {
340             handleAStart(sink, attribs);
341         } else if (elementName.equals(HtmlMarkup.TABLE.toString())) {
342             handleTableStart(sink, attribs);
343         } else if (elementName.equals(HtmlMarkup.TR.toString())) {
344             sink.tableRow(attribs);
345         } else if (elementName.equals(HtmlMarkup.TH.toString())) {
346             sink.tableHeaderCell(attribs);
347         } else if (elementName.equals(HtmlMarkup.TD.toString())) {
348             sink.tableCell(attribs);
349         } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) {
350             sink.tableCaption(attribs);
351         } else if (elementName.equals(HtmlMarkup.BR.toString())) {
352             sink.lineBreak(attribs);
353         } else if (elementName.equals(HtmlMarkup.WBR.toString())) {
354             sink.lineBreakOpportunity(attribs);
355         } else if (elementName.equals(HtmlMarkup.HR.toString())) {
356             sink.horizontalRule(attribs);
357         } else if (elementName.equals(HtmlMarkup.IMG.toString())) {
358             handleImgStart(sink, attribs);
359         } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) {
360             sink.blockquote(attribs);
361         } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
362             handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
363         } else if (UNMATCHED_XHTML5_SIMPLE_ELEMENTS.contains(elementName)) {
364             handleUnknown(elementName, attribs, sink, TAG_TYPE_SIMPLE);
365         } else if (elementName.equals(HtmlMarkup.SCRIPT.toString())
366                 || elementName.equals(HtmlMarkup.STYLE.toString())) {
367             handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
368             scriptBlock = true;
369         } else {
370             visited = false;
371         }
372 
373         return visited;
374     }
375 
376     /**
377      * <p>
378      *   Goes through a common list of possible html end tags.
379      *   These should be re-usable by different xhtml-based parsers.
380      *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
381      *   except for the empty elements ({@code <br/>, <hr/>, <img/>}).
382      * </p>
383      *
384      * @param parser A parser.
385      * @param sink the sink to receive the events.
386      * @return True if the event has been handled by this method, false otherwise.
387      */
388     protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
389         SinkEventAttributeSet attribs = getAttributesFromParser(parser);
390         return baseEndTag(parser.getName(), attribs, sink);
391     }
392 
393     protected boolean baseEndTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
394         boolean visited = true;
395 
396         if (elementName.equals(HtmlMarkup.P.toString())) {
397             sink.paragraph_();
398         } else if (elementName.equals(HtmlMarkup.DIV.toString())) {
399             handleDivEnd(sink);
400         } else if (elementName.equals(HtmlMarkup.PRE.toString())) {
401             verbatim_();
402 
403             sink.verbatim_();
404         } else if (elementName.equals(HtmlMarkup.UL.toString())) {
405             sink.list_();
406         } else if (elementName.equals(HtmlMarkup.OL.toString())) {
407             sink.numberedList_();
408             orderedListDepth--;
409         } else if (elementName.equals(HtmlMarkup.LI.toString())) {
410             handleListItemEnd(sink);
411         } else if (elementName.equals(HtmlMarkup.DL.toString())) {
412             if (hasDefinitionListItem) {
413                 sink.definitionListItem_();
414                 hasDefinitionListItem = false;
415             }
416             sink.definitionList_();
417         } else if (elementName.equals(HtmlMarkup.DT.toString())) {
418             sink.definedTerm_();
419         } else if (elementName.equals(HtmlMarkup.DD.toString())) {
420             sink.definition_();
421             sink.definitionListItem_();
422             hasDefinitionListItem = false;
423         } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) {
424             sink.figure_();
425         } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) {
426             sink.figureCaption_();
427         } else if (elementName.equals(HtmlMarkup.A.toString())) {
428             handleAEnd(sink);
429         } else if (elementName.equals(HtmlMarkup.EM.toString())) {
430             sink.inline_();
431         } else if (elementName.equals(HtmlMarkup.STRONG.toString())) {
432             sink.inline_();
433         } else if (elementName.equals(HtmlMarkup.SMALL.toString())) {
434             sink.inline_();
435         } else if (elementName.equals(HtmlMarkup.S.toString())) {
436             sink.inline_();
437         } else if (elementName.equals(HtmlMarkup.CITE.toString())) {
438             sink.inline_();
439         } else if (elementName.equals(HtmlMarkup.Q.toString())) {
440             sink.inline_();
441         } else if (elementName.equals(HtmlMarkup.DFN.toString())) {
442             sink.inline_();
443         } else if (elementName.equals(HtmlMarkup.ABBR.toString())) {
444             sink.inline_();
445         } else if (elementName.equals(HtmlMarkup.I.toString())) {
446             sink.inline_();
447         } else if (elementName.equals(HtmlMarkup.B.toString())) {
448             sink.inline_();
449         } else if (elementName.equals(HtmlMarkup.CODE.toString())) {
450             sink.inline_();
451         } else if (elementName.equals(HtmlMarkup.VAR.toString())) {
452             sink.inline_();
453         } else if (elementName.equals(HtmlMarkup.SAMP.toString())) {
454             sink.inline_();
455         } else if (elementName.equals(HtmlMarkup.KBD.toString())) {
456             sink.inline_();
457         } else if (elementName.equals(HtmlMarkup.SUP.toString())) {
458             sink.inline_();
459         } else if (elementName.equals(HtmlMarkup.SUB.toString())) {
460             sink.inline_();
461         } else if (elementName.equals(HtmlMarkup.U.toString())) {
462             sink.inline_();
463         } else if (elementName.equals(HtmlMarkup.MARK.toString())) {
464             sink.inline_();
465         } else if (elementName.equals(HtmlMarkup.RUBY.toString())) {
466             sink.inline_();
467         } else if (elementName.equals(HtmlMarkup.RB.toString())) {
468             sink.inline_();
469         } else if (elementName.equals(HtmlMarkup.RT.toString())) {
470             sink.inline_();
471         } else if (elementName.equals(HtmlMarkup.RTC.toString())) {
472             sink.inline_();
473         } else if (elementName.equals(HtmlMarkup.RP.toString())) {
474             sink.inline_();
475         } else if (elementName.equals(HtmlMarkup.BDI.toString())) {
476             sink.inline_();
477         } else if (elementName.equals(HtmlMarkup.BDO.toString())) {
478             sink.inline_();
479         } else if (elementName.equals(HtmlMarkup.SPAN.toString())) {
480             sink.inline_();
481         } else if (elementName.equals(HtmlMarkup.INS.toString())) {
482             sink.inline_();
483         } else if (elementName.equals(HtmlMarkup.DEL.toString())) {
484             sink.inline_();
485         }
486 
487         // ----------------------------------------------------------------------
488         // Tables
489         // ----------------------------------------------------------------------
490 
491         else if (elementName.equals(HtmlMarkup.TABLE.toString())) {
492             sink.tableRows_();
493             sink.table_();
494         } else if (elementName.equals(HtmlMarkup.TR.toString())) {
495             sink.tableRow_();
496         } else if (elementName.equals(HtmlMarkup.TH.toString())) {
497             sink.tableHeaderCell_();
498         } else if (elementName.equals(HtmlMarkup.TD.toString())) {
499             sink.tableCell_();
500         } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) {
501             sink.tableCaption_();
502         } else if (elementName.equals(HtmlMarkup.ARTICLE.toString())) {
503             sink.article_();
504         } else if (elementName.equals(HtmlMarkup.NAV.toString())) {
505             sink.navigation_();
506         } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) {
507             sink.sidebar_();
508         } else if (elementName.equals(HtmlMarkup.SECTION.toString())) {
509             handleSectionEnd(sink);
510         } else if (elementName.equals(HtmlMarkup.H1.toString())) {
511             sink.sectionTitle1_();
512         } else if (elementName.equals(HtmlMarkup.H2.toString())) {
513             sink.sectionTitle2_();
514         } else if (elementName.equals(HtmlMarkup.H3.toString())) {
515             sink.sectionTitle3_();
516         } else if (elementName.equals(HtmlMarkup.H4.toString())) {
517             sink.sectionTitle4_();
518         } else if (elementName.equals(HtmlMarkup.H5.toString())) {
519             sink.sectionTitle5_();
520         } else if (elementName.equals(HtmlMarkup.H6.toString())) {
521             sink.sectionTitle6_();
522         } else if (elementName.equals(HtmlMarkup.HEADER.toString())) {
523             sink.header_();
524         } else if (elementName.equals(HtmlMarkup.MAIN.toString())) {
525             sink.content_();
526         } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) {
527             sink.footer_();
528         } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) {
529             sink.blockquote_();
530         } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
531             handleUnknown(elementName, attribs, sink, TAG_TYPE_END);
532         } else if (elementName.equals(HtmlMarkup.SCRIPT.toString())
533                 || elementName.equals(HtmlMarkup.STYLE.toString())) {
534             handleUnknown(elementName, attribs, sink, TAG_TYPE_END);
535 
536             scriptBlock = false;
537         } else {
538             visited = false;
539         }
540 
541         return visited;
542     }
543 
544     /**
545      * {@inheritDoc}
546      *
547      * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
548      * overridden by implementing parsers to include additional tags.
549      */
550     protected void handleStartTag(XmlPullParser parser, Sink sink)
551             throws XmlPullParserException, MacroExecutionException {
552         if (!baseStartTag(parser, sink)) {
553             LOGGER.warn(
554                     "Unrecognized xml tag <{}> at [{}:{}]",
555                     parser.getName(),
556                     parser.getLineNumber(),
557                     parser.getColumnNumber());
558         }
559     }
560 
561     /**
562      * {@inheritDoc}
563      *
564      * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
565      * overridden by implementing parsers to include additional tags.
566      */
567     protected void handleEndTag(XmlPullParser parser, Sink sink)
568             throws XmlPullParserException, MacroExecutionException {
569         if (!baseEndTag(parser, sink)) {
570             // unrecognized tag is already logged in StartTag
571         }
572     }
573 
574     /** {@inheritDoc} */
575     @Override
576     protected void handleText(XmlPullParser parser, Sink sink) throws XmlPullParserException {
577         String text = getText(parser);
578 
579         /*
580          * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
581          * parser so any whitespace that makes it here is significant.
582          *
583          * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
584          */
585         if ((text != null && !text.isEmpty()) && !isScriptBlock()) {
586             sink.text(text);
587         }
588     }
589 
590     /** {@inheritDoc} */
591     @Override
592     protected void handleComment(XmlPullParser parser, Sink sink) throws XmlPullParserException {
593         String text = getText(parser);
594 
595         if ("PB".equals(text.trim())) {
596             sink.pageBreak();
597         } else {
598             if (isEmitComments()) {
599                 sink.comment(text);
600             }
601         }
602     }
603 
604     /** {@inheritDoc} */
605     @Override
606     protected void handleCdsect(XmlPullParser parser, Sink sink) throws XmlPullParserException {
607         String text = getText(parser);
608 
609         if (isScriptBlock()) {
610             sink.unknown(CDATA, new Object[] {CDATA_TYPE, text}, null);
611         } else {
612             sink.text(text);
613         }
614     }
615 
616     /**
617      * Shortcut for {@link #emitHeadingSections(int, Sink, boolean)} with last argument being {@code true}
618      * @param newLevel
619      * @param sink
620      * @param attribs
621      * @deprecated Use {@link #emitHeadingSections(int, Sink, boolean)} instead.
622      */
623     @Deprecated
624     protected void consecutiveSections(int newLevel, Sink sink, SinkEventAttributeSet attribs) {
625         emitHeadingSections(newLevel, sink, true);
626     }
627 
628     /**
629      * Make sure sections are nested consecutively and correctly inserted for the given heading level
630      *
631      * <p>
632      * HTML5 heading tags H1 to H5 imply same level sections in Sink API (compare with {@link Sink#sectionTitle(int, SinkEventAttributes)}).
633      * However (X)HTML5 allows headings without explicit surrounding section elements and is also
634      * less strict with non-consecutive heading levels.
635      * This methods both closes open sections which have been added for previous headings and/or opens
636      * sections necessary for the new heading level.
637      * At least one section needs to be opened directly prior the heading due to Sink API restrictions.
638      * </p>
639      *
640      * <p>
641      * For instance, if the following sequence is parsed:
642      * </p>
643      * <pre>
644      * &lt;h2&gt;&lt;/h2&gt;
645      * &lt;h5&gt;&lt;/h5&gt;
646      * </pre>
647      * <p>
648      * we have to insert two section starts before we open the <code>&lt;h5&gt;</code>.
649      * In the following sequence
650      * </p>
651      * <pre>
652      * &lt;h5&gt;&lt;/h5&gt;
653      * &lt;h2&gt;&lt;/h2&gt;
654      * </pre>
655      * <p>
656      * we have to close two sections before we open the <code>&lt;h2&gt;</code>.
657      * </p>
658      *
659      * <p>The current heading level is set to newLevel afterwards.</p>
660      *
661      * @param newLevel the new section level, all upper levels have to be closed.
662      * @param sink the sink to receive the events.
663      * @param enforceNewSection whether to enforce a new section or not
664      */
665     protected void emitHeadingSections(int newLevel, Sink sink, boolean enforceNewSection) {
666         int lowerBoundSectionLevel = newLevel;
667         if (enforceNewSection) {
668             // close one more if either last event was not section start or the new level is lower than the current one
669             // (in this case the last event may be a section start event but for another level)
670             if (!isLastEventSectionStart() || newLevel < this.headingLevel) {
671                 lowerBoundSectionLevel--;
672             }
673         }
674         closeOpenHeadingSections(lowerBoundSectionLevel, sink);
675         openMissingHeadingSections(newLevel, sink);
676 
677         this.headingLevel = newLevel;
678     }
679 
680     private boolean isLastEventSectionStart() {
681         String lastEventName = capturedSinkEventNames.pollLast();
682         if (lastEventName == null) {
683             return false;
684         }
685         return lastEventName.startsWith("section")
686                 && !lastEventName.endsWith("_")
687                 && !lastEventName.startsWith("sectionTitle");
688     }
689 
690     /**
691      * Close open heading sections.
692      *
693      * @param newLevel the new section level, all upper levels have to be closed.
694      * @param sink the sink to receive the events.
695      */
696     private void closeOpenHeadingSections(int newLevel, Sink sink) {
697         while (this.headingLevel > newLevel) {
698             if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
699                 sink.section_(headingLevel);
700             }
701 
702             this.headingLevel--;
703         }
704         // enforce the previous element is a section
705     }
706 
707     /**
708      * Open missing heading sections.
709      *
710      * @param newLevel the new section level, all lower levels have to be opened.
711      * @param sink the sink to receive the events.
712      */
713     private void openMissingHeadingSections(int newLevel, Sink sink) {
714         while (this.headingLevel < newLevel) {
715             this.headingLevel++;
716 
717             if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
718                 sink.section(headingLevel, null);
719             }
720         }
721     }
722 
723     /**
724      * Return the current section level.
725      *
726      * @return the current section level.
727      */
728     protected int getSectionLevel() {
729         return this.headingLevel;
730     }
731 
732     /**
733      * Set the current section level.
734      *
735      * @param newLevel the new section level.
736      */
737     protected void setSectionLevel(int newLevel) {
738         this.headingLevel = newLevel;
739     }
740 
741     /**
742      * Stop verbatim mode.
743      */
744     protected void verbatim_() {
745         this.inVerbatim = false;
746     }
747 
748     /**
749      * Start verbatim mode.
750      */
751     protected void verbatim() {
752         this.inVerbatim = true;
753     }
754 
755     /**
756      * Checks if we are currently inside a &lt;pre&gt; tag.
757      *
758      * @return true if we are currently in verbatim mode.
759      */
760     protected boolean isVerbatim() {
761         return this.inVerbatim;
762     }
763 
764     /**
765      * Checks if we are currently inside a &lt;script&gt; tag.
766      *
767      * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
768      * @since 1.1.1.
769      */
770     protected boolean isScriptBlock() {
771         return this.scriptBlock;
772     }
773 
774     /**
775      * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
776      *
777      * @param id The id to validate.
778      * @return A transformed id or the original id if it was already valid.
779      * @see DoxiaUtils#encodeId(String)
780      */
781     protected String validAnchor(String id) {
782         if (!DoxiaUtils.isValidId(id)) {
783             String linkAnchor = DoxiaUtils.encodeId(id);
784 
785             LOGGER.debug("Modified invalid link '{}' to '{}'", id, linkAnchor);
786 
787             return linkAnchor;
788         }
789 
790         return id;
791     }
792 
793     /** {@inheritDoc} */
794     @Override
795     protected void init() {
796         super.init();
797 
798         this.scriptBlock = false;
799         this.isLink = false;
800         this.isAnchor = false;
801         this.orderedListDepth = 0;
802         this.headingLevel = 0;
803         this.inVerbatim = false;
804     }
805 
806     private void handleAEnd(Sink sink) {
807         if (isLink) {
808             sink.link_();
809             isLink = false;
810         } else if (isAnchor) {
811             sink.anchor_();
812             isAnchor = false;
813         }
814     }
815 
816     private void handleAStart(Sink sink, SinkEventAttributeSet attribs) {
817         String href = (String) attribs.getAttribute(Attribute.HREF.toString());
818 
819         if (href != null) {
820             int hashIndex = href.indexOf('#');
821             if (hashIndex != -1 && !DoxiaUtils.isExternalLink(href)) {
822                 String hash = href.substring(hashIndex + 1);
823 
824                 if (!DoxiaUtils.isValidId(hash)) {
825                     href = href.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash);
826 
827                     LOGGER.debug("Modified invalid link '{}' to '{}'", hash, href);
828                 }
829             }
830             sink.link(href, attribs);
831             isLink = true;
832         } else {
833             String id = (String) attribs.getAttribute(Attribute.ID.toString());
834             if (id != null) {
835                 sink.anchor(validAnchor(id), attribs);
836                 isAnchor = true;
837             }
838         }
839     }
840 
841     private boolean handleDivStart(SinkEventAttributeSet attribs, Sink sink) {
842         String divClass = (String) attribs.getAttribute(Attribute.CLASS.toString());
843 
844         this.divStack.push(divClass);
845 
846         if ("content".equals(divClass)) {
847             SinkEventAttributeSet atts = new SinkEventAttributeSet(attribs);
848             atts.removeAttribute(SinkEventAttributes.CLASS);
849             sink.content(atts);
850         }
851         if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
852             return false;
853         } else {
854             sink.division(attribs);
855         }
856 
857         return true;
858     }
859 
860     private boolean handleDivEnd(Sink sink) {
861         String divClass = divStack.pop();
862 
863         if ("content".equals(divClass)) {
864             sink.content_();
865         }
866         if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
867             return false;
868         } else {
869             sink.division_();
870         }
871 
872         return true;
873     }
874 
875     private void handleImgStart(Sink sink, SinkEventAttributeSet attribs) {
876         String src = (String) attribs.getAttribute(Attribute.SRC.toString());
877 
878         if (src != null) {
879             sink.figureGraphics(src, attribs);
880         }
881     }
882 
883     private void handleLIStart(Sink sink, SinkEventAttributeSet attribs) {
884         if (orderedListDepth == 0) {
885             sink.listItem(attribs);
886         } else {
887             sink.numberedListItem(attribs);
888         }
889     }
890 
891     private void handleListItemEnd(Sink sink) {
892         if (orderedListDepth == 0) {
893             sink.listItem_();
894         } else {
895             sink.numberedListItem_();
896         }
897     }
898 
899     private void handleOLStart(Sink sink, SinkEventAttributeSet attribs) {
900         int numbering = Sink.NUMBERING_DECIMAL;
901         // this will have to be generalized if we handle styles
902         String style = (String) attribs.getAttribute(Attribute.STYLE.toString());
903 
904         if (style != null) {
905             switch (style) {
906                 case "list-style-type: upper-alpha;":
907                     numbering = Sink.NUMBERING_UPPER_ALPHA;
908                     break;
909                 case "list-style-type: lower-alpha;":
910                     numbering = Sink.NUMBERING_LOWER_ALPHA;
911                     break;
912                 case "list-style-type: upper-roman;":
913                     numbering = Sink.NUMBERING_UPPER_ROMAN;
914                     break;
915                 case "list-style-type: lower-roman;":
916                     numbering = Sink.NUMBERING_LOWER_ROMAN;
917                     break;
918                 case "list-style-type: decimal;":
919                     numbering = Sink.NUMBERING_DECIMAL;
920                     break;
921                 default:
922                     // ignore all other
923             }
924         }
925 
926         sink.numberedList(numbering, attribs);
927         orderedListDepth++;
928     }
929 
930     private void handlePStart(Sink sink, SinkEventAttributeSet attribs) {
931         sink.paragraph(attribs);
932     }
933 
934     /*
935      * The PRE element tells visual user agents that the enclosed text is
936      * "preformatted". When handling preformatted text, visual user agents:
937      * - May leave white space intact.
938      * - May render text with a fixed-pitch font.
939      * - May disable automatic word wrap.
940      * - Must not disable bidirectional processing.
941      * Non-visual user agents are not required to respect extra white space
942      * in the content of a PRE element.
943      */
944     private void handlePreStart(SinkEventAttributeSet attribs, Sink sink) {
945         verbatim();
946         sink.verbatim(attribs);
947     }
948 
949     private void handleSectionStart(Sink sink, SinkEventAttributeSet attribs) {
950         emitHeadingSections(sectionLevel, sink, false);
951         sink.section(++sectionLevel, attribs);
952         this.headingLevel = sectionLevel;
953     }
954 
955     private void handleHeadingStart(Sink sink, int level, SinkEventAttributeSet attribs) {
956         emitHeadingSections(level, sink, true);
957         sink.sectionTitle(level, attribs);
958     }
959 
960     private void handleSectionEnd(Sink sink) {
961         emitHeadingSections(sectionLevel, sink, false);
962         sink.section_(sectionLevel--);
963         this.headingLevel = sectionLevel;
964     }
965 
966     private void handleTableStart(Sink sink, SinkEventAttributeSet attribs) {
967         sink.table(attribs);
968         String givenTableClass = (String) attribs.getAttribute(Attribute.CLASS.toString());
969         boolean grid = false;
970         if (givenTableClass != null
971                 && BODYTABLEBORDER_CLASS_PATTERN.matcher(givenTableClass).matches()) {
972             grid = true;
973         }
974 
975         sink.tableRows(null, grid);
976     }
977 }