001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.maven.doxia.parser;
020
021import javax.swing.text.html.HTML.Attribute;
022
023import java.io.Reader;
024import java.util.HashSet;
025import java.util.LinkedList;
026import java.util.Set;
027import java.util.Stack;
028import java.util.regex.Pattern;
029
030import org.apache.maven.doxia.macro.MacroExecutionException;
031import org.apache.maven.doxia.markup.HtmlMarkup;
032import org.apache.maven.doxia.sink.Sink;
033import org.apache.maven.doxia.sink.SinkEventAttributes;
034import org.apache.maven.doxia.sink.impl.EventCapturingSinkProxy;
035import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
036import org.apache.maven.doxia.util.DoxiaUtils;
037import org.codehaus.plexus.util.xml.pull.XmlPullParser;
038import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
039import org.slf4j.Logger;
040import org.slf4j.LoggerFactory;
041
042/**
043 * Common base parser for xhtml5 events.
044 */
045public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup {
    /** Logger used by this parser, e.g. to warn about unrecognized tags. */
    private static final Logger LOGGER = LoggerFactory.getLogger(Xhtml5BaseParser.class);

    /**
     * Used to identify if a class string contains {@code bodyTableBorder} as a separate token, i.e. preceded by
     * the start of the string or whitespace and followed by whitespace or the end of the string.
     */
    private static final Pattern BODYTABLEBORDER_CLASS_PATTERN =
            Pattern.compile("(?:.*\\s|^)bodyTableBorder(?:\\s.*|$)");

    /**
     * Names of elements without a dedicated sink event that have separate start and end tags.
     * They are forwarded via {@code handleUnknown} with {@code TAG_TYPE_START} / {@code TAG_TYPE_END}.
     * Populated once in the static initializer.
     */
    private static final Set<String> UNMATCHED_XHTML5_ELEMENTS = new HashSet<>();

    /**
     * Names of elements without a dedicated sink event that are forwarded as a single
     * {@code TAG_TYPE_SIMPLE} unknown event when their start tag is read.
     * Populated once in the static initializer.
     */
    private static final Set<String> UNMATCHED_XHTML5_SIMPLE_ELEMENTS = new HashSet<>();
054
055    static {
056        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.AREA.toString());
057        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.AUDIO.toString());
058        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.BUTTON.toString());
059        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.CANVAS.toString());
060        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.COL.toString());
061        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.COLGROUP.toString());
062        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.COMMAND.toString());
063        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DATA.toString());
064        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DATALIST.toString());
065        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DETAILS.toString());
066        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DIALOG.toString());
067        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.EMBED.toString());
068        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.FIELDSET.toString());
069        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.FORM.toString());
070        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.HGROUP.toString());
071        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.IFRAME.toString());
072        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.INPUT.toString());
073        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.KEYGEN.toString());
074        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.LABEL.toString());
075        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.LEGEND.toString());
076        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.MAP.toString());
077        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.MENU.toString());
078        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.MENUITEM.toString());
079        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.METER.toString());
080        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.NOSCRIPT.toString());
081        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OBJECT.toString());
082        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OPTGROUP.toString());
083        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OPTION.toString());
084        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OUTPUT.toString());
085        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.PARAM.toString());
086        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.PICTURE.toString());
087        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.PROGRESS.toString());
088        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SELECT.toString());
089        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.SOURCE.toString());
090        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SUMMARY.toString());
091        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SVG.toString());
092        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TEMPLATE.toString());
093        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TEXTAREA.toString());
094        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TBODY.toString());
095        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.THEAD.toString());
096        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TFOOT.toString());
097        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TIME.toString());
098        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.TRACK.toString());
099        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.VAR.toString());
100        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.VIDEO.toString());
101    }
102
    /**
     * True if a &lt;script&gt;&lt;/script&gt; or &lt;style&gt;&lt;/style&gt; block is read. CDATA sections within are
     * handled as rawText.
     * Set when the start tag of either element is handled and cleared again on the matching end tag.
     */
    private boolean scriptBlock;

    /** Used to distinguish &lt;a href=""&gt; from &lt;a name=""&gt;. Presumably true for the href (link) case - confirm. */
    private boolean isLink;

    /** Used to distinguish &lt;a name=""&gt; from &lt;a href=""&gt;. Presumably true for the name (anchor) case - confirm. */
    private boolean isAnchor;

    /** Used for nested lists. Decremented whenever an &lt;/ol&gt; end tag is handled. */
    private int orderedListDepth = 0;

    /** Counts section nesting level of the sections manually set in the HTML document */
    private int sectionLevel;

    /** Counts current heading level. This is either the {@link #sectionLevel} if no artificial sections are currently open
     * for headings or a number higher or lower than {@link #sectionLevel} (for all section currently opened/closed for a preceding heading).
     * The heading level only changes when a new heading starts, or a section starts or ends. */
    private int headingLevel;

    /** Verbatim flag, true whenever we are inside a &lt;pre&gt; tag. */
    private boolean inVerbatim;

    /** Used to keep track of closing tags for content events */
    private Stack<String> divStack = new Stack<>();

    /**
     * Used to wrap the definedTerm with its definition, even when one is omitted.
     * Set to true when a &lt;dt&gt; opens a definition list item; reset to false when a &lt;/dd&gt;
     * or a &lt;/dl&gt; closes the item again.
     */
    boolean hasDefinitionListItem = false;

    /**
     * Receives the names of the sink events captured by the {@link EventCapturingSinkProxy} that wraps the
     * target sink; a fresh list is created for every call of {@code parse}.
     */
    private LinkedList<String> capturedSinkEventNames;
136
137    /** {@inheritDoc} */
138    @Override
139    public void parse(Reader source, Sink sink, String reference) throws ParseException {
140        init();
141
142        try {
143            capturedSinkEventNames = new LinkedList<>();
144            Sink capturingSink = EventCapturingSinkProxy.newInstance(sink, capturedSinkEventNames);
145            super.parse(source, capturingSink, reference);
146        } finally {
147            setSecondParsing(false);
148            init();
149        }
150    }
151
    /**
     * {@inheritDoc}
     *
     * <p>This override only delegates to the superclass implementation and adds no configuration of its own.</p>
     *
     * <p>NOTE(review): this method used to be described as adding all XHTML (HTML 5.2) entities to the parser so
     * that they can be recognized and resolved without additional DTD. Nothing of that kind happens here;
     * presumably the superclass takes care of it - confirm.</p>
     */
    @Override
    protected void initXmlParser(XmlPullParser parser) throws XmlPullParserException {
        super.initXmlParser(parser);
    }
162
163    /**
164     * <p>
165     *   Goes through a common list of possible html5 start tags. These include only tags that can go into
166     *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
167     * </p>
168     * <p>
169     *   The currently handled tags are:
170     * </p>
171     * <p>
172     *   <code>
173     *      &lt;article&gt;, &lt;nav&gt;, &lt;aside&gt;, &lt;section&gt;, &lt;h1&gt;, &lt;h2&gt;, &lt;h3&gt;,
174     *      &lt;h4&gt;, &lt;h5&gt;, &lt;header&gt;, &lt;main&gt;, &lt;footer&gt;, &lt;em&gt;, &lt;strong&gt;,
175     *      &lt;small&gt;, &lt;s&gt;, &lt;cite&gt;, &lt;q&gt;, &lt;dfn&gt;, &lt;abbr&gt;, &lt;i&gt;,
176     *      &lt;b&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;kbd&gt;, &lt;sub&gt;, &lt;sup&gt;, &lt;u&gt;,
177     *      &lt;mark&gt;, &lt;ruby&gt;, &lt;rb&gt;, &lt;rt&gt;, &lt;rtc&gt;, &lt;rp&gt;, &lt;bdi&gt;,
178     *      &lt;bdo&gt;, &lt;span&gt;, &lt;ins&gt;, &lt;del&gt;, &lt;p&gt;, &lt;pre&gt;, &lt;ul&gt;,
179     *      &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;a&gt;, &lt;table&gt;,
180     *      &lt;tr&gt;, &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;wbr/&gt;, &lt;hr/&gt;,
181     *      &lt;img/&gt;.
182     *   </code>
183     * </p>
184     *
185     * @param parser A parser.
186     * @param sink the sink to receive the events.
187     * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
188     */
189    protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
190        SinkEventAttributeSet attribs = getAttributesFromParser(parser);
191        return baseStartTag(parser.getName(), attribs, sink);
192    }
193
194    protected boolean baseStartTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
195        boolean visited = true;
196
197        if (elementName.equals(HtmlMarkup.ARTICLE.toString())) {
198            sink.article(attribs);
199        } else if (elementName.equals(HtmlMarkup.NAV.toString())) {
200            sink.navigation(attribs);
201        } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) {
202            sink.sidebar(attribs);
203        } else if (elementName.equals(HtmlMarkup.SECTION.toString())) {
204            handleSectionStart(sink, attribs);
205        } else if (elementName.equals(HtmlMarkup.H1.toString())) {
206            handleHeadingStart(sink, Sink.SECTION_LEVEL_1, attribs);
207        } else if (elementName.equals(HtmlMarkup.H2.toString())) {
208            handleHeadingStart(sink, Sink.SECTION_LEVEL_2, attribs);
209        } else if (elementName.equals(HtmlMarkup.H3.toString())) {
210            handleHeadingStart(sink, Sink.SECTION_LEVEL_3, attribs);
211        } else if (elementName.equals(HtmlMarkup.H4.toString())) {
212            handleHeadingStart(sink, Sink.SECTION_LEVEL_4, attribs);
213        } else if (elementName.equals(HtmlMarkup.H5.toString())) {
214            handleHeadingStart(sink, Sink.SECTION_LEVEL_5, attribs);
215        } else if (elementName.equals(HtmlMarkup.H6.toString())) {
216            handleHeadingStart(sink, Sink.SECTION_LEVEL_6, attribs);
217        } else if (elementName.equals(HtmlMarkup.HEADER.toString())) {
218            sink.header(attribs);
219        } else if (elementName.equals(HtmlMarkup.MAIN.toString())) {
220            sink.content(attribs);
221        } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) {
222            sink.footer(attribs);
223        } else if (elementName.equals(HtmlMarkup.EM.toString())) {
224            attribs.addAttributes(SinkEventAttributeSet.Semantics.EMPHASIS);
225            sink.inline(attribs);
226        } else if (elementName.equals(HtmlMarkup.STRONG.toString())) {
227            attribs.addAttributes(SinkEventAttributeSet.Semantics.STRONG);
228            sink.inline(attribs);
229        } else if (elementName.equals(HtmlMarkup.SMALL.toString())) {
230            attribs.addAttributes(SinkEventAttributeSet.Semantics.SMALL);
231            sink.inline(attribs);
232        } else if (elementName.equals(HtmlMarkup.S.toString())) {
233            attribs.addAttributes(SinkEventAttributeSet.Semantics.LINE_THROUGH);
234            sink.inline(attribs);
235            /* deprecated line-through support */
236        } else if (elementName.equals(HtmlMarkup.CITE.toString())) {
237            attribs.addAttributes(SinkEventAttributeSet.Semantics.CITATION);
238            sink.inline(attribs);
239        } else if (elementName.equals(HtmlMarkup.Q.toString())) {
240            attribs.addAttributes(SinkEventAttributeSet.Semantics.QUOTE);
241            sink.inline(attribs);
242        } else if (elementName.equals(HtmlMarkup.DFN.toString())) {
243            attribs.addAttributes(SinkEventAttributeSet.Semantics.DEFINITION);
244            sink.inline(attribs);
245        } else if (elementName.equals(HtmlMarkup.ABBR.toString())) {
246            attribs.addAttributes(SinkEventAttributeSet.Semantics.ABBREVIATION);
247            sink.inline(attribs);
248        } else if (elementName.equals(HtmlMarkup.I.toString())) {
249            attribs.addAttributes(SinkEventAttributeSet.Semantics.ITALIC);
250            sink.inline(attribs);
251        } else if (elementName.equals(HtmlMarkup.B.toString())) {
252            attribs.addAttributes(SinkEventAttributeSet.Semantics.BOLD);
253            sink.inline(attribs);
254        } else if (elementName.equals(HtmlMarkup.CODE.toString())) {
255            attribs.addAttributes(SinkEventAttributeSet.Semantics.CODE);
256            sink.inline(attribs);
257        } else if (elementName.equals(HtmlMarkup.VAR.toString())) {
258            attribs.addAttributes(SinkEventAttributeSet.Semantics.VARIABLE);
259            sink.inline(attribs);
260        } else if (elementName.equals(HtmlMarkup.SAMP.toString())) {
261            attribs.addAttributes(SinkEventAttributeSet.Semantics.SAMPLE);
262            sink.inline(attribs);
263        } else if (elementName.equals(HtmlMarkup.KBD.toString())) {
264            attribs.addAttributes(SinkEventAttributeSet.Semantics.KEYBOARD);
265            sink.inline(attribs);
266        } else if (elementName.equals(HtmlMarkup.SUP.toString())) {
267            attribs.addAttributes(SinkEventAttributeSet.Semantics.SUPERSCRIPT);
268            sink.inline(attribs);
269        } else if (elementName.equals(HtmlMarkup.SUB.toString())) {
270            attribs.addAttributes(SinkEventAttributeSet.Semantics.SUBSCRIPT);
271            sink.inline(attribs);
272        } else if (elementName.equals(HtmlMarkup.U.toString())) {
273            attribs.addAttributes(SinkEventAttributeSet.Semantics.ANNOTATION);
274            sink.inline(attribs);
275        } else if (elementName.equals(HtmlMarkup.MARK.toString())) {
276            attribs.addAttributes(SinkEventAttributeSet.Semantics.HIGHLIGHT);
277            sink.inline(attribs);
278        } else if (elementName.equals(HtmlMarkup.RUBY.toString())) {
279            attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY);
280            sink.inline(attribs);
281        } else if (elementName.equals(HtmlMarkup.RB.toString())) {
282            attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_BASE);
283            sink.inline(attribs);
284        } else if (elementName.equals(HtmlMarkup.RT.toString())) {
285            attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT);
286            sink.inline(attribs);
287        } else if (elementName.equals(HtmlMarkup.RTC.toString())) {
288            attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER);
289            sink.inline(attribs);
290        } else if (elementName.equals(HtmlMarkup.RP.toString())) {
291            attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_PARANTHESES);
292            sink.inline(attribs);
293        } else if (elementName.equals(HtmlMarkup.BDI.toString())) {
294            attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION);
295            sink.inline(attribs);
296        } else if (elementName.equals(HtmlMarkup.BDO.toString())) {
297            attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE);
298            sink.inline(attribs);
299        } else if (elementName.equals(HtmlMarkup.SPAN.toString())) {
300            attribs.addAttributes(SinkEventAttributeSet.Semantics.PHRASE);
301            sink.inline(attribs);
302        } else if (elementName.equals(HtmlMarkup.INS.toString())) {
303            attribs.addAttributes(SinkEventAttributeSet.Semantics.INSERT);
304            sink.inline(attribs);
305        } else if (elementName.equals(HtmlMarkup.DEL.toString())) {
306            attribs.addAttributes(SinkEventAttributeSet.Semantics.DELETE);
307            sink.inline(attribs);
308        } else if (elementName.equals(HtmlMarkup.P.toString())) {
309            handlePStart(sink, attribs);
310        } else if (elementName.equals(HtmlMarkup.DIV.toString())) {
311            handleDivStart(attribs, sink);
312        } else if (elementName.equals(HtmlMarkup.PRE.toString())) {
313            handlePreStart(attribs, sink);
314        } else if (elementName.equals(HtmlMarkup.UL.toString())) {
315            sink.list(attribs);
316        } else if (elementName.equals(HtmlMarkup.OL.toString())) {
317            handleOLStart(sink, attribs);
318        } else if (elementName.equals(HtmlMarkup.LI.toString())) {
319            handleLIStart(sink, attribs);
320        } else if (elementName.equals(HtmlMarkup.DL.toString())) {
321            sink.definitionList(attribs);
322        } else if (elementName.equals(HtmlMarkup.DT.toString())) {
323            if (hasDefinitionListItem) {
324                // close previous listItem
325                sink.definitionListItem_();
326            }
327            sink.definitionListItem(attribs);
328            hasDefinitionListItem = true;
329            sink.definedTerm(attribs);
330        } else if (elementName.equals(HtmlMarkup.DD.toString())) {
331            if (!hasDefinitionListItem) {
332                sink.definitionListItem(attribs);
333            }
334            sink.definition(attribs);
335        } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) {
336            sink.figure(attribs);
337        } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) {
338            sink.figureCaption(attribs);
339        } else if (elementName.equals(HtmlMarkup.A.toString())) {
340            handleAStart(sink, attribs);
341        } else if (elementName.equals(HtmlMarkup.TABLE.toString())) {
342            handleTableStart(sink, attribs);
343        } else if (elementName.equals(HtmlMarkup.TR.toString())) {
344            sink.tableRow(attribs);
345        } else if (elementName.equals(HtmlMarkup.TH.toString())) {
346            sink.tableHeaderCell(attribs);
347        } else if (elementName.equals(HtmlMarkup.TD.toString())) {
348            sink.tableCell(attribs);
349        } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) {
350            sink.tableCaption(attribs);
351        } else if (elementName.equals(HtmlMarkup.BR.toString())) {
352            sink.lineBreak(attribs);
353        } else if (elementName.equals(HtmlMarkup.WBR.toString())) {
354            sink.lineBreakOpportunity(attribs);
355        } else if (elementName.equals(HtmlMarkup.HR.toString())) {
356            sink.horizontalRule(attribs);
357        } else if (elementName.equals(HtmlMarkup.IMG.toString())) {
358            handleImgStart(sink, attribs);
359        } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) {
360            sink.blockquote(attribs);
361        } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
362            handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
363        } else if (UNMATCHED_XHTML5_SIMPLE_ELEMENTS.contains(elementName)) {
364            handleUnknown(elementName, attribs, sink, TAG_TYPE_SIMPLE);
365        } else if (elementName.equals(HtmlMarkup.SCRIPT.toString())
366                || elementName.equals(HtmlMarkup.STYLE.toString())) {
367            handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
368            scriptBlock = true;
369        } else {
370            visited = false;
371        }
372
373        return visited;
374    }
375
376    /**
377     * <p>
378     *   Goes through a common list of possible html end tags.
379     *   These should be re-usable by different xhtml-based parsers.
380     *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
381     *   except for the empty elements ({@code <br/>, <hr/>, <img/>}).
382     * </p>
383     *
384     * @param parser A parser.
385     * @param sink the sink to receive the events.
386     * @return True if the event has been handled by this method, false otherwise.
387     */
388    protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
389        SinkEventAttributeSet attribs = getAttributesFromParser(parser);
390        return baseEndTag(parser.getName(), attribs, sink);
391    }
392
393    protected boolean baseEndTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
394        boolean visited = true;
395
396        if (elementName.equals(HtmlMarkup.P.toString())) {
397            sink.paragraph_();
398        } else if (elementName.equals(HtmlMarkup.DIV.toString())) {
399            handleDivEnd(sink);
400        } else if (elementName.equals(HtmlMarkup.PRE.toString())) {
401            verbatim_();
402
403            sink.verbatim_();
404        } else if (elementName.equals(HtmlMarkup.UL.toString())) {
405            sink.list_();
406        } else if (elementName.equals(HtmlMarkup.OL.toString())) {
407            sink.numberedList_();
408            orderedListDepth--;
409        } else if (elementName.equals(HtmlMarkup.LI.toString())) {
410            handleListItemEnd(sink);
411        } else if (elementName.equals(HtmlMarkup.DL.toString())) {
412            if (hasDefinitionListItem) {
413                sink.definitionListItem_();
414                hasDefinitionListItem = false;
415            }
416            sink.definitionList_();
417        } else if (elementName.equals(HtmlMarkup.DT.toString())) {
418            sink.definedTerm_();
419        } else if (elementName.equals(HtmlMarkup.DD.toString())) {
420            sink.definition_();
421            sink.definitionListItem_();
422            hasDefinitionListItem = false;
423        } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) {
424            sink.figure_();
425        } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) {
426            sink.figureCaption_();
427        } else if (elementName.equals(HtmlMarkup.A.toString())) {
428            handleAEnd(sink);
429        } else if (elementName.equals(HtmlMarkup.EM.toString())) {
430            sink.inline_();
431        } else if (elementName.equals(HtmlMarkup.STRONG.toString())) {
432            sink.inline_();
433        } else if (elementName.equals(HtmlMarkup.SMALL.toString())) {
434            sink.inline_();
435        } else if (elementName.equals(HtmlMarkup.S.toString())) {
436            sink.inline_();
437        } else if (elementName.equals(HtmlMarkup.CITE.toString())) {
438            sink.inline_();
439        } else if (elementName.equals(HtmlMarkup.Q.toString())) {
440            sink.inline_();
441        } else if (elementName.equals(HtmlMarkup.DFN.toString())) {
442            sink.inline_();
443        } else if (elementName.equals(HtmlMarkup.ABBR.toString())) {
444            sink.inline_();
445        } else if (elementName.equals(HtmlMarkup.I.toString())) {
446            sink.inline_();
447        } else if (elementName.equals(HtmlMarkup.B.toString())) {
448            sink.inline_();
449        } else if (elementName.equals(HtmlMarkup.CODE.toString())) {
450            sink.inline_();
451        } else if (elementName.equals(HtmlMarkup.VAR.toString())) {
452            sink.inline_();
453        } else if (elementName.equals(HtmlMarkup.SAMP.toString())) {
454            sink.inline_();
455        } else if (elementName.equals(HtmlMarkup.KBD.toString())) {
456            sink.inline_();
457        } else if (elementName.equals(HtmlMarkup.SUP.toString())) {
458            sink.inline_();
459        } else if (elementName.equals(HtmlMarkup.SUB.toString())) {
460            sink.inline_();
461        } else if (elementName.equals(HtmlMarkup.U.toString())) {
462            sink.inline_();
463        } else if (elementName.equals(HtmlMarkup.MARK.toString())) {
464            sink.inline_();
465        } else if (elementName.equals(HtmlMarkup.RUBY.toString())) {
466            sink.inline_();
467        } else if (elementName.equals(HtmlMarkup.RB.toString())) {
468            sink.inline_();
469        } else if (elementName.equals(HtmlMarkup.RT.toString())) {
470            sink.inline_();
471        } else if (elementName.equals(HtmlMarkup.RTC.toString())) {
472            sink.inline_();
473        } else if (elementName.equals(HtmlMarkup.RP.toString())) {
474            sink.inline_();
475        } else if (elementName.equals(HtmlMarkup.BDI.toString())) {
476            sink.inline_();
477        } else if (elementName.equals(HtmlMarkup.BDO.toString())) {
478            sink.inline_();
479        } else if (elementName.equals(HtmlMarkup.SPAN.toString())) {
480            sink.inline_();
481        } else if (elementName.equals(HtmlMarkup.INS.toString())) {
482            sink.inline_();
483        } else if (elementName.equals(HtmlMarkup.DEL.toString())) {
484            sink.inline_();
485        }
486
487        // ----------------------------------------------------------------------
488        // Tables
489        // ----------------------------------------------------------------------
490
491        else if (elementName.equals(HtmlMarkup.TABLE.toString())) {
492            sink.tableRows_();
493            sink.table_();
494        } else if (elementName.equals(HtmlMarkup.TR.toString())) {
495            sink.tableRow_();
496        } else if (elementName.equals(HtmlMarkup.TH.toString())) {
497            sink.tableHeaderCell_();
498        } else if (elementName.equals(HtmlMarkup.TD.toString())) {
499            sink.tableCell_();
500        } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) {
501            sink.tableCaption_();
502        } else if (elementName.equals(HtmlMarkup.ARTICLE.toString())) {
503            sink.article_();
504        } else if (elementName.equals(HtmlMarkup.NAV.toString())) {
505            sink.navigation_();
506        } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) {
507            sink.sidebar_();
508        } else if (elementName.equals(HtmlMarkup.SECTION.toString())) {
509            handleSectionEnd(sink);
510        } else if (elementName.equals(HtmlMarkup.H1.toString())) {
511            sink.sectionTitle1_();
512        } else if (elementName.equals(HtmlMarkup.H2.toString())) {
513            sink.sectionTitle2_();
514        } else if (elementName.equals(HtmlMarkup.H3.toString())) {
515            sink.sectionTitle3_();
516        } else if (elementName.equals(HtmlMarkup.H4.toString())) {
517            sink.sectionTitle4_();
518        } else if (elementName.equals(HtmlMarkup.H5.toString())) {
519            sink.sectionTitle5_();
520        } else if (elementName.equals(HtmlMarkup.H6.toString())) {
521            sink.sectionTitle6_();
522        } else if (elementName.equals(HtmlMarkup.HEADER.toString())) {
523            sink.header_();
524        } else if (elementName.equals(HtmlMarkup.MAIN.toString())) {
525            sink.content_();
526        } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) {
527            sink.footer_();
528        } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) {
529            sink.blockquote_();
530        } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
531            handleUnknown(elementName, attribs, sink, TAG_TYPE_END);
532        } else if (elementName.equals(HtmlMarkup.SCRIPT.toString())
533                || elementName.equals(HtmlMarkup.STYLE.toString())) {
534            handleUnknown(elementName, attribs, sink, TAG_TYPE_END);
535
536            scriptBlock = false;
537        } else {
538            visited = false;
539        }
540
541        return visited;
542    }
543
544    /**
545     * {@inheritDoc}
546     *
547     * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
548     * overridden by implementing parsers to include additional tags.
549     */
550    protected void handleStartTag(XmlPullParser parser, Sink sink)
551            throws XmlPullParserException, MacroExecutionException {
552        if (!baseStartTag(parser, sink)) {
553            LOGGER.warn(
554                    "Unrecognized xml tag <{}> at [{}:{}]",
555                    parser.getName(),
556                    parser.getLineNumber(),
557                    parser.getColumnNumber());
558        }
559    }
560
561    /**
562     * {@inheritDoc}
563     *
564     * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
565     * overridden by implementing parsers to include additional tags.
566     */
567    protected void handleEndTag(XmlPullParser parser, Sink sink)
568            throws XmlPullParserException, MacroExecutionException {
569        if (!baseEndTag(parser, sink)) {
570            // unrecognized tag is already logged in StartTag
571        }
572    }
573
574    /** {@inheritDoc} */
575    @Override
576    protected void handleText(XmlPullParser parser, Sink sink) throws XmlPullParserException {
577        String text = getText(parser);
578
579        /*
580         * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
581         * parser so any whitespace that makes it here is significant.
582         *
583         * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
584         */
585        if ((text != null && !text.isEmpty()) && !isScriptBlock()) {
586            sink.text(text);
587        }
588    }
589
590    /** {@inheritDoc} */
591    @Override
592    protected void handleComment(XmlPullParser parser, Sink sink) throws XmlPullParserException {
593        String text = getText(parser);
594
595        if ("PB".equals(text.trim())) {
596            sink.pageBreak();
597        } else {
598            if (isEmitComments()) {
599                sink.comment(text);
600            }
601        }
602    }
603
604    /** {@inheritDoc} */
605    @Override
606    protected void handleCdsect(XmlPullParser parser, Sink sink) throws XmlPullParserException {
607        String text = getText(parser);
608
609        if (isScriptBlock()) {
610            sink.unknown(CDATA, new Object[] {CDATA_TYPE, text}, null);
611        } else {
612            sink.text(text);
613        }
614    }
615
616    /**
617     * Shortcut for {@link #emitHeadingSections(int, Sink, boolean)} with last argument being {@code true}
618     * @param newLevel
619     * @param sink
620     * @param attribs
621     * @deprecated Use {@link #emitHeadingSections(int, Sink, boolean)} instead.
622     */
623    @Deprecated
624    protected void consecutiveSections(int newLevel, Sink sink, SinkEventAttributeSet attribs) {
625        emitHeadingSections(newLevel, sink, true);
626    }
627
628    /**
629     * Make sure sections are nested consecutively and correctly inserted for the given heading level
630     *
631     * <p>
632     * HTML5 heading tags H1 to H5 imply same level sections in Sink API (compare with {@link Sink#sectionTitle(int, SinkEventAttributes)}).
633     * However (X)HTML5 allows headings without explicit surrounding section elements and is also
634     * less strict with non-consecutive heading levels.
635     * This methods both closes open sections which have been added for previous headings and/or opens
636     * sections necessary for the new heading level.
637     * At least one section needs to be opened directly prior the heading due to Sink API restrictions.
638     * </p>
639     *
640     * <p>
641     * For instance, if the following sequence is parsed:
642     * </p>
643     * <pre>
644     * &lt;h2&gt;&lt;/h2&gt;
645     * &lt;h5&gt;&lt;/h5&gt;
646     * </pre>
647     * <p>
648     * we have to insert two section starts before we open the <code>&lt;h5&gt;</code>.
649     * In the following sequence
650     * </p>
651     * <pre>
652     * &lt;h5&gt;&lt;/h5&gt;
653     * &lt;h2&gt;&lt;/h2&gt;
654     * </pre>
655     * <p>
656     * we have to close two sections before we open the <code>&lt;h2&gt;</code>.
657     * </p>
658     *
659     * <p>The current heading level is set to newLevel afterwards.</p>
660     *
661     * @param newLevel the new section level, all upper levels have to be closed.
662     * @param sink the sink to receive the events.
663     * @param enforceNewSection whether to enforce a new section or not
664     */
665    protected void emitHeadingSections(int newLevel, Sink sink, boolean enforceNewSection) {
666        int lowerBoundSectionLevel = newLevel;
667        if (enforceNewSection) {
668            // close one more if either last event was not section start or the new level is lower than the current one
669            // (in this case the last event may be a section start event but for another level)
670            if (!isLastEventSectionStart() || newLevel < this.headingLevel) {
671                lowerBoundSectionLevel--;
672            }
673        }
674        closeOpenHeadingSections(lowerBoundSectionLevel, sink);
675        openMissingHeadingSections(newLevel, sink);
676
677        this.headingLevel = newLevel;
678    }
679
680    private boolean isLastEventSectionStart() {
681        String lastEventName = capturedSinkEventNames.pollLast();
682        if (lastEventName == null) {
683            return false;
684        }
685        return lastEventName.startsWith("section")
686                && !lastEventName.endsWith("_")
687                && !lastEventName.startsWith("sectionTitle");
688    }
689
690    /**
691     * Close open heading sections.
692     *
693     * @param newLevel the new section level, all upper levels have to be closed.
694     * @param sink the sink to receive the events.
695     */
696    private void closeOpenHeadingSections(int newLevel, Sink sink) {
697        while (this.headingLevel > newLevel) {
698            if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
699                sink.section_(headingLevel);
700            }
701
702            this.headingLevel--;
703        }
704        // enforce the previous element is a section
705    }
706
707    /**
708     * Open missing heading sections.
709     *
710     * @param newLevel the new section level, all lower levels have to be opened.
711     * @param sink the sink to receive the events.
712     */
713    private void openMissingHeadingSections(int newLevel, Sink sink) {
714        while (this.headingLevel < newLevel) {
715            this.headingLevel++;
716
717            if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
718                sink.section(headingLevel, null);
719            }
720        }
721    }
722
723    /**
724     * Return the current section level.
725     *
726     * @return the current section level.
727     */
728    protected int getSectionLevel() {
729        return this.headingLevel;
730    }
731
732    /**
733     * Set the current section level.
734     *
735     * @param newLevel the new section level.
736     */
737    protected void setSectionLevel(int newLevel) {
738        this.headingLevel = newLevel;
739    }
740
741    /**
742     * Stop verbatim mode.
743     */
744    protected void verbatim_() {
745        this.inVerbatim = false;
746    }
747
748    /**
749     * Start verbatim mode.
750     */
751    protected void verbatim() {
752        this.inVerbatim = true;
753    }
754
755    /**
756     * Checks if we are currently inside a &lt;pre&gt; tag.
757     *
758     * @return true if we are currently in verbatim mode.
759     */
760    protected boolean isVerbatim() {
761        return this.inVerbatim;
762    }
763
764    /**
765     * Checks if we are currently inside a &lt;script&gt; tag.
766     *
767     * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
768     * @since 1.1.1.
769     */
770    protected boolean isScriptBlock() {
771        return this.scriptBlock;
772    }
773
774    /**
775     * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
776     *
777     * @param id The id to validate.
778     * @return A transformed id or the original id if it was already valid.
779     * @see DoxiaUtils#encodeId(String)
780     */
781    protected String validAnchor(String id) {
782        if (!DoxiaUtils.isValidId(id)) {
783            String linkAnchor = DoxiaUtils.encodeId(id);
784
785            LOGGER.debug("Modified invalid link '{}' to '{}'", id, linkAnchor);
786
787            return linkAnchor;
788        }
789
790        return id;
791    }
792
793    /** {@inheritDoc} */
794    @Override
795    protected void init() {
796        super.init();
797
798        this.scriptBlock = false;
799        this.isLink = false;
800        this.isAnchor = false;
801        this.orderedListDepth = 0;
802        this.headingLevel = 0;
803        this.inVerbatim = false;
804    }
805
806    private void handleAEnd(Sink sink) {
807        if (isLink) {
808            sink.link_();
809            isLink = false;
810        } else if (isAnchor) {
811            sink.anchor_();
812            isAnchor = false;
813        }
814    }
815
816    private void handleAStart(Sink sink, SinkEventAttributeSet attribs) {
817        String href = (String) attribs.getAttribute(Attribute.HREF.toString());
818
819        if (href != null) {
820            int hashIndex = href.indexOf('#');
821            if (hashIndex != -1 && !DoxiaUtils.isExternalLink(href)) {
822                String hash = href.substring(hashIndex + 1);
823
824                if (!DoxiaUtils.isValidId(hash)) {
825                    href = href.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash);
826
827                    LOGGER.debug("Modified invalid link '{}' to '{}'", hash, href);
828                }
829            }
830            sink.link(href, attribs);
831            isLink = true;
832        } else {
833            String id = (String) attribs.getAttribute(Attribute.ID.toString());
834            if (id != null) {
835                sink.anchor(validAnchor(id), attribs);
836                isAnchor = true;
837            }
838        }
839    }
840
841    private boolean handleDivStart(SinkEventAttributeSet attribs, Sink sink) {
842        String divClass = (String) attribs.getAttribute(Attribute.CLASS.toString());
843
844        this.divStack.push(divClass);
845
846        if ("content".equals(divClass)) {
847            SinkEventAttributeSet atts = new SinkEventAttributeSet(attribs);
848            atts.removeAttribute(SinkEventAttributes.CLASS);
849            sink.content(atts);
850        }
851        if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
852            return false;
853        } else {
854            sink.division(attribs);
855        }
856
857        return true;
858    }
859
860    private boolean handleDivEnd(Sink sink) {
861        String divClass = divStack.pop();
862
863        if ("content".equals(divClass)) {
864            sink.content_();
865        }
866        if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
867            return false;
868        } else {
869            sink.division_();
870        }
871
872        return true;
873    }
874
875    private void handleImgStart(Sink sink, SinkEventAttributeSet attribs) {
876        String src = (String) attribs.getAttribute(Attribute.SRC.toString());
877
878        if (src != null) {
879            sink.figureGraphics(src, attribs);
880        }
881    }
882
883    private void handleLIStart(Sink sink, SinkEventAttributeSet attribs) {
884        if (orderedListDepth == 0) {
885            sink.listItem(attribs);
886        } else {
887            sink.numberedListItem(attribs);
888        }
889    }
890
891    private void handleListItemEnd(Sink sink) {
892        if (orderedListDepth == 0) {
893            sink.listItem_();
894        } else {
895            sink.numberedListItem_();
896        }
897    }
898
899    private void handleOLStart(Sink sink, SinkEventAttributeSet attribs) {
900        int numbering = Sink.NUMBERING_DECIMAL;
901        // this will have to be generalized if we handle styles
902        String style = (String) attribs.getAttribute(Attribute.STYLE.toString());
903
904        if (style != null) {
905            switch (style) {
906                case "list-style-type: upper-alpha;":
907                    numbering = Sink.NUMBERING_UPPER_ALPHA;
908                    break;
909                case "list-style-type: lower-alpha;":
910                    numbering = Sink.NUMBERING_LOWER_ALPHA;
911                    break;
912                case "list-style-type: upper-roman;":
913                    numbering = Sink.NUMBERING_UPPER_ROMAN;
914                    break;
915                case "list-style-type: lower-roman;":
916                    numbering = Sink.NUMBERING_LOWER_ROMAN;
917                    break;
918                case "list-style-type: decimal;":
919                    numbering = Sink.NUMBERING_DECIMAL;
920                    break;
921                default:
922                    // ignore all other
923            }
924        }
925
926        sink.numberedList(numbering, attribs);
927        orderedListDepth++;
928    }
929
930    private void handlePStart(Sink sink, SinkEventAttributeSet attribs) {
931        sink.paragraph(attribs);
932    }
933
934    /*
935     * The PRE element tells visual user agents that the enclosed text is
936     * "preformatted". When handling preformatted text, visual user agents:
937     * - May leave white space intact.
938     * - May render text with a fixed-pitch font.
939     * - May disable automatic word wrap.
940     * - Must not disable bidirectional processing.
941     * Non-visual user agents are not required to respect extra white space
942     * in the content of a PRE element.
943     */
944    private void handlePreStart(SinkEventAttributeSet attribs, Sink sink) {
945        verbatim();
946        sink.verbatim(attribs);
947    }
948
949    private void handleSectionStart(Sink sink, SinkEventAttributeSet attribs) {
950        emitHeadingSections(sectionLevel, sink, false);
951        sink.section(++sectionLevel, attribs);
952        this.headingLevel = sectionLevel;
953    }
954
955    private void handleHeadingStart(Sink sink, int level, SinkEventAttributeSet attribs) {
956        emitHeadingSections(level, sink, true);
957        sink.sectionTitle(level, attribs);
958    }
959
960    private void handleSectionEnd(Sink sink) {
961        emitHeadingSections(sectionLevel, sink, false);
962        sink.section_(sectionLevel--);
963        this.headingLevel = sectionLevel;
964    }
965
966    private void handleTableStart(Sink sink, SinkEventAttributeSet attribs) {
967        sink.table(attribs);
968        String givenTableClass = (String) attribs.getAttribute(Attribute.CLASS.toString());
969        boolean grid = false;
970        if (givenTableClass != null
971                && BODYTABLEBORDER_CLASS_PATTERN.matcher(givenTableClass).matches()) {
972            grid = true;
973        }
974
975        sink.tableRows(null, grid);
976    }
977}