001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.maven.doxia.parser;
020
021import javax.swing.text.html.HTML.Attribute;
022
023import java.io.Reader;
024import java.text.CharacterIterator;
025import java.text.StringCharacterIterator;
026import java.util.HashSet;
027import java.util.LinkedList;
028import java.util.Set;
029import java.util.Stack;
030import java.util.regex.Pattern;
031
032import org.apache.maven.doxia.macro.MacroExecutionException;
033import org.apache.maven.doxia.markup.HtmlMarkup;
034import org.apache.maven.doxia.sink.Sink;
035import org.apache.maven.doxia.sink.SinkEventAttributes;
036import org.apache.maven.doxia.sink.impl.EventCapturingSinkProxy;
037import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
038import org.apache.maven.doxia.util.DoxiaUtils;
039import org.codehaus.plexus.util.xml.pull.XmlPullParser;
040import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
041import org.slf4j.Logger;
042import org.slf4j.LoggerFactory;
043
044/**
045 * Common base parser for XHTML5 (now <a href="https://html.spec.whatwg.org/multipage/#toc-the-xhtml-syntax">HTML Living standard, XML syntax</a>) elements and attributes.
046 *
047 * @see <a href="https://html.spec.whatwg.org/multipage/introduction.html#history-2">HTML Living standard, history</a>
048 */
049public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup {
050    private static final Logger LOGGER = LoggerFactory.getLogger(Xhtml5BaseParser.class);
051
052    /** Used to identify if a class string contains `bodyTableBorder` */
053    private static final Pattern BODYTABLEBORDER_CLASS_PATTERN =
054            Pattern.compile("(?:.*\\s|^)bodyTableBorder(?:\\s.*|$)");
055
056    private static final Set<String> UNMATCHED_XHTML5_ELEMENTS = new HashSet<>();
057    private static final Set<String> UNMATCHED_XHTML5_SIMPLE_ELEMENTS = new HashSet<>();
058
059    static {
060        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.AREA.toString());
061        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.AUDIO.toString());
062        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.BUTTON.toString());
063        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.CANVAS.toString());
064        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.COL.toString());
065        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.COLGROUP.toString());
066        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.COMMAND.toString());
067        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DATA.toString());
068        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DATALIST.toString());
069        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DETAILS.toString());
070        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DIALOG.toString());
071        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.EMBED.toString());
072        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.FIELDSET.toString());
073        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.FORM.toString());
074        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.HGROUP.toString());
075        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.IFRAME.toString());
076        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.INPUT.toString());
077        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.KEYGEN.toString());
078        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.LABEL.toString());
079        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.LEGEND.toString());
080        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.MAP.toString());
081        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.MENU.toString());
082        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.MENUITEM.toString());
083        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.METER.toString());
084        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.NOSCRIPT.toString());
085        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OBJECT.toString());
086        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OPTGROUP.toString());
087        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OPTION.toString());
088        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OUTPUT.toString());
089        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.PARAM.toString());
090        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.PICTURE.toString());
091        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.PROGRESS.toString());
092        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SELECT.toString());
093        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.SOURCE.toString());
094        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SUMMARY.toString());
095        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SVG.toString());
096        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TEMPLATE.toString());
097        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TEXTAREA.toString());
098        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TBODY.toString());
099        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.THEAD.toString());
100        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TFOOT.toString());
101        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TIME.toString());
102        UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.TRACK.toString());
103        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.VAR.toString());
104        UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.VIDEO.toString());
105    }
106
    /**
     * Set to true when a &lt;script&gt; or &lt;style&gt; start tag is handled and back to false on the
     * matching end tag. CDATA sections read inside a script block are emitted as unknown CDATA events
     * instead of text (see {@code handleCdsect}; the check goes through {@code isScriptBlock()},
     * presumably backed by this flag).
     */
    private boolean scriptBlock;

    /** Used to distinguish &lt;a href=""&gt; from &lt;a name=""&gt;. */
    private boolean isLink;

    /**
     * If true, the next text event is at the beginning of a line inside a block element, i.e. after a block tag
     * or a line break/end block tag. Inline elements, comments, CDATA sections and emitted text reset it to false.
     */
    protected boolean isBeginningOfLineInsideBlock = true;

    /** Used to distinguish &lt;a href=""&gt; from &lt;a name=""&gt;. */
    private boolean isAnchor;

    /** Used for nested lists. Decremented whenever an &lt;ol&gt; element ends. */
    private int orderedListDepth = 0;

    /** Counts section nesting level of the sections manually set in the HTML document */
    private int sectionLevel;

    /** Counts current heading level. This is either the {@link #sectionLevel} if no artificial sections are currently open
     * for headings or a number higher or lower than {@link #sectionLevel} (for all section currently opened/closed for a preceding heading).
     * The heading level only changes when a new heading starts, or a section starts or ends. */
    private int headingLevel;

    /** Verbatim flag, true whenever we are inside a &lt;pre&gt; tag. No whitespace processing is applied to text while it is set. */
    private boolean inVerbatim;

    /** Used to keep track of closing tags for content events */
    private Stack<String> divStack = new Stack<>();

    /**
     * Used to wrap the definedTerm with its definition, even when one is omitted.
     * Set once a &lt;dt&gt; has opened a definition list item and cleared when that item is closed
     * at the end of a &lt;dd&gt; or &lt;dl&gt;.
     */
    boolean hasDefinitionListItem = false;

    /**
     * List handed to {@code EventCapturingSinkProxy.newInstance} together with the target sink;
     * a fresh list is created for every {@code parse} call.
     */
    private LinkedList<String> capturedSinkEventNames;
143
144    @Override
145    public void parse(Reader source, Sink sink, String reference) throws ParseException {
146        init();
147
148        try {
149            capturedSinkEventNames = new LinkedList<>();
150            Sink capturingSink = EventCapturingSinkProxy.newInstance(sink, capturedSinkEventNames);
151            super.parse(source, capturingSink, reference);
152        } finally {
153            setSecondParsing(false);
154            init();
155        }
156    }
157
    /**
     * {@inheritDoc}
     *
     * <p>
     * This override only delegates to the superclass implementation.
     * NOTE(review): this comment used to state that all XHTML (HTML 5.2) entities are added to the parser
     * so that they can be recognized and resolved without additional DTD; no such registration happens
     * in this method itself - confirm whether the superclass takes care of it.
     * </p>
     */
    @Override
    protected void initXmlParser(XmlPullParser parser) throws XmlPullParserException {
        super.initXmlParser(parser);
    }
168
169    /**
170     * <p>
171     *   Goes through a common list of possible html5 start tags. These include only tags that can go into
172     *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
173     * </p>
174     * <p>
175     *   The currently handled tags are:
176     * </p>
177     * <p>
178     *   <code>
179     *      &lt;article&gt;, &lt;nav&gt;, &lt;aside&gt;, &lt;section&gt;, &lt;h1&gt;, &lt;h2&gt;, &lt;h3&gt;,
180     *      &lt;h4&gt;, &lt;h5&gt;, &lt;header&gt;, &lt;main&gt;, &lt;footer&gt;, &lt;em&gt;, &lt;strong&gt;,
181     *      &lt;small&gt;, &lt;s&gt;, &lt;cite&gt;, &lt;q&gt;, &lt;dfn&gt;, &lt;abbr&gt;, &lt;i&gt;,
182     *      &lt;b&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;kbd&gt;, &lt;sub&gt;, &lt;sup&gt;, &lt;u&gt;,
183     *      &lt;mark&gt;, &lt;ruby&gt;, &lt;rb&gt;, &lt;rt&gt;, &lt;rtc&gt;, &lt;rp&gt;, &lt;bdi&gt;,
184     *      &lt;bdo&gt;, &lt;span&gt;, &lt;ins&gt;, &lt;del&gt;, &lt;p&gt;, &lt;pre&gt;, &lt;ul&gt;,
185     *      &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;a&gt;, &lt;table&gt;,
186     *      &lt;tr&gt;, &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;wbr/&gt;, &lt;hr/&gt;,
187     *      &lt;img/&gt;.
188     *   </code>
189     * </p>
190     *
191     * @param parser A parser.
192     * @param sink the sink to receive the events.
193     * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
194     */
195    protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
196        SinkEventAttributeSet attribs = getAttributesFromParser(parser);
197        return baseStartTag(parser.getName(), attribs, sink);
198    }
199
200    protected boolean baseStartTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
201        boolean visited = true;
202        isBeginningOfLineInsideBlock = true;
203        switch (elementName) {
204            case "article":
205                sink.article(attribs);
206                break;
207            case "nav":
208                sink.navigation(attribs);
209                break;
210            case "aside":
211                sink.sidebar(attribs);
212                break;
213            case "section":
214                handleSectionStart(sink, attribs);
215                break;
216            case "h1":
217                handleHeadingStart(sink, Sink.SECTION_LEVEL_1, attribs);
218                break;
219            case "h2":
220                handleHeadingStart(sink, Sink.SECTION_LEVEL_2, attribs);
221                break;
222            case "h3":
223                handleHeadingStart(sink, Sink.SECTION_LEVEL_3, attribs);
224                break;
225            case "h4":
226                handleHeadingStart(sink, Sink.SECTION_LEVEL_4, attribs);
227                break;
228            case "h5":
229                handleHeadingStart(sink, Sink.SECTION_LEVEL_5, attribs);
230                break;
231            case "h6":
232                handleHeadingStart(sink, Sink.SECTION_LEVEL_6, attribs);
233                break;
234            case "header":
235                sink.header(attribs);
236                break;
237            case "main":
238                sink.content(attribs);
239                break;
240            case "footer":
241                sink.footer(attribs);
242                break;
243            case "em":
244                attribs.addAttributes(SinkEventAttributeSet.Semantics.EMPHASIS);
245                sink.inline(attribs);
246                isBeginningOfLineInsideBlock = false;
247                break;
248            case "strong":
249                attribs.addAttributes(SinkEventAttributeSet.Semantics.STRONG);
250                sink.inline(attribs);
251                isBeginningOfLineInsideBlock = false;
252                break;
253            case "small":
254                attribs.addAttributes(SinkEventAttributeSet.Semantics.SMALL);
255                sink.inline(attribs);
256                isBeginningOfLineInsideBlock = false;
257                break;
258            case "s":
259                /* deprecated line-through support */
260                attribs.addAttributes(SinkEventAttributeSet.Semantics.LINE_THROUGH);
261                sink.inline(attribs);
262                isBeginningOfLineInsideBlock = false;
263                break;
264            case "cite":
265                attribs.addAttributes(SinkEventAttributeSet.Semantics.CITATION);
266                sink.inline(attribs);
267                isBeginningOfLineInsideBlock = false;
268                break;
269            case "q":
270                attribs.addAttributes(SinkEventAttributeSet.Semantics.QUOTE);
271                sink.inline(attribs);
272                break;
273            case "dfn":
274                attribs.addAttributes(SinkEventAttributeSet.Semantics.DEFINITION);
275                sink.inline(attribs);
276                isBeginningOfLineInsideBlock = false;
277                break;
278            case "abbr":
279                attribs.addAttributes(SinkEventAttributeSet.Semantics.ABBREVIATION);
280                sink.inline(attribs);
281                isBeginningOfLineInsideBlock = false;
282                break;
283            case "i":
284                attribs.addAttributes(SinkEventAttributeSet.Semantics.ITALIC);
285                sink.inline(attribs);
286                break;
287            case "b":
288                attribs.addAttributes(SinkEventAttributeSet.Semantics.BOLD);
289                sink.inline(attribs);
290                isBeginningOfLineInsideBlock = false;
291                break;
292            case "code":
293                attribs.addAttributes(SinkEventAttributeSet.Semantics.CODE);
294                sink.inline(attribs);
295                isBeginningOfLineInsideBlock = false;
296                break;
297            case "var":
298                attribs.addAttributes(SinkEventAttributeSet.Semantics.VARIABLE);
299                sink.inline(attribs);
300                isBeginningOfLineInsideBlock = false;
301                break;
302            case "samp":
303                attribs.addAttributes(SinkEventAttributeSet.Semantics.SAMPLE);
304                sink.inline(attribs);
305                isBeginningOfLineInsideBlock = false;
306                break;
307            case "kbd":
308                attribs.addAttributes(SinkEventAttributeSet.Semantics.KEYBOARD);
309                sink.inline(attribs);
310                isBeginningOfLineInsideBlock = false;
311                break;
312            case "sup":
313                attribs.addAttributes(SinkEventAttributeSet.Semantics.SUPERSCRIPT);
314                sink.inline(attribs);
315                isBeginningOfLineInsideBlock = false;
316                break;
317            case "sub":
318                attribs.addAttributes(SinkEventAttributeSet.Semantics.SUBSCRIPT);
319                sink.inline(attribs);
320                isBeginningOfLineInsideBlock = false;
321                break;
322            case "u":
323                attribs.addAttributes(SinkEventAttributeSet.Semantics.ANNOTATION);
324                sink.inline(attribs);
325                isBeginningOfLineInsideBlock = false;
326                break;
327            case "mark":
328                attribs.addAttributes(SinkEventAttributeSet.Semantics.HIGHLIGHT);
329                sink.inline(attribs);
330                break;
331            case "ruby":
332                attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY);
333                sink.inline(attribs);
334                isBeginningOfLineInsideBlock = false;
335                break;
336            case "rb":
337                attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_BASE);
338                sink.inline(attribs);
339                isBeginningOfLineInsideBlock = false;
340                break;
341            case "rt":
342                attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT);
343                sink.inline(attribs);
344                isBeginningOfLineInsideBlock = false;
345                break;
346            case "rtc":
347                attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER);
348                sink.inline(attribs);
349                isBeginningOfLineInsideBlock = false;
350                break;
351            case "rp":
352                attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_PARANTHESES);
353                sink.inline(attribs);
354                isBeginningOfLineInsideBlock = false;
355                break;
356            case "bdi":
357                attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION);
358                sink.inline(attribs);
359                isBeginningOfLineInsideBlock = false;
360                break;
361            case "bdo":
362                attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE);
363                sink.inline(attribs);
364                isBeginningOfLineInsideBlock = false;
365                break;
366            case "span":
367                attribs.addAttributes(SinkEventAttributeSet.Semantics.PHRASE);
368                sink.inline(attribs);
369                isBeginningOfLineInsideBlock = false;
370                break;
371            case "ins":
372                attribs.addAttributes(SinkEventAttributeSet.Semantics.INSERT);
373                sink.inline(attribs);
374                isBeginningOfLineInsideBlock = false;
375                break;
376            case "del":
377                attribs.addAttributes(SinkEventAttributeSet.Semantics.DELETE);
378                sink.inline(attribs);
379                isBeginningOfLineInsideBlock = false;
380                break;
381            case "p":
382                handlePStart(sink, attribs);
383                break;
384            case "div":
385                handleDivStart(attribs, sink);
386                break;
387            case "pre":
388                handlePreStart(attribs, sink);
389                break;
390            case "ul":
391                sink.list(attribs);
392                break;
393            case "ol":
394                handleOLStart(sink, attribs);
395                break;
396            case "li":
397                handleLIStart(sink, attribs);
398                break;
399            case "dl":
400                sink.definitionList(attribs);
401                break;
402            case "dt":
403                if (hasDefinitionListItem) {
404                    // close previous listItem
405                    sink.definitionListItem_();
406                }
407                sink.definitionListItem(attribs);
408                hasDefinitionListItem = true;
409                sink.definedTerm(attribs);
410                break;
411            case "dd":
412                if (!hasDefinitionListItem) {
413                    sink.definitionListItem(attribs);
414                }
415                sink.definition(attribs);
416                break;
417            case "figure":
418                sink.figure(attribs);
419                break;
420            case "figcaption":
421                sink.figureCaption(attribs);
422                break;
423            case "a":
424                isBeginningOfLineInsideBlock = false;
425                handleAStart(sink, attribs);
426                break;
427            case "table":
428                handleTableStart(sink, attribs);
429                break;
430            case "tr":
431                sink.tableRow(attribs);
432                break;
433            case "th":
434                sink.tableHeaderCell(attribs);
435                break;
436            case "td":
437                sink.tableCell(attribs);
438                break;
439            case "caption":
440                sink.tableCaption(attribs);
441                break;
442            case "br":
443                sink.lineBreak(attribs);
444                break;
445            case "wbr":
446                sink.lineBreakOpportunity(attribs);
447                break;
448            case "hr":
449                sink.horizontalRule(attribs);
450                break;
451            case "img":
452                isBeginningOfLineInsideBlock = false;
453                handleImgStart(sink, attribs);
454                break;
455            case "blockquote":
456                sink.blockquote(attribs);
457                break;
458            case "script":
459            case "style":
460                handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
461                scriptBlock = true;
462                break;
463            default:
464                if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
465                    handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
466                } else if (UNMATCHED_XHTML5_SIMPLE_ELEMENTS.contains(elementName)) {
467                    handleUnknown(elementName, attribs, sink, TAG_TYPE_SIMPLE);
468                } else {
469                    visited = false;
470                }
471                break;
472        }
473
474        return visited;
475    }
476
477    /**
478     * <p>
479     *   Goes through a common list of possible html end tags.
480     *   These should be re-usable by different xhtml-based parsers.
481     *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
482     *   except for the empty elements ({@code <br/>, <hr/>, <img/>}).
483     * </p>
484     *
485     * @param parser A parser.
486     * @param sink the sink to receive the events.
487     * @return True if the event has been handled by this method, false otherwise.
488     */
489    protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
490        SinkEventAttributeSet attribs = getAttributesFromParser(parser);
491        return baseEndTag(parser.getName(), attribs, sink);
492    }
493
494    protected boolean baseEndTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
495        boolean visited = true;
496        isBeginningOfLineInsideBlock = true;
497
498        switch (elementName) {
499            case "p":
500                sink.paragraph_();
501                break;
502            case "div":
503                handleDivEnd(sink);
504                break;
505            case "pre":
506                verbatim_();
507                sink.verbatim_();
508                break;
509            case "ul":
510                sink.list_();
511                break;
512            case "ol":
513                sink.numberedList_();
514                orderedListDepth--;
515                break;
516            case "li":
517                handleListItemEnd(sink);
518                break;
519            case "dl":
520                if (hasDefinitionListItem) {
521                    sink.definitionListItem_();
522                    hasDefinitionListItem = false;
523                }
524                sink.definitionList_();
525                break;
526            case "dt":
527                sink.definedTerm_();
528                break;
529            case "dd":
530                sink.definition_();
531                sink.definitionListItem_();
532                hasDefinitionListItem = false;
533                break;
534            case "figure":
535                sink.figure_();
536                break;
537            case "figcaption":
538                sink.figureCaption_();
539                break;
540            case "a":
541                isBeginningOfLineInsideBlock = false;
542                handleAEnd(sink);
543                break;
544            case "em":
545            case "strong":
546            case "small":
547            case "s":
548            case "cite":
549            case "q":
550            case "dfn":
551            case "abbr":
552            case "i":
553            case "b":
554            case "code":
555            case "var":
556            case "samp":
557            case "kbd":
558            case "sup":
559            case "sub":
560            case "u":
561            case "mark":
562            case "ruby":
563            case "rb":
564            case "rt":
565            case "rtc":
566            case "rp":
567            case "bdi":
568            case "bdo":
569            case "span":
570            case "ins":
571            case "del":
572                sink.inline_();
573                isBeginningOfLineInsideBlock = false;
574                break;
575
576            // ----------------------------------------------------------------------
577            // Tables
578            // ----------------------------------------------------------------------
579
580            case "table":
581                sink.tableRows_();
582                sink.table_();
583                break;
584            case "tr":
585                sink.tableRow_();
586                break;
587            case "th":
588                sink.tableHeaderCell_();
589                break;
590            case "td":
591                sink.tableCell_();
592                break;
593            case "caption":
594                sink.tableCaption_();
595                break;
596            case "article":
597                sink.article_();
598                break;
599            case "nav":
600                sink.navigation_();
601                break;
602            case "aside":
603                sink.sidebar_();
604                break;
605            case "section":
606                handleSectionEnd(sink);
607                break;
608            case "h1":
609                sink.sectionTitle1_();
610                break;
611            case "h2":
612                sink.sectionTitle2_();
613                break;
614            case "h3":
615                sink.sectionTitle3_();
616                break;
617            case "h4":
618                sink.sectionTitle4_();
619                break;
620            case "h5":
621                sink.sectionTitle5_();
622                break;
623            case "h6":
624                sink.sectionTitle6_();
625                break;
626            case "header":
627                sink.header_();
628                break;
629            case "main":
630                sink.content_();
631                break;
632            case "footer":
633                sink.footer_();
634                break;
635            case "img":
636                isBeginningOfLineInsideBlock = false;
637                break;
638            case "blockquote":
639                sink.blockquote_();
640                break;
641            case "script":
642            case "style":
643                handleUnknown(elementName, attribs, sink, TAG_TYPE_END);
644                scriptBlock = false;
645                break;
646            default:
647                if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
648                    handleUnknown(elementName, attribs, sink, TAG_TYPE_END);
649                } else {
650                    visited = false;
651                }
652                break;
653        }
654
655        return visited;
656    }
657
658    /**
659     * {@inheritDoc}
660     *
661     * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
662     * overridden by implementing parsers to include additional tags.
663     */
664    protected void handleStartTag(XmlPullParser parser, Sink sink)
665            throws XmlPullParserException, MacroExecutionException {
666        if (!baseStartTag(parser, sink)) {
667            LOGGER.warn(
668                    "Unrecognized xml tag <{}> at [{}:{}]",
669                    parser.getName(),
670                    parser.getLineNumber(),
671                    parser.getColumnNumber());
672        }
673    }
674
675    /**
676     * {@inheritDoc}
677     *
678     * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
679     * overridden by implementing parsers to include additional tags.
680     */
681    protected void handleEndTag(XmlPullParser parser, Sink sink)
682            throws XmlPullParserException, MacroExecutionException {
683        if (!baseEndTag(parser, sink)) {
684            // unrecognized tag is already logged in StartTag
685        }
686    }
687
688    @Override
689    protected void handleText(XmlPullParser parser, Sink sink) throws XmlPullParserException {
690        String text = getText(parser);
691
692        if (!inVerbatim && text != null) {
693            // do special whitespace processing as outlined in
694            // https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace
695            if (isBeginningOfLineInsideBlock) {
696                // normalize linebreaks
697                processInsignificantLineBreaks(sink, text);
698                // trim leading whitespace from text being emitted
699                // https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace#trimming_and_positioning
700                String regex = "^\\s+";
701                text = text.replaceAll(regex, "");
702            }
703
704            // assume white-space-collapse: collapse for all non-verbatim text (outside of <pre>)
705            text = collapseWhitespace(text);
706        }
707        if ((text != null && !text.isEmpty()) && !isScriptBlock()) {
708            sink.text(text);
709            isBeginningOfLineInsideBlock = false;
710        }
711    }
712
713    /**
714     * Process all line-breaks in the given text which are not significant for the output, i.e. all line-breaks which are not within a verbatim block and
715     * are at the beginning of the given text.
716     * In addition it emits information about the whitespace characters following the line-breaks as they may be relevant for the output (e.g. for indentation).
717     *
718     * @param sink the sink to receive the events.
719     * @param text the text to process.
720     */
721    protected void processInsignificantLineBreaks(Sink sink, String text) {
722        CharacterIterator it = new StringCharacterIterator(text.replaceAll("\\r\\n?", "\n"));
723
724        boolean wasNewLine = false;
725        int indentLevel = 0;
726        //
727        while (it.current() != CharacterIterator.DONE) {
728            char c = it.current();
729            if (c == '\n') {
730                if (wasNewLine) {
731                    sink.markupLineBreak(indentLevel);
732                }
733                indentLevel = 0;
734                wasNewLine = true;
735            } else if (Character.isWhitespace(c)) {
736                indentLevel++;
737            } else {
738                // once non-whitespace character is reached we assume everything following is relevant and emitted
739                // within the text event
740                break;
741            }
742            it.next();
743        }
744        if (wasNewLine) {
745            // if the text ends with a newline, we need to emit the last line break
746            sink.markupLineBreak(indentLevel);
747        }
748    }
749
750    /**
751     * @see <a href="https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace#how_does_css_process_whitespace">How does CSS process whitespace?</a>
752     * @see <a href="https://drafts.csswg.org/css-text-4/#white-space-processing">CSS Text Module Level 4 - White Space Processing</a>
753     *
754     * @param text
755     * @return
756     */
757    private static String collapseWhitespace(String text) {
758        // replace all sequences of whitespace characters with a single space (this includes newlines, tabs, etc.)
759        return text.replaceAll("\\s+", " ");
760    }
761
762    @Override
763    protected void handleComment(XmlPullParser parser, Sink sink) throws XmlPullParserException {
764        isBeginningOfLineInsideBlock = false;
765        String text = getText(parser);
766
767        if ("PB".equals(text.trim())) {
768            sink.pageBreak();
769        } else {
770            if (isEmitComments()) {
771                sink.comment(text);
772            }
773        }
774    }
775
776    @Override
777    protected void handleCdsect(XmlPullParser parser, Sink sink) throws XmlPullParserException {
778        isBeginningOfLineInsideBlock = false;
779        String text = getText(parser);
780
781        if (isScriptBlock()) {
782            sink.unknown(CDATA, new Object[] {CDATA_TYPE, text}, null);
783        } else {
784            sink.text(text);
785        }
786    }
787
788    /**
789     * Shortcut for {@link #emitHeadingSections(int, Sink, boolean)} with last argument being {@code true}
790     * @param newLevel
791     * @param sink
792     * @param attribs
793     * @deprecated Use {@link #emitHeadingSections(int, Sink, boolean)} instead.
794     */
795    @Deprecated
796    protected void consecutiveSections(int newLevel, Sink sink, SinkEventAttributeSet attribs) {
797        emitHeadingSections(newLevel, sink, true);
798    }
799
800    /**
801     * Make sure sections are nested consecutively and correctly inserted for the given heading level
802     *
803     * <p>
804     * HTML5 heading tags H1 to H5 imply same level sections in Sink API (compare with {@link Sink#sectionTitle(int, SinkEventAttributes)}).
805     * However (X)HTML5 allows headings without explicit surrounding section elements and is also
806     * less strict with non-consecutive heading levels.
807     * This methods both closes open sections which have been added for previous headings and/or opens
808     * sections necessary for the new heading level.
809     * At least one section needs to be opened directly prior the heading due to Sink API restrictions.
810     * </p>
811     *
812     * <p>
813     * For instance, if the following sequence is parsed:
814     * </p>
815     * <pre>
816     * &lt;h2&gt;&lt;/h2&gt;
817     * &lt;h5&gt;&lt;/h5&gt;
818     * </pre>
819     * <p>
820     * we have to insert two section starts before we open the <code>&lt;h5&gt;</code>.
821     * In the following sequence
822     * </p>
823     * <pre>
824     * &lt;h5&gt;&lt;/h5&gt;
825     * &lt;h2&gt;&lt;/h2&gt;
826     * </pre>
827     * <p>
828     * we have to close two sections before we open the <code>&lt;h2&gt;</code>.
829     * </p>
830     *
831     * <p>The current heading level is set to newLevel afterwards.</p>
832     *
833     * @param newLevel the new section level, all upper levels have to be closed.
834     * @param sink the sink to receive the events.
835     * @param enforceNewSection whether to enforce a new section or not
836     */
837    protected void emitHeadingSections(int newLevel, Sink sink, boolean enforceNewSection) {
838        int lowerBoundSectionLevel = newLevel;
839        if (enforceNewSection) {
840            // close one more if either last event was not section start or the new level is lower than the current one
841            // (in this case the last event may be a section start event but for another level)
842            if (!isLastEventSectionStart() || newLevel < this.headingLevel) {
843                lowerBoundSectionLevel--;
844            }
845        }
846        closeOpenHeadingSections(lowerBoundSectionLevel, sink);
847        openMissingHeadingSections(newLevel, sink);
848
849        this.headingLevel = newLevel;
850    }
851
852    private boolean isLastEventSectionStart() {
853        String lastEventName = capturedSinkEventNames.pollLast();
854        if (lastEventName == null) {
855            return false;
856        }
857        return lastEventName.startsWith("section")
858                && !lastEventName.endsWith("_")
859                && !lastEventName.startsWith("sectionTitle");
860    }
861
862    /**
863     * Close open heading sections.
864     *
865     * @param newLevel the new section level, all upper levels have to be closed.
866     * @param sink the sink to receive the events.
867     */
868    private void closeOpenHeadingSections(int newLevel, Sink sink) {
869        while (this.headingLevel > newLevel) {
870            if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
871                sink.section_(headingLevel);
872            }
873
874            this.headingLevel--;
875        }
876        // enforce the previous element is a section
877    }
878
879    /**
880     * Open missing heading sections.
881     *
882     * @param newLevel the new section level, all lower levels have to be opened.
883     * @param sink the sink to receive the events.
884     */
885    private void openMissingHeadingSections(int newLevel, Sink sink) {
886        while (this.headingLevel < newLevel) {
887            this.headingLevel++;
888
889            if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
890                sink.section(headingLevel, null);
891            }
892        }
893    }
894
895    /**
896     * Return the current section level.
897     *
898     * @return the current section level.
899     */
900    protected int getSectionLevel() {
901        return this.headingLevel;
902    }
903
904    /**
905     * Set the current section level.
906     *
907     * @param newLevel the new section level.
908     */
909    protected void setSectionLevel(int newLevel) {
910        this.headingLevel = newLevel;
911    }
912
913    /**
914     * Stop verbatim mode.
915     */
916    protected void verbatim_() {
917        this.inVerbatim = false;
918    }
919
920    /**
921     * Start verbatim mode.
922     */
923    protected void verbatim() {
924        this.inVerbatim = true;
925    }
926
927    /**
928     * Checks if we are currently inside a &lt;pre&gt; tag.
929     *
930     * @return true if we are currently in verbatim mode.
931     */
932    protected boolean isVerbatim() {
933        return this.inVerbatim;
934    }
935
936    /**
937     * Checks if we are currently inside a &lt;script&gt; tag.
938     *
939     * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
940     * @since 1.1.1.
941     */
942    protected boolean isScriptBlock() {
943        return this.scriptBlock;
944    }
945
946    /**
947     * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
948     *
949     * @param id The id to validate.
950     * @return A transformed id or the original id if it was already valid.
951     * @see DoxiaUtils#encodeId(String)
952     */
953    protected String validAnchor(String id) {
954        if (!DoxiaUtils.isValidId(id)) {
955            String linkAnchor = DoxiaUtils.encodeId(id);
956
957            LOGGER.debug("Modified invalid link '{}' to '{}'", id, linkAnchor);
958
959            return linkAnchor;
960        }
961
962        return id;
963    }
964
965    @Override
966    protected void init() {
967        super.init();
968
969        this.scriptBlock = false;
970        this.isLink = false;
971        this.isAnchor = false;
972        this.orderedListDepth = 0;
973        this.headingLevel = 0;
974        this.inVerbatim = false;
975    }
976
977    private void handleAEnd(Sink sink) {
978        if (isLink) {
979            sink.link_();
980            isLink = false;
981        } else if (isAnchor) {
982            sink.anchor_();
983            isAnchor = false;
984        }
985    }
986
987    private void handleAStart(Sink sink, SinkEventAttributeSet attribs) {
988        String href = (String) attribs.getAttribute(Attribute.HREF.toString());
989
990        if (href != null) {
991            int hashIndex = href.indexOf('#');
992            if (hashIndex != -1
993                    && !DoxiaUtils.isExternalLink(href)
994                    && !"external".equals(attribs.getAttribute(Attribute.REL.toString()))) {
995                String hash = href.substring(hashIndex + 1);
996
997                if (!DoxiaUtils.isValidId(hash)) {
998                    href = href.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash);
999
1000                    LOGGER.debug("Modified invalid link '{}' to '{}'", hash, href);
1001                }
1002            }
1003            sink.link(href, attribs);
1004            isLink = true;
1005        } else {
1006            String id = (String) attribs.getAttribute(Attribute.ID.toString());
1007            if (id == null) {
1008                // although the "name" attribute is obsolete in HTML5, it is still allowed
1009                // (https://www.w3.org/TR/html5-diff/#obsolete-attributes)
1010                id = (String) attribs.getAttribute(Attribute.NAME.toString());
1011            }
1012            if (id != null) {
1013                sink.anchor(validAnchor(id), attribs);
1014                isAnchor = true;
1015            }
1016        }
1017    }
1018
1019    private boolean handleDivStart(SinkEventAttributeSet attribs, Sink sink) {
1020        String divClass = (String) attribs.getAttribute(Attribute.CLASS.toString());
1021
1022        this.divStack.push(divClass);
1023
1024        if ("content".equals(divClass)) {
1025            SinkEventAttributeSet atts = new SinkEventAttributeSet(attribs);
1026            atts.removeAttribute(SinkEventAttributes.CLASS);
1027            sink.content(atts);
1028        }
1029        if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
1030            return false;
1031        } else {
1032            sink.division(attribs);
1033        }
1034
1035        return true;
1036    }
1037
1038    private boolean handleDivEnd(Sink sink) {
1039        String divClass = divStack.pop();
1040
1041        if ("content".equals(divClass)) {
1042            sink.content_();
1043        }
1044        if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
1045            return false;
1046        } else {
1047            sink.division_();
1048        }
1049
1050        return true;
1051    }
1052
1053    private void handleImgStart(Sink sink, SinkEventAttributeSet attribs) {
1054        String src = (String) attribs.getAttribute(Attribute.SRC.toString());
1055
1056        if (src != null) {
1057            sink.figureGraphics(src, attribs);
1058        }
1059    }
1060
1061    private void handleLIStart(Sink sink, SinkEventAttributeSet attribs) {
1062        if (orderedListDepth == 0) {
1063            sink.listItem(attribs);
1064        } else {
1065            sink.numberedListItem(attribs);
1066        }
1067    }
1068
1069    private void handleListItemEnd(Sink sink) {
1070        if (orderedListDepth == 0) {
1071            sink.listItem_();
1072        } else {
1073            sink.numberedListItem_();
1074        }
1075    }
1076
1077    private void handleOLStart(Sink sink, SinkEventAttributeSet attribs) {
1078        int numbering = Sink.NUMBERING_DECIMAL;
1079        // this will have to be generalized if we handle styles
1080        String style = (String) attribs.getAttribute(Attribute.STYLE.toString());
1081
1082        if (style != null) {
1083            switch (style) {
1084                case "list-style-type: upper-alpha;":
1085                    numbering = Sink.NUMBERING_UPPER_ALPHA;
1086                    break;
1087                case "list-style-type: lower-alpha;":
1088                    numbering = Sink.NUMBERING_LOWER_ALPHA;
1089                    break;
1090                case "list-style-type: upper-roman;":
1091                    numbering = Sink.NUMBERING_UPPER_ROMAN;
1092                    break;
1093                case "list-style-type: lower-roman;":
1094                    numbering = Sink.NUMBERING_LOWER_ROMAN;
1095                    break;
1096                case "list-style-type: decimal;":
1097                    numbering = Sink.NUMBERING_DECIMAL;
1098                    break;
1099                default:
1100                // ignore all other
1101            }
1102        }
1103
1104        sink.numberedList(numbering, attribs);
1105        orderedListDepth++;
1106    }
1107
1108    private void handlePStart(Sink sink, SinkEventAttributeSet attribs) {
1109        sink.paragraph(attribs);
1110    }
1111
1112    /*
1113     * The PRE element tells visual user agents that the enclosed text is
1114     * "preformatted". When handling preformatted text, visual user agents:
1115     * - May leave white space intact.
1116     * - May render text with a fixed-pitch font.
1117     * - May disable automatic word wrap.
1118     * - Must not disable bidirectional processing.
1119     * Non-visual user agents are not required to respect extra white space
1120     * in the content of a PRE element.
1121     */
1122    private void handlePreStart(SinkEventAttributeSet attribs, Sink sink) {
1123        verbatim();
1124        sink.verbatim(attribs);
1125    }
1126
1127    private void handleSectionStart(Sink sink, SinkEventAttributeSet attribs) {
1128        emitHeadingSections(sectionLevel, sink, false);
1129        sink.section(++sectionLevel, attribs);
1130        this.headingLevel = sectionLevel;
1131    }
1132
1133    private void handleHeadingStart(Sink sink, int level, SinkEventAttributeSet attribs) {
1134        emitHeadingSections(level, sink, true);
1135        sink.sectionTitle(level, attribs);
1136    }
1137
1138    private void handleSectionEnd(Sink sink) {
1139        emitHeadingSections(sectionLevel, sink, false);
1140        sink.section_(sectionLevel--);
1141        this.headingLevel = sectionLevel;
1142    }
1143
1144    private void handleTableStart(Sink sink, SinkEventAttributeSet attribs) {
1145        sink.table(attribs);
1146        String givenTableClass = (String) attribs.getAttribute(Attribute.CLASS.toString());
1147        boolean grid = false;
1148        if (givenTableClass != null
1149                && BODYTABLEBORDER_CLASS_PATTERN.matcher(givenTableClass).matches()) {
1150            grid = true;
1151        }
1152
1153        sink.tableRows(null, grid);
1154    }
1155}