001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.maven.doxia.parser; 020 021import javax.swing.text.html.HTML.Attribute; 022 023import java.io.Reader; 024import java.text.CharacterIterator; 025import java.text.StringCharacterIterator; 026import java.util.HashSet; 027import java.util.LinkedList; 028import java.util.Set; 029import java.util.Stack; 030import java.util.regex.Pattern; 031 032import org.apache.maven.doxia.macro.MacroExecutionException; 033import org.apache.maven.doxia.markup.HtmlMarkup; 034import org.apache.maven.doxia.sink.Sink; 035import org.apache.maven.doxia.sink.SinkEventAttributes; 036import org.apache.maven.doxia.sink.impl.EventCapturingSinkProxy; 037import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet; 038import org.apache.maven.doxia.util.DoxiaUtils; 039import org.codehaus.plexus.util.xml.pull.XmlPullParser; 040import org.codehaus.plexus.util.xml.pull.XmlPullParserException; 041import org.slf4j.Logger; 042import org.slf4j.LoggerFactory; 043 044/** 045 * Common base parser for XHTML5 (now <a href="https://html.spec.whatwg.org/multipage/#toc-the-xhtml-syntax">HTML Living standard, XML syntax</a>) elements and attributes. 
046 * 047 * @see <a href="https://html.spec.whatwg.org/multipage/introduction.html#history-2">HTML Living standard, history</a> 048 */ 049public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup { 050 private static final Logger LOGGER = LoggerFactory.getLogger(Xhtml5BaseParser.class); 051 052 /** Used to identify if a class string contains `bodyTableBorder` */ 053 private static final Pattern BODYTABLEBORDER_CLASS_PATTERN = 054 Pattern.compile("(?:.*\\s|^)bodyTableBorder(?:\\s.*|$)"); 055 056 private static final Set<String> UNMATCHED_XHTML5_ELEMENTS = new HashSet<>(); 057 private static final Set<String> UNMATCHED_XHTML5_SIMPLE_ELEMENTS = new HashSet<>(); 058 059 static { 060 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.AREA.toString()); 061 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.AUDIO.toString()); 062 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.BUTTON.toString()); 063 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.CANVAS.toString()); 064 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.COL.toString()); 065 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.COLGROUP.toString()); 066 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.COMMAND.toString()); 067 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DATA.toString()); 068 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DATALIST.toString()); 069 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DETAILS.toString()); 070 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.DIALOG.toString()); 071 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.EMBED.toString()); 072 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.FIELDSET.toString()); 073 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.FORM.toString()); 074 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.HGROUP.toString()); 075 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.IFRAME.toString()); 076 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.INPUT.toString()); 077 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.KEYGEN.toString()); 078 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.LABEL.toString()); 079 
UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.LEGEND.toString()); 080 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.MAP.toString()); 081 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.MENU.toString()); 082 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.MENUITEM.toString()); 083 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.METER.toString()); 084 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.NOSCRIPT.toString()); 085 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OBJECT.toString()); 086 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OPTGROUP.toString()); 087 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OPTION.toString()); 088 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.OUTPUT.toString()); 089 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.PARAM.toString()); 090 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.PICTURE.toString()); 091 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.PROGRESS.toString()); 092 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SELECT.toString()); 093 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.SOURCE.toString()); 094 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SUMMARY.toString()); 095 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.SVG.toString()); 096 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TEMPLATE.toString()); 097 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TEXTAREA.toString()); 098 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TBODY.toString()); 099 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.THEAD.toString()); 100 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TFOOT.toString()); 101 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.TIME.toString()); 102 UNMATCHED_XHTML5_SIMPLE_ELEMENTS.add(HtmlMarkup.TRACK.toString()); 103 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.VAR.toString()); 104 UNMATCHED_XHTML5_ELEMENTS.add(HtmlMarkup.VIDEO.toString()); 105 } 106 107 /** 108 * True if a <script></script> or <style></style> block is read. CDATA sections within are 109 * handled as rawText. 110 */ 111 private boolean scriptBlock; 112 113 /** Used to distinguish <a href=""> from <a name="">. 
*/
    private boolean isLink;

    /**
     * If true, the next text event is at the beginning of a line inside a block element, i.e. after a block tag
     * or a line break/end block tag. While it is set, {@code handleText} reports leading line breaks to the sink
     * and strips leading whitespace from the text before emitting it.
     */
    protected boolean isBeginningOfLineInsideBlock = true;

    /**
     * True while an {@code <a>} element that was reported to the sink as an anchor (it carried an id or name
     * but no href) is open, so that the matching end tag closes an anchor and not a link.
     */
    private boolean isAnchor;

    /** Number of currently open {@code <ol>} elements; used for nested lists. */
    private int orderedListDepth = 0;

    /** Counts section nesting level of the sections manually set in the HTML document */
    private int sectionLevel;

    /**
     * Counts current heading level. This is either the {@link #sectionLevel} if no artificial sections are
     * currently open for headings or a number higher or lower than {@link #sectionLevel} (for all sections
     * currently opened/closed for a preceding heading).
     * The heading level only changes when a new heading starts, or a section starts or ends.
     */
    private int headingLevel;

    /** Verbatim flag, true whenever we are inside a {@code <pre>} tag. */
    private boolean inVerbatim;

    /**
     * Holds the class attribute value (possibly {@code null}) of every currently open {@code <div>}, so that
     * the end tag can emit the closing events matching the ones emitted for the start tag.
     */
    private Stack<String> divStack = new Stack<>();

    /** Used to wrap the definedTerm with its definition, even when one is omitted */
    boolean hasDefinitionListItem = false;

    /**
     * Names of the sink events captured by the proxy that {@link #parse(Reader, Sink, String)} wraps around
     * the target sink; created anew for every parse.
     */
    private LinkedList<String> capturedSinkEventNames;

    /**
     * Parses the given source while recording the names of all emitted sink events.
     *
     * <p>The parser state is reset before parsing and, regardless of the outcome, again afterwards.</p>
     *
     * @param source the reader providing the document to parse.
     * @param sink the sink to receive the events; it is wrapped in an event capturing proxy.
     * @param reference passed through unchanged to the superclass implementation.
     * @throws ParseException if the superclass implementation fails to parse the source.
     */
    @Override
    public void parse(Reader source, Sink sink, String reference) throws ParseException {
        init();

        try {
            // record event names so that heading handling can look at the most recent event
            capturedSinkEventNames = new LinkedList<>();
            Sink capturingSink = EventCapturingSinkProxy.newInstance(sink, capturedSinkEventNames);
            super.parse(source, capturingSink, reference);
        } finally {
            // always leave the parser in its initial state, even after a failed parse
            setSecondParsing(false);
            init();
        }
    }

    /**
     * {@inheritDoc}
     *
     * This override only delegates to the superclass implementation.
     * NOTE(review): earlier documentation stated that all XHTML (HTML 5.2) entities are added to the parser
     * here so that they can be recognized and resolved without additional DTD; no such registration is
     * visible in this method - confirm whether the superclass takes care of it.
*/
    @Override
    protected void initXmlParser(XmlPullParser parser) throws XmlPullParserException {
        // nothing XHTML5-specific is configured here; the superclass does all the work
        super.initXmlParser(parser);
    }

    /**
     * <p>
     * Goes through a common list of possible html5 start tags. These include only tags that can go into
     * the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
     * </p>
     * <p>
     * The currently handled tags are:
     * </p>
     * <p>
     * {@code <article>, <nav>, <aside>, <section>, <h1>, <h2>, <h3>,
     * <h4>, <h5>, <h6>, <header>, <main>, <footer>, <em>, <strong>,
     * <small>, <s>, <cite>, <q>, <dfn>, <abbr>, <i>,
     * <b>, <code>, <var>, <samp>, <kbd>, <sup>, <sub>, <u>,
     * <mark>, <ruby>, <rb>, <rt>, <rtc>, <rp>, <bdi>,
     * <bdo>, <span>, <ins>, <del>, <p>, <div>, <pre>, <ul>,
     * <ol>, <li>, <dl>, <dt>, <dd>, <figure>, <figcaption>, <a>, <table>,
     * <tr>, <th>, <td>, <caption>, <br/>, <wbr/>, <hr/>,
     * <img/>, <blockquote>, <script>, <style>}.
     * </p>
     * <p>
     * Further elements known to this class but without a dedicated sink event (for example
     * {@code <form>}, {@code <video>} or {@code <input/>}) are recognized as well and handed to
     * {@code handleUnknown}.
     * </p>
     *
     * @param parser A parser.
     * @param sink the sink to receive the events.
     * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
194 */ 195 protected boolean baseStartTag(XmlPullParser parser, Sink sink) { 196 SinkEventAttributeSet attribs = getAttributesFromParser(parser); 197 return baseStartTag(parser.getName(), attribs, sink); 198 } 199 200 protected boolean baseStartTag(String elementName, SinkEventAttributeSet attribs, Sink sink) { 201 boolean visited = true; 202 isBeginningOfLineInsideBlock = true; 203 switch (elementName) { 204 case "article": 205 sink.article(attribs); 206 break; 207 case "nav": 208 sink.navigation(attribs); 209 break; 210 case "aside": 211 sink.sidebar(attribs); 212 break; 213 case "section": 214 handleSectionStart(sink, attribs); 215 break; 216 case "h1": 217 handleHeadingStart(sink, Sink.SECTION_LEVEL_1, attribs); 218 break; 219 case "h2": 220 handleHeadingStart(sink, Sink.SECTION_LEVEL_2, attribs); 221 break; 222 case "h3": 223 handleHeadingStart(sink, Sink.SECTION_LEVEL_3, attribs); 224 break; 225 case "h4": 226 handleHeadingStart(sink, Sink.SECTION_LEVEL_4, attribs); 227 break; 228 case "h5": 229 handleHeadingStart(sink, Sink.SECTION_LEVEL_5, attribs); 230 break; 231 case "h6": 232 handleHeadingStart(sink, Sink.SECTION_LEVEL_6, attribs); 233 break; 234 case "header": 235 sink.header(attribs); 236 break; 237 case "main": 238 sink.content(attribs); 239 break; 240 case "footer": 241 sink.footer(attribs); 242 break; 243 case "em": 244 attribs.addAttributes(SinkEventAttributeSet.Semantics.EMPHASIS); 245 sink.inline(attribs); 246 isBeginningOfLineInsideBlock = false; 247 break; 248 case "strong": 249 attribs.addAttributes(SinkEventAttributeSet.Semantics.STRONG); 250 sink.inline(attribs); 251 isBeginningOfLineInsideBlock = false; 252 break; 253 case "small": 254 attribs.addAttributes(SinkEventAttributeSet.Semantics.SMALL); 255 sink.inline(attribs); 256 isBeginningOfLineInsideBlock = false; 257 break; 258 case "s": 259 /* deprecated line-through support */ 260 attribs.addAttributes(SinkEventAttributeSet.Semantics.LINE_THROUGH); 261 sink.inline(attribs); 262 
isBeginningOfLineInsideBlock = false; 263 break; 264 case "cite": 265 attribs.addAttributes(SinkEventAttributeSet.Semantics.CITATION); 266 sink.inline(attribs); 267 isBeginningOfLineInsideBlock = false; 268 break; 269 case "q": 270 attribs.addAttributes(SinkEventAttributeSet.Semantics.QUOTE); 271 sink.inline(attribs); 272 break; 273 case "dfn": 274 attribs.addAttributes(SinkEventAttributeSet.Semantics.DEFINITION); 275 sink.inline(attribs); 276 isBeginningOfLineInsideBlock = false; 277 break; 278 case "abbr": 279 attribs.addAttributes(SinkEventAttributeSet.Semantics.ABBREVIATION); 280 sink.inline(attribs); 281 isBeginningOfLineInsideBlock = false; 282 break; 283 case "i": 284 attribs.addAttributes(SinkEventAttributeSet.Semantics.ITALIC); 285 sink.inline(attribs); 286 break; 287 case "b": 288 attribs.addAttributes(SinkEventAttributeSet.Semantics.BOLD); 289 sink.inline(attribs); 290 isBeginningOfLineInsideBlock = false; 291 break; 292 case "code": 293 attribs.addAttributes(SinkEventAttributeSet.Semantics.CODE); 294 sink.inline(attribs); 295 isBeginningOfLineInsideBlock = false; 296 break; 297 case "var": 298 attribs.addAttributes(SinkEventAttributeSet.Semantics.VARIABLE); 299 sink.inline(attribs); 300 isBeginningOfLineInsideBlock = false; 301 break; 302 case "samp": 303 attribs.addAttributes(SinkEventAttributeSet.Semantics.SAMPLE); 304 sink.inline(attribs); 305 isBeginningOfLineInsideBlock = false; 306 break; 307 case "kbd": 308 attribs.addAttributes(SinkEventAttributeSet.Semantics.KEYBOARD); 309 sink.inline(attribs); 310 isBeginningOfLineInsideBlock = false; 311 break; 312 case "sup": 313 attribs.addAttributes(SinkEventAttributeSet.Semantics.SUPERSCRIPT); 314 sink.inline(attribs); 315 isBeginningOfLineInsideBlock = false; 316 break; 317 case "sub": 318 attribs.addAttributes(SinkEventAttributeSet.Semantics.SUBSCRIPT); 319 sink.inline(attribs); 320 isBeginningOfLineInsideBlock = false; 321 break; 322 case "u": 323 
attribs.addAttributes(SinkEventAttributeSet.Semantics.ANNOTATION); 324 sink.inline(attribs); 325 isBeginningOfLineInsideBlock = false; 326 break; 327 case "mark": 328 attribs.addAttributes(SinkEventAttributeSet.Semantics.HIGHLIGHT); 329 sink.inline(attribs); 330 break; 331 case "ruby": 332 attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY); 333 sink.inline(attribs); 334 isBeginningOfLineInsideBlock = false; 335 break; 336 case "rb": 337 attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_BASE); 338 sink.inline(attribs); 339 isBeginningOfLineInsideBlock = false; 340 break; 341 case "rt": 342 attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT); 343 sink.inline(attribs); 344 isBeginningOfLineInsideBlock = false; 345 break; 346 case "rtc": 347 attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER); 348 sink.inline(attribs); 349 isBeginningOfLineInsideBlock = false; 350 break; 351 case "rp": 352 attribs.addAttributes(SinkEventAttributeSet.Semantics.RUBY_PARANTHESES); 353 sink.inline(attribs); 354 isBeginningOfLineInsideBlock = false; 355 break; 356 case "bdi": 357 attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION); 358 sink.inline(attribs); 359 isBeginningOfLineInsideBlock = false; 360 break; 361 case "bdo": 362 attribs.addAttributes(SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE); 363 sink.inline(attribs); 364 isBeginningOfLineInsideBlock = false; 365 break; 366 case "span": 367 attribs.addAttributes(SinkEventAttributeSet.Semantics.PHRASE); 368 sink.inline(attribs); 369 isBeginningOfLineInsideBlock = false; 370 break; 371 case "ins": 372 attribs.addAttributes(SinkEventAttributeSet.Semantics.INSERT); 373 sink.inline(attribs); 374 isBeginningOfLineInsideBlock = false; 375 break; 376 case "del": 377 attribs.addAttributes(SinkEventAttributeSet.Semantics.DELETE); 378 sink.inline(attribs); 379 isBeginningOfLineInsideBlock = false; 380 break; 381 case "p": 382 handlePStart(sink, attribs); 383 
break; 384 case "div": 385 handleDivStart(attribs, sink); 386 break; 387 case "pre": 388 handlePreStart(attribs, sink); 389 break; 390 case "ul": 391 sink.list(attribs); 392 break; 393 case "ol": 394 handleOLStart(sink, attribs); 395 break; 396 case "li": 397 handleLIStart(sink, attribs); 398 break; 399 case "dl": 400 sink.definitionList(attribs); 401 break; 402 case "dt": 403 if (hasDefinitionListItem) { 404 // close previous listItem 405 sink.definitionListItem_(); 406 } 407 sink.definitionListItem(attribs); 408 hasDefinitionListItem = true; 409 sink.definedTerm(attribs); 410 break; 411 case "dd": 412 if (!hasDefinitionListItem) { 413 sink.definitionListItem(attribs); 414 } 415 sink.definition(attribs); 416 break; 417 case "figure": 418 sink.figure(attribs); 419 break; 420 case "figcaption": 421 sink.figureCaption(attribs); 422 break; 423 case "a": 424 isBeginningOfLineInsideBlock = false; 425 handleAStart(sink, attribs); 426 break; 427 case "table": 428 handleTableStart(sink, attribs); 429 break; 430 case "tr": 431 sink.tableRow(attribs); 432 break; 433 case "th": 434 sink.tableHeaderCell(attribs); 435 break; 436 case "td": 437 sink.tableCell(attribs); 438 break; 439 case "caption": 440 sink.tableCaption(attribs); 441 break; 442 case "br": 443 sink.lineBreak(attribs); 444 break; 445 case "wbr": 446 sink.lineBreakOpportunity(attribs); 447 break; 448 case "hr": 449 sink.horizontalRule(attribs); 450 break; 451 case "img": 452 isBeginningOfLineInsideBlock = false; 453 handleImgStart(sink, attribs); 454 break; 455 case "blockquote": 456 sink.blockquote(attribs); 457 break; 458 case "script": 459 case "style": 460 handleUnknown(elementName, attribs, sink, TAG_TYPE_START); 461 scriptBlock = true; 462 break; 463 default: 464 if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) { 465 handleUnknown(elementName, attribs, sink, TAG_TYPE_START); 466 } else if (UNMATCHED_XHTML5_SIMPLE_ELEMENTS.contains(elementName)) { 467 handleUnknown(elementName, attribs, sink, 
TAG_TYPE_SIMPLE); 468 } else { 469 visited = false; 470 } 471 break; 472 } 473 474 return visited; 475 } 476 477 /** 478 * <p> 479 * Goes through a common list of possible html end tags. 480 * These should be re-usable by different xhtml-based parsers. 481 * The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)}, 482 * except for the empty elements ({@code <br/>, <hr/>, <img/>}). 483 * </p> 484 * 485 * @param parser A parser. 486 * @param sink the sink to receive the events. 487 * @return True if the event has been handled by this method, false otherwise. 488 */ 489 protected boolean baseEndTag(XmlPullParser parser, Sink sink) { 490 SinkEventAttributeSet attribs = getAttributesFromParser(parser); 491 return baseEndTag(parser.getName(), attribs, sink); 492 } 493 494 protected boolean baseEndTag(String elementName, SinkEventAttributeSet attribs, Sink sink) { 495 boolean visited = true; 496 isBeginningOfLineInsideBlock = true; 497 498 switch (elementName) { 499 case "p": 500 sink.paragraph_(); 501 break; 502 case "div": 503 handleDivEnd(sink); 504 break; 505 case "pre": 506 verbatim_(); 507 sink.verbatim_(); 508 break; 509 case "ul": 510 sink.list_(); 511 break; 512 case "ol": 513 sink.numberedList_(); 514 orderedListDepth--; 515 break; 516 case "li": 517 handleListItemEnd(sink); 518 break; 519 case "dl": 520 if (hasDefinitionListItem) { 521 sink.definitionListItem_(); 522 hasDefinitionListItem = false; 523 } 524 sink.definitionList_(); 525 break; 526 case "dt": 527 sink.definedTerm_(); 528 break; 529 case "dd": 530 sink.definition_(); 531 sink.definitionListItem_(); 532 hasDefinitionListItem = false; 533 break; 534 case "figure": 535 sink.figure_(); 536 break; 537 case "figcaption": 538 sink.figureCaption_(); 539 break; 540 case "a": 541 isBeginningOfLineInsideBlock = false; 542 handleAEnd(sink); 543 break; 544 case "em": 545 case "strong": 546 case "small": 547 case "s": 548 case "cite": 549 case "q": 550 case "dfn": 551 case "abbr": 552 
case "i": 553 case "b": 554 case "code": 555 case "var": 556 case "samp": 557 case "kbd": 558 case "sup": 559 case "sub": 560 case "u": 561 case "mark": 562 case "ruby": 563 case "rb": 564 case "rt": 565 case "rtc": 566 case "rp": 567 case "bdi": 568 case "bdo": 569 case "span": 570 case "ins": 571 case "del": 572 sink.inline_(); 573 isBeginningOfLineInsideBlock = false; 574 break; 575 576 // ---------------------------------------------------------------------- 577 // Tables 578 // ---------------------------------------------------------------------- 579 580 case "table": 581 sink.tableRows_(); 582 sink.table_(); 583 break; 584 case "tr": 585 sink.tableRow_(); 586 break; 587 case "th": 588 sink.tableHeaderCell_(); 589 break; 590 case "td": 591 sink.tableCell_(); 592 break; 593 case "caption": 594 sink.tableCaption_(); 595 break; 596 case "article": 597 sink.article_(); 598 break; 599 case "nav": 600 sink.navigation_(); 601 break; 602 case "aside": 603 sink.sidebar_(); 604 break; 605 case "section": 606 handleSectionEnd(sink); 607 break; 608 case "h1": 609 sink.sectionTitle1_(); 610 break; 611 case "h2": 612 sink.sectionTitle2_(); 613 break; 614 case "h3": 615 sink.sectionTitle3_(); 616 break; 617 case "h4": 618 sink.sectionTitle4_(); 619 break; 620 case "h5": 621 sink.sectionTitle5_(); 622 break; 623 case "h6": 624 sink.sectionTitle6_(); 625 break; 626 case "header": 627 sink.header_(); 628 break; 629 case "main": 630 sink.content_(); 631 break; 632 case "footer": 633 sink.footer_(); 634 break; 635 case "img": 636 isBeginningOfLineInsideBlock = false; 637 break; 638 case "blockquote": 639 sink.blockquote_(); 640 break; 641 case "script": 642 case "style": 643 handleUnknown(elementName, attribs, sink, TAG_TYPE_END); 644 scriptBlock = false; 645 break; 646 default: 647 if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) { 648 handleUnknown(elementName, attribs, sink, TAG_TYPE_END); 649 } else { 650 visited = false; 651 } 652 break; 653 } 654 655 return visited; 
656 } 657 658 /** 659 * {@inheritDoc} 660 * 661 * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be 662 * overridden by implementing parsers to include additional tags. 663 */ 664 protected void handleStartTag(XmlPullParser parser, Sink sink) 665 throws XmlPullParserException, MacroExecutionException { 666 if (!baseStartTag(parser, sink)) { 667 LOGGER.warn( 668 "Unrecognized xml tag <{}> at [{}:{}]", 669 parser.getName(), 670 parser.getLineNumber(), 671 parser.getColumnNumber()); 672 } 673 } 674 675 /** 676 * {@inheritDoc} 677 * 678 * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be 679 * overridden by implementing parsers to include additional tags. 680 */ 681 protected void handleEndTag(XmlPullParser parser, Sink sink) 682 throws XmlPullParserException, MacroExecutionException { 683 if (!baseEndTag(parser, sink)) { 684 // unrecognized tag is already logged in StartTag 685 } 686 } 687 688 @Override 689 protected void handleText(XmlPullParser parser, Sink sink) throws XmlPullParserException { 690 String text = getText(parser); 691 692 if (!inVerbatim && text != null) { 693 // do special whitespace processing as outlined in 694 // https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace 695 if (isBeginningOfLineInsideBlock) { 696 // normalize linebreaks 697 processInsignificantLineBreaks(sink, text); 698 // trim leading whitespace from text being emitted 699 // https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace#trimming_and_positioning 700 String regex = "^\\s+"; 701 text = text.replaceAll(regex, ""); 702 } 703 704 // assume white-space-collapse: collapse for all non-verbatim text (outside of <pre>) 705 text = collapseWhitespace(text); 706 } 707 if ((text != null && !text.isEmpty()) && !isScriptBlock()) { 708 sink.text(text); 709 isBeginningOfLineInsideBlock = false; 710 } 711 } 712 713 /** 714 * Process all line-breaks in the given text which are not significant for the output, i.e. 
all line-breaks which are not within a verbatim block and 715 * are at the beginning of the given text. 716 * In addition it emits information about the whitespace characters following the line-breaks as they may be relevant for the output (e.g. for indentation). 717 * 718 * @param sink the sink to receive the events. 719 * @param text the text to process. 720 */ 721 protected void processInsignificantLineBreaks(Sink sink, String text) { 722 CharacterIterator it = new StringCharacterIterator(text.replaceAll("\\r\\n?", "\n")); 723 724 boolean wasNewLine = false; 725 int indentLevel = 0; 726 // 727 while (it.current() != CharacterIterator.DONE) { 728 char c = it.current(); 729 if (c == '\n') { 730 if (wasNewLine) { 731 sink.markupLineBreak(indentLevel); 732 } 733 indentLevel = 0; 734 wasNewLine = true; 735 } else if (Character.isWhitespace(c)) { 736 indentLevel++; 737 } else { 738 // once non-whitespace character is reached we assume everything following is relevant and emitted 739 // within the text event 740 break; 741 } 742 it.next(); 743 } 744 if (wasNewLine) { 745 // if the text ends with a newline, we need to emit the last line break 746 sink.markupLineBreak(indentLevel); 747 } 748 } 749 750 /** 751 * @see <a href="https://developer.mozilla.org/en-US/docs/Web/CSS/Guides/Text/Whitespace#how_does_css_process_whitespace">How does CSS process whitespace?</a> 752 * @see <a href="https://drafts.csswg.org/css-text-4/#white-space-processing">CSS Text Module Level 4 - White Space Processing</a> 753 * 754 * @param text 755 * @return 756 */ 757 private static String collapseWhitespace(String text) { 758 // replace all sequences of whitespace characters with a single space (this includes newlines, tabs, etc.) 
759 return text.replaceAll("\\s+", " "); 760 } 761 762 @Override 763 protected void handleComment(XmlPullParser parser, Sink sink) throws XmlPullParserException { 764 isBeginningOfLineInsideBlock = false; 765 String text = getText(parser); 766 767 if ("PB".equals(text.trim())) { 768 sink.pageBreak(); 769 } else { 770 if (isEmitComments()) { 771 sink.comment(text); 772 } 773 } 774 } 775 776 @Override 777 protected void handleCdsect(XmlPullParser parser, Sink sink) throws XmlPullParserException { 778 isBeginningOfLineInsideBlock = false; 779 String text = getText(parser); 780 781 if (isScriptBlock()) { 782 sink.unknown(CDATA, new Object[] {CDATA_TYPE, text}, null); 783 } else { 784 sink.text(text); 785 } 786 } 787 788 /** 789 * Shortcut for {@link #emitHeadingSections(int, Sink, boolean)} with last argument being {@code true} 790 * @param newLevel 791 * @param sink 792 * @param attribs 793 * @deprecated Use {@link #emitHeadingSections(int, Sink, boolean)} instead. 794 */ 795 @Deprecated 796 protected void consecutiveSections(int newLevel, Sink sink, SinkEventAttributeSet attribs) { 797 emitHeadingSections(newLevel, sink, true); 798 } 799 800 /** 801 * Make sure sections are nested consecutively and correctly inserted for the given heading level 802 * 803 * <p> 804 * HTML5 heading tags H1 to H5 imply same level sections in Sink API (compare with {@link Sink#sectionTitle(int, SinkEventAttributes)}). 805 * However (X)HTML5 allows headings without explicit surrounding section elements and is also 806 * less strict with non-consecutive heading levels. 807 * This methods both closes open sections which have been added for previous headings and/or opens 808 * sections necessary for the new heading level. 809 * At least one section needs to be opened directly prior the heading due to Sink API restrictions. 
810 * </p> 811 * 812 * <p> 813 * For instance, if the following sequence is parsed: 814 * </p> 815 * <pre> 816 * <h2></h2> 817 * <h5></h5> 818 * </pre> 819 * <p> 820 * we have to insert two section starts before we open the <code><h5></code>. 821 * In the following sequence 822 * </p> 823 * <pre> 824 * <h5></h5> 825 * <h2></h2> 826 * </pre> 827 * <p> 828 * we have to close two sections before we open the <code><h2></code>. 829 * </p> 830 * 831 * <p>The current heading level is set to newLevel afterwards.</p> 832 * 833 * @param newLevel the new section level, all upper levels have to be closed. 834 * @param sink the sink to receive the events. 835 * @param enforceNewSection whether to enforce a new section or not 836 */ 837 protected void emitHeadingSections(int newLevel, Sink sink, boolean enforceNewSection) { 838 int lowerBoundSectionLevel = newLevel; 839 if (enforceNewSection) { 840 // close one more if either last event was not section start or the new level is lower than the current one 841 // (in this case the last event may be a section start event but for another level) 842 if (!isLastEventSectionStart() || newLevel < this.headingLevel) { 843 lowerBoundSectionLevel--; 844 } 845 } 846 closeOpenHeadingSections(lowerBoundSectionLevel, sink); 847 openMissingHeadingSections(newLevel, sink); 848 849 this.headingLevel = newLevel; 850 } 851 852 private boolean isLastEventSectionStart() { 853 String lastEventName = capturedSinkEventNames.pollLast(); 854 if (lastEventName == null) { 855 return false; 856 } 857 return lastEventName.startsWith("section") 858 && !lastEventName.endsWith("_") 859 && !lastEventName.startsWith("sectionTitle"); 860 } 861 862 /** 863 * Close open heading sections. 864 * 865 * @param newLevel the new section level, all upper levels have to be closed. 866 * @param sink the sink to receive the events. 
867 */ 868 private void closeOpenHeadingSections(int newLevel, Sink sink) { 869 while (this.headingLevel > newLevel) { 870 if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) { 871 sink.section_(headingLevel); 872 } 873 874 this.headingLevel--; 875 } 876 // enforce the previous element is a section 877 } 878 879 /** 880 * Open missing heading sections. 881 * 882 * @param newLevel the new section level, all lower levels have to be opened. 883 * @param sink the sink to receive the events. 884 */ 885 private void openMissingHeadingSections(int newLevel, Sink sink) { 886 while (this.headingLevel < newLevel) { 887 this.headingLevel++; 888 889 if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) { 890 sink.section(headingLevel, null); 891 } 892 } 893 } 894 895 /** 896 * Return the current section level. 897 * 898 * @return the current section level. 899 */ 900 protected int getSectionLevel() { 901 return this.headingLevel; 902 } 903 904 /** 905 * Set the current section level. 906 * 907 * @param newLevel the new section level. 908 */ 909 protected void setSectionLevel(int newLevel) { 910 this.headingLevel = newLevel; 911 } 912 913 /** 914 * Stop verbatim mode. 915 */ 916 protected void verbatim_() { 917 this.inVerbatim = false; 918 } 919 920 /** 921 * Start verbatim mode. 922 */ 923 protected void verbatim() { 924 this.inVerbatim = true; 925 } 926 927 /** 928 * Checks if we are currently inside a <pre> tag. 929 * 930 * @return true if we are currently in verbatim mode. 931 */ 932 protected boolean isVerbatim() { 933 return this.inVerbatim; 934 } 935 936 /** 937 * Checks if we are currently inside a <script> tag. 938 * 939 * @return true if we are currently inside <code><script></code> tags. 940 * @since 1.1.1. 941 */ 942 protected boolean isScriptBlock() { 943 return this.scriptBlock; 944 } 945 946 /** 947 * Checks if the given id is a valid Doxia id and if not, returns a transformed one. 
948 * 949 * @param id The id to validate. 950 * @return A transformed id or the original id if it was already valid. 951 * @see DoxiaUtils#encodeId(String) 952 */ 953 protected String validAnchor(String id) { 954 if (!DoxiaUtils.isValidId(id)) { 955 String linkAnchor = DoxiaUtils.encodeId(id); 956 957 LOGGER.debug("Modified invalid link '{}' to '{}'", id, linkAnchor); 958 959 return linkAnchor; 960 } 961 962 return id; 963 } 964 965 @Override 966 protected void init() { 967 super.init(); 968 969 this.scriptBlock = false; 970 this.isLink = false; 971 this.isAnchor = false; 972 this.orderedListDepth = 0; 973 this.headingLevel = 0; 974 this.inVerbatim = false; 975 } 976 977 private void handleAEnd(Sink sink) { 978 if (isLink) { 979 sink.link_(); 980 isLink = false; 981 } else if (isAnchor) { 982 sink.anchor_(); 983 isAnchor = false; 984 } 985 } 986 987 private void handleAStart(Sink sink, SinkEventAttributeSet attribs) { 988 String href = (String) attribs.getAttribute(Attribute.HREF.toString()); 989 990 if (href != null) { 991 int hashIndex = href.indexOf('#'); 992 if (hashIndex != -1 993 && !DoxiaUtils.isExternalLink(href) 994 && !"external".equals(attribs.getAttribute(Attribute.REL.toString()))) { 995 String hash = href.substring(hashIndex + 1); 996 997 if (!DoxiaUtils.isValidId(hash)) { 998 href = href.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash); 999 1000 LOGGER.debug("Modified invalid link '{}' to '{}'", hash, href); 1001 } 1002 } 1003 sink.link(href, attribs); 1004 isLink = true; 1005 } else { 1006 String id = (String) attribs.getAttribute(Attribute.ID.toString()); 1007 if (id == null) { 1008 // although the "name" attribute is obsolete in HTML5, it is still allowed 1009 // (https://www.w3.org/TR/html5-diff/#obsolete-attributes) 1010 id = (String) attribs.getAttribute(Attribute.NAME.toString()); 1011 } 1012 if (id != null) { 1013 sink.anchor(validAnchor(id), attribs); 1014 isAnchor = true; 1015 } 1016 } 1017 } 1018 1019 private boolean 
handleDivStart(SinkEventAttributeSet attribs, Sink sink) { 1020 String divClass = (String) attribs.getAttribute(Attribute.CLASS.toString()); 1021 1022 this.divStack.push(divClass); 1023 1024 if ("content".equals(divClass)) { 1025 SinkEventAttributeSet atts = new SinkEventAttributeSet(attribs); 1026 atts.removeAttribute(SinkEventAttributes.CLASS); 1027 sink.content(atts); 1028 } 1029 if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) { 1030 return false; 1031 } else { 1032 sink.division(attribs); 1033 } 1034 1035 return true; 1036 } 1037 1038 private boolean handleDivEnd(Sink sink) { 1039 String divClass = divStack.pop(); 1040 1041 if ("content".equals(divClass)) { 1042 sink.content_(); 1043 } 1044 if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) { 1045 return false; 1046 } else { 1047 sink.division_(); 1048 } 1049 1050 return true; 1051 } 1052 1053 private void handleImgStart(Sink sink, SinkEventAttributeSet attribs) { 1054 String src = (String) attribs.getAttribute(Attribute.SRC.toString()); 1055 1056 if (src != null) { 1057 sink.figureGraphics(src, attribs); 1058 } 1059 } 1060 1061 private void handleLIStart(Sink sink, SinkEventAttributeSet attribs) { 1062 if (orderedListDepth == 0) { 1063 sink.listItem(attribs); 1064 } else { 1065 sink.numberedListItem(attribs); 1066 } 1067 } 1068 1069 private void handleListItemEnd(Sink sink) { 1070 if (orderedListDepth == 0) { 1071 sink.listItem_(); 1072 } else { 1073 sink.numberedListItem_(); 1074 } 1075 } 1076 1077 private void handleOLStart(Sink sink, SinkEventAttributeSet attribs) { 1078 int numbering = Sink.NUMBERING_DECIMAL; 1079 // this will have to be generalized if we handle styles 1080 String style = (String) attribs.getAttribute(Attribute.STYLE.toString()); 1081 1082 if (style != null) { 1083 switch (style) { 1084 case "list-style-type: upper-alpha;": 1085 numbering = Sink.NUMBERING_UPPER_ALPHA; 1086 break; 1087 case "list-style-type: lower-alpha;": 1088 numbering = 
Sink.NUMBERING_LOWER_ALPHA; 1089 break; 1090 case "list-style-type: upper-roman;": 1091 numbering = Sink.NUMBERING_UPPER_ROMAN; 1092 break; 1093 case "list-style-type: lower-roman;": 1094 numbering = Sink.NUMBERING_LOWER_ROMAN; 1095 break; 1096 case "list-style-type: decimal;": 1097 numbering = Sink.NUMBERING_DECIMAL; 1098 break; 1099 default: 1100 // ignore all other 1101 } 1102 } 1103 1104 sink.numberedList(numbering, attribs); 1105 orderedListDepth++; 1106 } 1107 1108 private void handlePStart(Sink sink, SinkEventAttributeSet attribs) { 1109 sink.paragraph(attribs); 1110 } 1111 1112 /* 1113 * The PRE element tells visual user agents that the enclosed text is 1114 * "preformatted". When handling preformatted text, visual user agents: 1115 * - May leave white space intact. 1116 * - May render text with a fixed-pitch font. 1117 * - May disable automatic word wrap. 1118 * - Must not disable bidirectional processing. 1119 * Non-visual user agents are not required to respect extra white space 1120 * in the content of a PRE element. 
1121 */ 1122 private void handlePreStart(SinkEventAttributeSet attribs, Sink sink) { 1123 verbatim(); 1124 sink.verbatim(attribs); 1125 } 1126 1127 private void handleSectionStart(Sink sink, SinkEventAttributeSet attribs) { 1128 emitHeadingSections(sectionLevel, sink, false); 1129 sink.section(++sectionLevel, attribs); 1130 this.headingLevel = sectionLevel; 1131 } 1132 1133 private void handleHeadingStart(Sink sink, int level, SinkEventAttributeSet attribs) { 1134 emitHeadingSections(level, sink, true); 1135 sink.sectionTitle(level, attribs); 1136 } 1137 1138 private void handleSectionEnd(Sink sink) { 1139 emitHeadingSections(sectionLevel, sink, false); 1140 sink.section_(sectionLevel--); 1141 this.headingLevel = sectionLevel; 1142 } 1143 1144 private void handleTableStart(Sink sink, SinkEventAttributeSet attribs) { 1145 sink.table(attribs); 1146 String givenTableClass = (String) attribs.getAttribute(Attribute.CLASS.toString()); 1147 boolean grid = false; 1148 if (givenTableClass != null 1149 && BODYTABLEBORDER_CLASS_PATTERN.matcher(givenTableClass).matches()) { 1150 grid = true; 1151 } 1152 1153 sink.tableRows(null, grid); 1154 } 1155}