/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.maven.doxia.parser;

import javax.swing.text.html.HTML.Attribute;

import java.io.Reader;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Pattern;

import org.apache.maven.doxia.macro.MacroExecutionException;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.sink.Sink;
import org.apache.maven.doxia.sink.SinkEventAttributes;
import org.apache.maven.doxia.sink.impl.EventCapturingSinkProxy;
import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
import org.apache.maven.doxia.util.DoxiaUtils;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Common base parser for xhtml5 events.
 *
 * <p>
 * Translates XHTML5 body-level start/end tags, text, comments and CDATA sections into {@link Sink}
 * events. Tags without a dedicated Sink event are forwarded through {@code handleUnknown}.
 * </p>
 */
public class Xhtml5BaseParser extends AbstractXmlParser implements HtmlMarkup {
    private static final Logger LOGGER = LoggerFactory.getLogger(Xhtml5BaseParser.class);

    /** Used to identify if a class string contains <code>bodyTableBorder</code>. */
    private static final Pattern BODYTABLEBORDER_CLASS_PATTERN =
            Pattern.compile("(?:.*\\s|^)bodyTableBorder(?:\\s.*|$)");

    /** Elements with start and end tag that have no dedicated Sink event and are emitted as unknown events. */
    private static final Set<String> UNMATCHED_XHTML5_ELEMENTS = tagNames(
            HtmlMarkup.AUDIO,
            HtmlMarkup.BUTTON,
            HtmlMarkup.CANVAS,
            HtmlMarkup.COLGROUP,
            HtmlMarkup.COMMAND,
            HtmlMarkup.DATA,
            HtmlMarkup.DATALIST,
            HtmlMarkup.DETAILS,
            HtmlMarkup.DIALOG,
            HtmlMarkup.FIELDSET,
            HtmlMarkup.FORM,
            HtmlMarkup.HGROUP,
            HtmlMarkup.IFRAME,
            HtmlMarkup.LABEL,
            HtmlMarkup.LEGEND,
            HtmlMarkup.MAP,
            HtmlMarkup.MENU,
            HtmlMarkup.METER,
            HtmlMarkup.NOSCRIPT,
            HtmlMarkup.OBJECT,
            HtmlMarkup.OPTGROUP,
            HtmlMarkup.OPTION,
            HtmlMarkup.OUTPUT,
            HtmlMarkup.PICTURE,
            HtmlMarkup.PROGRESS,
            HtmlMarkup.SELECT,
            HtmlMarkup.SUMMARY,
            HtmlMarkup.SVG,
            HtmlMarkup.TEMPLATE,
            HtmlMarkup.TEXTAREA,
            HtmlMarkup.TBODY,
            HtmlMarkup.THEAD,
            HtmlMarkup.TFOOT,
            HtmlMarkup.TIME,
            // VAR is also an inline element; the inline handling takes precedence in both tag handlers
            HtmlMarkup.VAR,
            HtmlMarkup.VIDEO);

    /** Empty (simple) elements that have no dedicated Sink event and are emitted as unknown events. */
    private static final Set<String> UNMATCHED_XHTML5_SIMPLE_ELEMENTS = tagNames(
            HtmlMarkup.AREA,
            HtmlMarkup.COL,
            HtmlMarkup.EMBED,
            HtmlMarkup.INPUT,
            HtmlMarkup.KEYGEN,
            HtmlMarkup.MENUITEM,
            HtmlMarkup.PARAM,
            HtmlMarkup.SOURCE,
            HtmlMarkup.TRACK);

    /**
     * Maps the name of each inline element to the semantics attribute which is added to the element's
     * attributes before {@link Sink#inline(SinkEventAttributes)} is emitted.
     */
    private static final Map<String, SinkEventAttributes> INLINE_SEMANTICS = inlineSemantics();

    /**
     * True if a {@code <script></script>} or {@code <style></style>} block is read. CDATA sections within are
     * handled as rawText.
     */
    private boolean scriptBlock;

    /** Used to distinguish {@code <a href="">} from {@code <a name="">}. */
    private boolean isLink;

    /** Used to distinguish {@code <a href="">} from {@code <a name="">}. */
    private boolean isAnchor;

    /** Used for nested lists. */
    private int orderedListDepth = 0;

    /** Counts section nesting level of the sections manually set in the HTML document. */
    private int sectionLevel;

    /**
     * Counts current heading level. This is either the {@link #sectionLevel} if no artificial sections are
     * currently open for headings or a number higher or lower than {@link #sectionLevel} (for all sections
     * currently opened/closed for a preceding heading).
     * The heading level only changes when a new heading starts, or a section starts or ends.
     */
    private int headingLevel;

    /** Verbatim flag, true whenever we are inside a {@code <pre>} tag. */
    private boolean inVerbatim;

    /**
     * Used to keep track of closing tags for content events. Holds the class attribute value of every open
     * {@code <div>}; the value is {@code null} for a div without class attribute.
     */
    private Stack<String> divStack = new Stack<>();

    /** Used to wrap the definedTerm with its definition, even when one is omitted. */
    boolean hasDefinitionListItem = false;

    /** Names of the Sink events emitted during the current parse, most recent last. */
    private LinkedList<String> capturedSinkEventNames;

    /**
     * Collects the names of the given tags in an unmodifiable set.
     *
     * @param tags the tags, their {@code toString()} value is used as element name.
     * @return an unmodifiable set with all element names.
     */
    private static Set<String> tagNames(Object... tags) {
        Set<String> names = new HashSet<>();
        for (Object tag : tags) {
            names.add(tag.toString());
        }
        return Collections.unmodifiableSet(names);
    }

    /**
     * Builds the lookup table for {@link #INLINE_SEMANTICS}.
     *
     * @return an unmodifiable map from inline element name to its semantics attribute.
     */
    private static Map<String, SinkEventAttributes> inlineSemantics() {
        Map<String, SinkEventAttributes> semantics = new HashMap<>();
        semantics.put(HtmlMarkup.EM.toString(), SinkEventAttributeSet.Semantics.EMPHASIS);
        semantics.put(HtmlMarkup.STRONG.toString(), SinkEventAttributeSet.Semantics.STRONG);
        semantics.put(HtmlMarkup.SMALL.toString(), SinkEventAttributeSet.Semantics.SMALL);
        /* deprecated line-through support */
        semantics.put(HtmlMarkup.S.toString(), SinkEventAttributeSet.Semantics.LINE_THROUGH);
        semantics.put(HtmlMarkup.CITE.toString(), SinkEventAttributeSet.Semantics.CITATION);
        semantics.put(HtmlMarkup.Q.toString(), SinkEventAttributeSet.Semantics.QUOTE);
        semantics.put(HtmlMarkup.DFN.toString(), SinkEventAttributeSet.Semantics.DEFINITION);
        semantics.put(HtmlMarkup.ABBR.toString(), SinkEventAttributeSet.Semantics.ABBREVIATION);
        semantics.put(HtmlMarkup.I.toString(), SinkEventAttributeSet.Semantics.ITALIC);
        semantics.put(HtmlMarkup.B.toString(), SinkEventAttributeSet.Semantics.BOLD);
        semantics.put(HtmlMarkup.CODE.toString(), SinkEventAttributeSet.Semantics.CODE);
        semantics.put(HtmlMarkup.VAR.toString(), SinkEventAttributeSet.Semantics.VARIABLE);
        semantics.put(HtmlMarkup.SAMP.toString(), SinkEventAttributeSet.Semantics.SAMPLE);
        semantics.put(HtmlMarkup.KBD.toString(), SinkEventAttributeSet.Semantics.KEYBOARD);
        semantics.put(HtmlMarkup.SUP.toString(), SinkEventAttributeSet.Semantics.SUPERSCRIPT);
        semantics.put(HtmlMarkup.SUB.toString(), SinkEventAttributeSet.Semantics.SUBSCRIPT);
        semantics.put(HtmlMarkup.U.toString(), SinkEventAttributeSet.Semantics.ANNOTATION);
        semantics.put(HtmlMarkup.MARK.toString(), SinkEventAttributeSet.Semantics.HIGHLIGHT);
        semantics.put(HtmlMarkup.RUBY.toString(), SinkEventAttributeSet.Semantics.RUBY);
        semantics.put(HtmlMarkup.RB.toString(), SinkEventAttributeSet.Semantics.RUBY_BASE);
        semantics.put(HtmlMarkup.RT.toString(), SinkEventAttributeSet.Semantics.RUBY_TEXT);
        semantics.put(HtmlMarkup.RTC.toString(), SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER);
        semantics.put(HtmlMarkup.RP.toString(), SinkEventAttributeSet.Semantics.RUBY_PARANTHESES);
        semantics.put(HtmlMarkup.BDI.toString(), SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION);
        semantics.put(HtmlMarkup.BDO.toString(), SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE);
        semantics.put(HtmlMarkup.SPAN.toString(), SinkEventAttributeSet.Semantics.PHRASE);
        semantics.put(HtmlMarkup.INS.toString(), SinkEventAttributeSet.Semantics.INSERT);
        semantics.put(HtmlMarkup.DEL.toString(), SinkEventAttributeSet.Semantics.DELETE);
        return Collections.unmodifiableMap(semantics);
    }

    /**
     * {@inheritDoc}
     *
     * Wraps the given sink in a proxy recording the names of all emitted events (needed to decide whether a
     * heading directly follows a section start) and resets the parser state before and after parsing.
     */
    @Override
    public void parse(Reader source, Sink sink, String reference) throws ParseException {
        init();

        try {
            capturedSinkEventNames = new LinkedList<>();
            Sink capturingSink = EventCapturingSinkProxy.newInstance(sink, capturedSinkEventNames);
            super.parse(source, capturingSink, reference);
        } finally {
            setSecondParsing(false);
            init();
        }
    }

    /**
     * {@inheritDoc}
     *
     * Adds all XHTML (HTML 5.2) entities to the parser so that they can be recognized and resolved
     * without additional DTD.
     */
    @Override
    protected void initXmlParser(XmlPullParser parser) throws XmlPullParserException {
        super.initXmlParser(parser);
    }

    /**
     * <p>
     *   Goes through a common list of possible html5 start tags. These include only tags that can go into
     *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
     * </p>
     * <p>
     *   The currently handled tags are:
     * </p>
     * <p>
     *   {@code <article>, <nav>, <aside>, <section>, <h1>, <h2>, <h3>,
     *   <h4>, <h5>, <h6>, <header>, <main>, <footer>, <em>, <strong>,
     *   <small>, <s>, <cite>, <q>, <dfn>, <abbr>, <i>,
     *   <b>, <code>, <var>, <samp>, <kbd>, <sub>, <sup>, <u>,
     *   <mark>, <ruby>, <rb>, <rt>, <rtc>, <rp>, <bdi>,
     *   <bdo>, <span>, <ins>, <del>, <p>, <div>, <pre>, <ul>,
     *   <ol>, <li>, <dl>, <dt>, <dd>, <figure>, <figcaption>, <a>, <table>,
     *   <tr>, <th>, <td>, <caption>, <blockquote>, <br/>, <wbr/>, <hr/>,
     *   <img/>}.
     * </p>
     * <p>
     *   In addition {@code <script>}, {@code <style>} and a set of further XHTML5 elements without dedicated
     *   Sink event are recognized and passed on as unknown events.
     * </p>
     *
     * @param parser A parser.
     * @param sink the sink to receive the events.
     * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
     */
    protected boolean baseStartTag(XmlPullParser parser, Sink sink) {
        SinkEventAttributeSet attribs = getAttributesFromParser(parser);
        return baseStartTag(parser.getName(), attribs, sink);
    }

    /**
     * Emits the Sink event(s) for the start tag of the given element.
     *
     * @param elementName the name of the element.
     * @param attribs the attributes of the element, for inline elements the semantics attribute is added to it.
     * @param sink the sink to receive the events.
     * @return True if the tag was recognized, false otherwise.
     * @see #baseStartTag(XmlPullParser, Sink)
     */
    protected boolean baseStartTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
        SinkEventAttributes semantics = INLINE_SEMANTICS.get(elementName);
        if (semantics != null) {
            attribs.addAttributes(semantics);
            sink.inline(attribs);
            return true;
        }

        boolean visited = true;

        if (elementName.equals(HtmlMarkup.ARTICLE.toString())) {
            sink.article(attribs);
        } else if (elementName.equals(HtmlMarkup.NAV.toString())) {
            sink.navigation(attribs);
        } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) {
            sink.sidebar(attribs);
        } else if (elementName.equals(HtmlMarkup.SECTION.toString())) {
            handleSectionStart(sink, attribs);
        } else if (elementName.equals(HtmlMarkup.H1.toString())) {
            handleHeadingStart(sink, Sink.SECTION_LEVEL_1, attribs);
        } else if (elementName.equals(HtmlMarkup.H2.toString())) {
            handleHeadingStart(sink, Sink.SECTION_LEVEL_2, attribs);
        } else if (elementName.equals(HtmlMarkup.H3.toString())) {
            handleHeadingStart(sink, Sink.SECTION_LEVEL_3, attribs);
        } else if (elementName.equals(HtmlMarkup.H4.toString())) {
            handleHeadingStart(sink, Sink.SECTION_LEVEL_4, attribs);
        } else if (elementName.equals(HtmlMarkup.H5.toString())) {
            handleHeadingStart(sink, Sink.SECTION_LEVEL_5, attribs);
        } else if (elementName.equals(HtmlMarkup.H6.toString())) {
            handleHeadingStart(sink, Sink.SECTION_LEVEL_6, attribs);
        } else if (elementName.equals(HtmlMarkup.HEADER.toString())) {
            sink.header(attribs);
        } else if (elementName.equals(HtmlMarkup.MAIN.toString())) {
            sink.content(attribs);
        } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) {
            sink.footer(attribs);
        } else if (elementName.equals(HtmlMarkup.P.toString())) {
            handlePStart(sink, attribs);
        } else if (elementName.equals(HtmlMarkup.DIV.toString())) {
            handleDivStart(attribs, sink);
        } else if (elementName.equals(HtmlMarkup.PRE.toString())) {
            handlePreStart(attribs, sink);
        } else if (elementName.equals(HtmlMarkup.UL.toString())) {
            sink.list(attribs);
        } else if (elementName.equals(HtmlMarkup.OL.toString())) {
            handleOLStart(sink, attribs);
        } else if (elementName.equals(HtmlMarkup.LI.toString())) {
            handleLIStart(sink, attribs);
        } else if (elementName.equals(HtmlMarkup.DL.toString())) {
            sink.definitionList(attribs);
        } else if (elementName.equals(HtmlMarkup.DT.toString())) {
            if (hasDefinitionListItem) {
                // close previous listItem
                sink.definitionListItem_();
            }
            sink.definitionListItem(attribs);
            hasDefinitionListItem = true;
            sink.definedTerm(attribs);
        } else if (elementName.equals(HtmlMarkup.DD.toString())) {
            if (!hasDefinitionListItem) {
                // definition without preceding term: open the wrapping list item here
                sink.definitionListItem(attribs);
            }
            sink.definition(attribs);
        } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) {
            sink.figure(attribs);
        } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) {
            sink.figureCaption(attribs);
        } else if (elementName.equals(HtmlMarkup.A.toString())) {
            handleAStart(sink, attribs);
        } else if (elementName.equals(HtmlMarkup.TABLE.toString())) {
            handleTableStart(sink, attribs);
        } else if (elementName.equals(HtmlMarkup.TR.toString())) {
            sink.tableRow(attribs);
        } else if (elementName.equals(HtmlMarkup.TH.toString())) {
            sink.tableHeaderCell(attribs);
        } else if (elementName.equals(HtmlMarkup.TD.toString())) {
            sink.tableCell(attribs);
        } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) {
            sink.tableCaption(attribs);
        } else if (elementName.equals(HtmlMarkup.BR.toString())) {
            sink.lineBreak(attribs);
        } else if (elementName.equals(HtmlMarkup.WBR.toString())) {
            sink.lineBreakOpportunity(attribs);
        } else if (elementName.equals(HtmlMarkup.HR.toString())) {
            sink.horizontalRule(attribs);
        } else if (elementName.equals(HtmlMarkup.IMG.toString())) {
            handleImgStart(sink, attribs);
        } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) {
            sink.blockquote(attribs);
        } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
            handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
        } else if (UNMATCHED_XHTML5_SIMPLE_ELEMENTS.contains(elementName)) {
            handleUnknown(elementName, attribs, sink, TAG_TYPE_SIMPLE);
        } else if (elementName.equals(HtmlMarkup.SCRIPT.toString())
                || elementName.equals(HtmlMarkup.STYLE.toString())) {
            handleUnknown(elementName, attribs, sink, TAG_TYPE_START);
            scriptBlock = true;
        } else {
            visited = false;
        }

        return visited;
    }

    /**
     * <p>
     *   Goes through a common list of possible html end tags.
     *   These should be re-usable by different xhtml-based parsers.
     *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
     *   except for the empty elements ({@code <br/>, <hr/>, <img/>}).
     * </p>
     *
     * @param parser A parser.
     * @param sink the sink to receive the events.
     * @return True if the event has been handled by this method, false otherwise.
     */
    protected boolean baseEndTag(XmlPullParser parser, Sink sink) {
        SinkEventAttributeSet attribs = getAttributesFromParser(parser);
        return baseEndTag(parser.getName(), attribs, sink);
    }

    /**
     * Emits the Sink event(s) for the end tag of the given element.
     *
     * @param elementName the name of the element.
     * @param attribs the attributes of the element, only used for elements emitted as unknown events.
     * @param sink the sink to receive the events.
     * @return True if the tag was recognized, false otherwise.
     * @see #baseEndTag(XmlPullParser, Sink)
     */
    protected boolean baseEndTag(String elementName, SinkEventAttributeSet attribs, Sink sink) {
        if (INLINE_SEMANTICS.containsKey(elementName)) {
            sink.inline_();
            return true;
        }

        boolean visited = true;

        if (elementName.equals(HtmlMarkup.P.toString())) {
            sink.paragraph_();
        } else if (elementName.equals(HtmlMarkup.DIV.toString())) {
            handleDivEnd(sink);
        } else if (elementName.equals(HtmlMarkup.PRE.toString())) {
            verbatim_();

            sink.verbatim_();
        } else if (elementName.equals(HtmlMarkup.UL.toString())) {
            sink.list_();
        } else if (elementName.equals(HtmlMarkup.OL.toString())) {
            sink.numberedList_();
            orderedListDepth--;
        } else if (elementName.equals(HtmlMarkup.LI.toString())) {
            handleListItemEnd(sink);
        } else if (elementName.equals(HtmlMarkup.DL.toString())) {
            if (hasDefinitionListItem) {
                // a term without definition left its list item open
                sink.definitionListItem_();
                hasDefinitionListItem = false;
            }
            sink.definitionList_();
        } else if (elementName.equals(HtmlMarkup.DT.toString())) {
            sink.definedTerm_();
        } else if (elementName.equals(HtmlMarkup.DD.toString())) {
            sink.definition_();
            sink.definitionListItem_();
            hasDefinitionListItem = false;
        } else if (elementName.equals(HtmlMarkup.FIGURE.toString())) {
            sink.figure_();
        } else if (elementName.equals(HtmlMarkup.FIGCAPTION.toString())) {
            sink.figureCaption_();
        } else if (elementName.equals(HtmlMarkup.A.toString())) {
            handleAEnd(sink);
        }

        // ----------------------------------------------------------------------
        // Tables
        // ----------------------------------------------------------------------

        else if (elementName.equals(HtmlMarkup.TABLE.toString())) {
            sink.tableRows_();
            sink.table_();
        } else if (elementName.equals(HtmlMarkup.TR.toString())) {
            sink.tableRow_();
        } else if (elementName.equals(HtmlMarkup.TH.toString())) {
            sink.tableHeaderCell_();
        } else if (elementName.equals(HtmlMarkup.TD.toString())) {
            sink.tableCell_();
        } else if (elementName.equals(HtmlMarkup.CAPTION.toString())) {
            sink.tableCaption_();
        } else if (elementName.equals(HtmlMarkup.ARTICLE.toString())) {
            sink.article_();
        } else if (elementName.equals(HtmlMarkup.NAV.toString())) {
            sink.navigation_();
        } else if (elementName.equals(HtmlMarkup.ASIDE.toString())) {
            sink.sidebar_();
        } else if (elementName.equals(HtmlMarkup.SECTION.toString())) {
            handleSectionEnd(sink);
        } else if (elementName.equals(HtmlMarkup.H1.toString())) {
            sink.sectionTitle1_();
        } else if (elementName.equals(HtmlMarkup.H2.toString())) {
            sink.sectionTitle2_();
        } else if (elementName.equals(HtmlMarkup.H3.toString())) {
            sink.sectionTitle3_();
        } else if (elementName.equals(HtmlMarkup.H4.toString())) {
            sink.sectionTitle4_();
        } else if (elementName.equals(HtmlMarkup.H5.toString())) {
            sink.sectionTitle5_();
        } else if (elementName.equals(HtmlMarkup.H6.toString())) {
            sink.sectionTitle6_();
        } else if (elementName.equals(HtmlMarkup.HEADER.toString())) {
            sink.header_();
        } else if (elementName.equals(HtmlMarkup.MAIN.toString())) {
            sink.content_();
        } else if (elementName.equals(HtmlMarkup.FOOTER.toString())) {
            sink.footer_();
        } else if (elementName.equals(HtmlMarkup.BLOCKQUOTE.toString())) {
            sink.blockquote_();
        } else if (UNMATCHED_XHTML5_ELEMENTS.contains(elementName)) {
            handleUnknown(elementName, attribs, sink, TAG_TYPE_END);
        } else if (elementName.equals(HtmlMarkup.SCRIPT.toString())
                || elementName.equals(HtmlMarkup.STYLE.toString())) {
            handleUnknown(elementName, attribs, sink, TAG_TYPE_END);

            scriptBlock = false;
        } else {
            visited = false;
        }

        return visited;
    }

    /**
     * {@inheritDoc}
     *
     * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
     * overridden by implementing parsers to include additional tags.
     * Unrecognized tags are logged with their position as warning.
     */
    @Override
    protected void handleStartTag(XmlPullParser parser, Sink sink)
            throws XmlPullParserException, MacroExecutionException {
        if (!baseStartTag(parser, sink)) {
            LOGGER.warn(
                    "Unrecognized xml tag <{}> at [{}:{}]",
                    parser.getName(),
                    parser.getLineNumber(),
                    parser.getColumnNumber());
        }
    }

    /**
     * {@inheritDoc}
     *
     * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
     * overridden by implementing parsers to include additional tags.
     */
    @Override
    protected void handleEndTag(XmlPullParser parser, Sink sink)
            throws XmlPullParserException, MacroExecutionException {
        // the result is deliberately ignored: an unrecognized tag is already logged in handleStartTag
        baseEndTag(parser, sink);
    }

    /** {@inheritDoc} */
    @Override
    protected void handleText(XmlPullParser parser, Sink sink) throws XmlPullParserException {
        String text = getText(parser);

        /*
         * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
         * parser so any whitespace that makes it here is significant.
         *
         * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
         */
        if ((text != null && !text.isEmpty()) && !isScriptBlock()) {
            sink.text(text);
        }
    }

    /**
     * {@inheritDoc}
     *
     * A comment consisting only of {@code PB} (ignoring surrounding whitespace) is emitted as page break,
     * every other comment is only emitted if comments are enabled.
     */
    @Override
    protected void handleComment(XmlPullParser parser, Sink sink) throws XmlPullParserException {
        String text = getText(parser);

        if ("PB".equals(text.trim())) {
            sink.pageBreak();
        } else {
            if (isEmitComments()) {
                sink.comment(text);
            }
        }
    }

    /**
     * {@inheritDoc}
     *
     * Inside a script or style block the CDATA content is emitted as unknown event, otherwise as text.
     */
    @Override
    protected void handleCdsect(XmlPullParser parser, Sink sink) throws XmlPullParserException {
        String text = getText(parser);

        if (isScriptBlock()) {
            sink.unknown(CDATA, new Object[] {CDATA_TYPE, text}, null);
        } else {
            sink.text(text);
        }
    }

    /**
     * Shortcut for {@link #emitHeadingSections(int, Sink, boolean)} with last argument being {@code true}.
     *
     * @param newLevel the new section level, all upper levels have to be closed.
     * @param sink the sink to receive the events.
     * @param attribs ignored.
     * @deprecated Use {@link #emitHeadingSections(int, Sink, boolean)} instead.
     */
    @Deprecated
    protected void consecutiveSections(int newLevel, Sink sink, SinkEventAttributeSet attribs) {
        emitHeadingSections(newLevel, sink, true);
    }

    /**
     * Make sure sections are nested consecutively and correctly inserted for the given heading level.
     *
     * <p>
     * HTML5 heading tags H1 to H6 imply same level sections in Sink API
     * (compare with {@link Sink#sectionTitle(int, SinkEventAttributes)}).
     * However (X)HTML5 allows headings without explicit surrounding section elements and is also
     * less strict with non-consecutive heading levels.
     * This method both closes open sections which have been added for previous headings and/or opens
     * sections necessary for the new heading level.
     * At least one section needs to be opened directly prior the heading due to Sink API restrictions.
     * </p>
     *
     * <p>
     * For instance, if the following sequence is parsed:
     * </p>
     * <pre>
     * &lt;h2&gt;&lt;/h2&gt;
     * &lt;h5&gt;&lt;/h5&gt;
     * </pre>
     * <p>
     * we have to insert two section starts before we open the <code>&lt;h5&gt;</code>.
     * In the following sequence
     * </p>
     * <pre>
     * &lt;h5&gt;&lt;/h5&gt;
     * &lt;h2&gt;&lt;/h2&gt;
     * </pre>
     * <p>
     * we have to close two sections before we open the <code>&lt;h2&gt;</code>.
     * </p>
     *
     * <p>The current heading level is set to newLevel afterwards.</p>
     *
     * @param newLevel the new section level, all upper levels have to be closed.
     * @param sink the sink to receive the events.
     * @param enforceNewSection whether to enforce a new section or not
     */
    protected void emitHeadingSections(int newLevel, Sink sink, boolean enforceNewSection) {
        int lowerBoundSectionLevel = newLevel;
        if (enforceNewSection) {
            // close one more if either last event was not section start or the new level is lower than the current one
            // (in this case the last event may be a section start event but for another level)
            if (!isLastEventSectionStart() || newLevel < this.headingLevel) {
                lowerBoundSectionLevel--;
            }
        }
        closeOpenHeadingSections(lowerBoundSectionLevel, sink);
        openMissingHeadingSections(newLevel, sink);

        this.headingLevel = newLevel;
    }

    /**
     * Checks whether the most recently emitted Sink event started a section. The recorded events are left
     * untouched.
     *
     * @return true if the last recorded event name starts with {@code section}, but is neither an end event
     * (trailing underscore) nor a section title event; false otherwise or if no event has been recorded.
     */
    private boolean isLastEventSectionStart() {
        if (capturedSinkEventNames == null) {
            // events are only recorded while parse(...) is running
            return false;
        }
        // only inspect the last event: a query must not remove it from the recorded events
        String lastEventName = capturedSinkEventNames.peekLast();
        if (lastEventName == null) {
            return false;
        }
        return lastEventName.startsWith("section")
                && !lastEventName.endsWith("_")
                && !lastEventName.startsWith("sectionTitle");
    }

    /**
     * Close open heading sections.
     *
     * @param newLevel the new section level, all upper levels have to be closed.
     * @param sink the sink to receive the events.
     */
    private void closeOpenHeadingSections(int newLevel, Sink sink) {
        while (this.headingLevel > newLevel) {
            // only levels supported by the Sink API emit an event, the counter is adjusted in any case
            if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
                sink.section_(headingLevel);
            }

            this.headingLevel--;
        }
    }

    /**
     * Open missing heading sections.
     *
     * @param newLevel the new section level, all lower levels have to be opened.
     * @param sink the sink to receive the events.
     */
    private void openMissingHeadingSections(int newLevel, Sink sink) {
        while (this.headingLevel < newLevel) {
            this.headingLevel++;

            // only levels supported by the Sink API emit an event, the counter is adjusted in any case
            if (headingLevel >= Sink.SECTION_LEVEL_1 && headingLevel <= Sink.SECTION_LEVEL_6) {
                sink.section(headingLevel, null);
            }
        }
    }

    /**
     * Return the current section level. This is the current heading level, i.e. it includes the sections
     * opened implicitly for headings.
     *
     * @return the current section level.
     */
    protected int getSectionLevel() {
        return this.headingLevel;
    }

    /**
     * Set the current section level. This sets the current heading level, the nesting level of explicit
     * {@code <section>} elements is not affected.
     *
     * @param newLevel the new section level.
     */
    protected void setSectionLevel(int newLevel) {
        this.headingLevel = newLevel;
    }

    /**
     * Stop verbatim mode.
     */
    protected void verbatim_() {
        this.inVerbatim = false;
    }

    /**
     * Start verbatim mode.
     */
    protected void verbatim() {
        this.inVerbatim = true;
    }

    /**
     * Checks if we are currently inside a {@code <pre>} tag.
     *
     * @return true if we are currently in verbatim mode.
     */
    protected boolean isVerbatim() {
        return this.inVerbatim;
    }

    /**
     * Checks if we are currently inside a {@code <script>} or {@code <style>} tag.
     *
     * @return true if we are currently inside {@code <script>} or {@code <style>} tags.
     * @since 1.1.1.
     */
    protected boolean isScriptBlock() {
        return this.scriptBlock;
    }

    /**
     * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
     *
     * @param id The id to validate.
     * @return A transformed id or the original id if it was already valid.
     * @see DoxiaUtils#encodeId(String)
     */
    protected String validAnchor(String id) {
        if (!DoxiaUtils.isValidId(id)) {
            String linkAnchor = DoxiaUtils.encodeId(id);

            LOGGER.debug("Modified invalid link '{}' to '{}'", id, linkAnchor);

            return linkAnchor;
        }

        return id;
    }

    /**
     * {@inheritDoc}
     *
     * Resets all per-document state of this parser, so that a parse which was aborted in the middle of a
     * document cannot leak nesting state into the next one.
     */
    @Override
    protected void init() {
        super.init();

        this.scriptBlock = false;
        this.isLink = false;
        this.isAnchor = false;
        this.orderedListDepth = 0;
        this.sectionLevel = 0;
        this.headingLevel = 0;
        this.inVerbatim = false;
        this.divStack = new Stack<>();
        this.hasDefinitionListItem = false;
    }

    /**
     * Closes the link or anchor opened by the matching {@code <a>} start tag, if any.
     *
     * @param sink the sink to receive the events.
     */
    private void handleAEnd(Sink sink) {
        if (isLink) {
            sink.link_();
            isLink = false;
        } else if (isAnchor) {
            sink.anchor_();
            isAnchor = false;
        }
    }

    /**
     * Emits a link for an {@code <a>} with href attribute, otherwise an anchor if an id attribute is given.
     * Invalid fragment identifiers of non-external links and invalid anchor ids are encoded.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the element.
     */
    private void handleAStart(Sink sink, SinkEventAttributeSet attribs) {
        String href = (String) attribs.getAttribute(Attribute.HREF.toString());

        if (href != null) {
            int hashIndex = href.indexOf('#');
            if (hashIndex != -1 && !DoxiaUtils.isExternalLink(href)) {
                String hash = href.substring(hashIndex + 1);

                if (!DoxiaUtils.isValidId(hash)) {
                    href = href.substring(0, hashIndex) + "#" + DoxiaUtils.encodeId(hash);

                    LOGGER.debug("Modified invalid link '{}' to '{}'", hash, href);
                }
            }
            sink.link(href, attribs);
            isLink = true;
        } else {
            String id = (String) attribs.getAttribute(Attribute.ID.toString());
            if (id != null) {
                sink.anchor(validAnchor(id), attribs);
                isAnchor = true;
            }
        }
    }

    /**
     * Handles a {@code <div>} start tag. The class attribute value is remembered for the matching end tag.
     * A div with class {@code content} additionally emits a content event (without the class attribute),
     * a div with class {@code verbatim} or {@code verbatim source} emits no division event.
     *
     * @param attribs the attributes of the element.
     * @param sink the sink to receive the events.
     * @return false for a verbatim div, true otherwise.
     */
    private boolean handleDivStart(SinkEventAttributeSet attribs, Sink sink) {
        String divClass = (String) attribs.getAttribute(Attribute.CLASS.toString());

        // may be null, therefore a java.util.Stack is used (ArrayDeque does not permit null elements)
        this.divStack.push(divClass);

        if ("content".equals(divClass)) {
            SinkEventAttributeSet atts = new SinkEventAttributeSet(attribs);
            atts.removeAttribute(SinkEventAttributes.CLASS);
            sink.content(atts);
        }
        if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
            return false;
        } else {
            sink.division(attribs);
        }

        return true;
    }

    /**
     * Handles a {@code </div>} end tag, mirroring the events emitted by
     * {@link #handleDivStart(SinkEventAttributeSet, Sink)} for the matching start tag.
     *
     * @param sink the sink to receive the events.
     * @return false for a verbatim div, true otherwise.
     */
    private boolean handleDivEnd(Sink sink) {
        String divClass = divStack.pop();

        if ("content".equals(divClass)) {
            sink.content_();
        }
        if ("verbatim".equals(divClass) || "verbatim source".equals(divClass)) {
            return false;
        } else {
            sink.division_();
        }

        return true;
    }

    /**
     * Emits a figure graphics event for an {@code <img/>} with src attribute, images without are ignored.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the element.
     */
    private void handleImgStart(Sink sink, SinkEventAttributeSet attribs) {
        String src = (String) attribs.getAttribute(Attribute.SRC.toString());

        if (src != null) {
            sink.figureGraphics(src, attribs);
        }
    }

    /**
     * Starts a list item, numbered if at least one ordered list is currently open.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the element.
     */
    private void handleLIStart(Sink sink, SinkEventAttributeSet attribs) {
        if (orderedListDepth == 0) {
            sink.listItem(attribs);
        } else {
            sink.numberedListItem(attribs);
        }
    }

    /**
     * Ends a list item, numbered if at least one ordered list is currently open.
     *
     * @param sink the sink to receive the events.
     */
    private void handleListItemEnd(Sink sink) {
        if (orderedListDepth == 0) {
            sink.listItem_();
        } else {
            sink.numberedListItem_();
        }
    }

    /**
     * Starts a numbered list. The numbering is derived from the style attribute, which is only recognized
     * if it consists of exactly one of the supported {@code list-style-type} declarations; decimal numbering
     * is used otherwise.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the element.
     */
    private void handleOLStart(Sink sink, SinkEventAttributeSet attribs) {
        int numbering = Sink.NUMBERING_DECIMAL;
        // this will have to be generalized if we handle styles
        String style = (String) attribs.getAttribute(Attribute.STYLE.toString());

        if (style != null) {
            switch (style) {
                case "list-style-type: upper-alpha;":
                    numbering = Sink.NUMBERING_UPPER_ALPHA;
                    break;
                case "list-style-type: lower-alpha;":
                    numbering = Sink.NUMBERING_LOWER_ALPHA;
                    break;
                case "list-style-type: upper-roman;":
                    numbering = Sink.NUMBERING_UPPER_ROMAN;
                    break;
                case "list-style-type: lower-roman;":
                    numbering = Sink.NUMBERING_LOWER_ROMAN;
                    break;
                case "list-style-type: decimal;":
                    numbering = Sink.NUMBERING_DECIMAL;
                    break;
                default:
                    // ignore all other
            }
        }

        sink.numberedList(numbering, attribs);
        orderedListDepth++;
    }

    /**
     * Starts a paragraph.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the element.
     */
    private void handlePStart(Sink sink, SinkEventAttributeSet attribs) {
        sink.paragraph(attribs);
    }

    /*
     * The PRE element tells visual user agents that the enclosed text is
     * "preformatted". When handling preformatted text, visual user agents:
     * - May leave white space intact.
     * - May render text with a fixed-pitch font.
     * - May disable automatic word wrap.
     * - Must not disable bidirectional processing.
     * Non-visual user agents are not required to respect extra white space
     * in the content of a PRE element.
     */
    private void handlePreStart(SinkEventAttributeSet attribs, Sink sink) {
        verbatim();
        sink.verbatim(attribs);
    }

    /**
     * Starts an explicit section: closes the sections implicitly opened for preceding headings down to the
     * current explicit nesting level and opens a section one level deeper.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the element.
     */
    private void handleSectionStart(Sink sink, SinkEventAttributeSet attribs) {
        emitHeadingSections(sectionLevel, sink, false);
        sink.section(++sectionLevel, attribs);
        this.headingLevel = sectionLevel;
    }

    /**
     * Starts a heading: makes sure the heading is directly preceded by a section start of the same level and
     * emits the section title.
     *
     * @param sink the sink to receive the events.
     * @param level the heading level.
     * @param attribs the attributes of the element.
     */
    private void handleHeadingStart(Sink sink, int level, SinkEventAttributeSet attribs) {
        emitHeadingSections(level, sink, true);
        sink.sectionTitle(level, attribs);
    }

    /**
     * Ends an explicit section: closes the sections implicitly opened for headings inside of it first and
     * afterwards the explicit section itself.
     *
     * @param sink the sink to receive the events.
     */
    private void handleSectionEnd(Sink sink) {
        emitHeadingSections(sectionLevel, sink, false);
        sink.section_(sectionLevel--);
        this.headingLevel = sectionLevel;
    }

    /**
     * Starts a table including its rows. The grid is enabled if the class attribute contains
     * {@code bodyTableBorder}.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the element.
     */
    private void handleTableStart(Sink sink, SinkEventAttributeSet attribs) {
        sink.table(attribs);
        String givenTableClass = (String) attribs.getAttribute(Attribute.CLASS.toString());
        boolean grid = false;
        if (givenTableClass != null
                && BODYTABLEBORDER_CLASS_PATTERN.matcher(givenTableClass).matches()) {
            grid = true;
        }

        sink.tableRows(null, grid);
    }
}