1 package org.apache.maven.doxia.parser;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.Reader;
23
24 import javax.swing.text.html.HTML.Attribute;
25
26 import org.apache.maven.doxia.macro.MacroExecutionException;
27 import org.apache.maven.doxia.markup.HtmlMarkup;
28 import org.apache.maven.doxia.sink.Sink;
29 import org.apache.maven.doxia.sink.SinkEventAttributes;
30 import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
31 import org.apache.maven.doxia.util.DoxiaUtils;
32
33 import org.codehaus.plexus.util.StringUtils;
34 import org.codehaus.plexus.util.xml.pull.XmlPullParser;
35 import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
36 import org.slf4j.Logger;
37 import org.slf4j.LoggerFactory;
38
39
40
41
42
43
44
45
46 public class XhtmlBaseParser
47 extends AbstractXmlParser
48 implements HtmlMarkup
49 {
/** Logger shared by all instances of this parser. */
private static final Logger LOGGER = LoggerFactory.getLogger( XhtmlBaseParser.class );

/**
 * Set when a script or style start tag is seen and cleared on the matching end tag.
 * While set, plain text events are dropped and CDATA sections are emitted as unknown events.
 */
private boolean scriptBlock;

/** Set when the last handled anchor start tag was emitted as a sink link; cleared when it is closed. */
private boolean isLink;

/** Set when the last handled anchor start tag was emitted as a sink anchor; cleared when it is closed. */
private boolean isAnchor;

/** Incremented on every ordered list start, decremented on its end; non-zero selects numbered list items. */
private int orderedListDepth = 0;

/** Level of the section that was opened last; reset to 0 by init(). */
private int sectionLevel;

/** Verbatim flag, switched by verbatim() and verbatim_(). */
private boolean inVerbatim;

/** Set while inside a div whose class attribute is "figure". */
private boolean inFigure;

/** Set once a definition term has opened a definition list item that has not been closed yet. */
boolean hasDefinitionListItem = false;
78
79
80 @Override
81 public void parse( Reader source, Sink sink, String reference )
82 throws ParseException
83 {
84 init();
85
86 try
87 {
88 super.parse( source, sink, reference );
89 }
90 finally
91 {
92 setSecondParsing( false );
93 init();
94 }
95 }
96
97
98
99
100
101
102
103 @Override
104 protected void initXmlParser( XmlPullParser parser )
105 throws XmlPullParserException
106 {
107 super.initXmlParser( parser );
108
109
110
111
112
113
114
115
116 parser.defineEntityReplacementText( "nbsp", "\u00a0" );
117 parser.defineEntityReplacementText( "iexcl", "\u00a1" );
118 parser.defineEntityReplacementText( "cent", "\u00a2" );
119 parser.defineEntityReplacementText( "pound", "\u00a3" );
120 parser.defineEntityReplacementText( "curren", "\u00a4" );
121 parser.defineEntityReplacementText( "yen", "\u00a5" );
122 parser.defineEntityReplacementText( "brvbar", "\u00a6" );
123 parser.defineEntityReplacementText( "sect", "\u00a7" );
124 parser.defineEntityReplacementText( "uml", "\u00a8" );
125 parser.defineEntityReplacementText( "copy", "\u00a9" );
126 parser.defineEntityReplacementText( "ordf", "\u00aa" );
127 parser.defineEntityReplacementText( "laquo", "\u00ab" );
128 parser.defineEntityReplacementText( "not", "\u00ac" );
129 parser.defineEntityReplacementText( "shy", "\u00ad" );
130 parser.defineEntityReplacementText( "reg", "\u00ae" );
131 parser.defineEntityReplacementText( "macr", "\u00af" );
132 parser.defineEntityReplacementText( "deg", "\u00b0" );
133 parser.defineEntityReplacementText( "plusmn", "\u00b1" );
134 parser.defineEntityReplacementText( "sup2", "\u00b2" );
135 parser.defineEntityReplacementText( "sup3", "\u00b3" );
136 parser.defineEntityReplacementText( "acute", "\u00b4" );
137 parser.defineEntityReplacementText( "micro", "\u00b5" );
138 parser.defineEntityReplacementText( "para", "\u00b6" );
139 parser.defineEntityReplacementText( "middot", "\u00b7" );
140 parser.defineEntityReplacementText( "cedil", "\u00b8" );
141 parser.defineEntityReplacementText( "sup1", "\u00b9" );
142 parser.defineEntityReplacementText( "ordm", "\u00ba" );
143 parser.defineEntityReplacementText( "raquo", "\u00bb" );
144 parser.defineEntityReplacementText( "frac14", "\u00bc" );
145 parser.defineEntityReplacementText( "frac12", "\u00bd" );
146 parser.defineEntityReplacementText( "frac34", "\u00be" );
147 parser.defineEntityReplacementText( "iquest", "\u00bf" );
148 parser.defineEntityReplacementText( "Agrave", "\u00c0" );
149 parser.defineEntityReplacementText( "Aacute", "\u00c1" );
150 parser.defineEntityReplacementText( "Acirc", "\u00c2" );
151 parser.defineEntityReplacementText( "Atilde", "\u00c3" );
152 parser.defineEntityReplacementText( "Auml", "\u00c4" );
153 parser.defineEntityReplacementText( "Aring", "\u00c5" );
154 parser.defineEntityReplacementText( "AElig", "\u00c6" );
155 parser.defineEntityReplacementText( "Ccedil", "\u00c7" );
156 parser.defineEntityReplacementText( "Egrave", "\u00c8" );
157 parser.defineEntityReplacementText( "Eacute", "\u00c9" );
158 parser.defineEntityReplacementText( "Ecirc", "\u00ca" );
159 parser.defineEntityReplacementText( "Euml", "\u00cb" );
160 parser.defineEntityReplacementText( "Igrave", "\u00cc" );
161 parser.defineEntityReplacementText( "Iacute", "\u00cd" );
162 parser.defineEntityReplacementText( "Icirc", "\u00ce" );
163 parser.defineEntityReplacementText( "Iuml", "\u00cf" );
164 parser.defineEntityReplacementText( "ETH", "\u00d0" );
165 parser.defineEntityReplacementText( "Ntilde", "\u00d1" );
166 parser.defineEntityReplacementText( "Ograve", "\u00d2" );
167 parser.defineEntityReplacementText( "Oacute", "\u00d3" );
168 parser.defineEntityReplacementText( "Ocirc", "\u00d4" );
169 parser.defineEntityReplacementText( "Otilde", "\u00d5" );
170 parser.defineEntityReplacementText( "Ouml", "\u00d6" );
171 parser.defineEntityReplacementText( "times", "\u00d7" );
172 parser.defineEntityReplacementText( "Oslash", "\u00d8" );
173 parser.defineEntityReplacementText( "Ugrave", "\u00d9" );
174 parser.defineEntityReplacementText( "Uacute", "\u00da" );
175 parser.defineEntityReplacementText( "Ucirc", "\u00db" );
176 parser.defineEntityReplacementText( "Uuml", "\u00dc" );
177 parser.defineEntityReplacementText( "Yacute", "\u00dd" );
178 parser.defineEntityReplacementText( "THORN", "\u00de" );
179 parser.defineEntityReplacementText( "szlig", "\u00df" );
180 parser.defineEntityReplacementText( "agrave", "\u00e0" );
181 parser.defineEntityReplacementText( "aacute", "\u00e1" );
182 parser.defineEntityReplacementText( "acirc", "\u00e2" );
183 parser.defineEntityReplacementText( "atilde", "\u00e3" );
184 parser.defineEntityReplacementText( "auml", "\u00e4" );
185 parser.defineEntityReplacementText( "aring", "\u00e5" );
186 parser.defineEntityReplacementText( "aelig", "\u00e6" );
187 parser.defineEntityReplacementText( "ccedil", "\u00e7" );
188 parser.defineEntityReplacementText( "egrave", "\u00e8" );
189 parser.defineEntityReplacementText( "eacute", "\u00e9" );
190 parser.defineEntityReplacementText( "ecirc", "\u00ea" );
191 parser.defineEntityReplacementText( "euml", "\u00eb" );
192 parser.defineEntityReplacementText( "igrave", "\u00ec" );
193 parser.defineEntityReplacementText( "iacute", "\u00ed" );
194 parser.defineEntityReplacementText( "icirc", "\u00ee" );
195 parser.defineEntityReplacementText( "iuml", "\u00ef" );
196 parser.defineEntityReplacementText( "eth", "\u00f0" );
197 parser.defineEntityReplacementText( "ntilde", "\u00f1" );
198 parser.defineEntityReplacementText( "ograve", "\u00f2" );
199 parser.defineEntityReplacementText( "oacute", "\u00f3" );
200 parser.defineEntityReplacementText( "ocirc", "\u00f4" );
201 parser.defineEntityReplacementText( "otilde", "\u00f5" );
202 parser.defineEntityReplacementText( "ouml", "\u00f6" );
203 parser.defineEntityReplacementText( "divide", "\u00f7" );
204 parser.defineEntityReplacementText( "oslash", "\u00f8" );
205 parser.defineEntityReplacementText( "ugrave", "\u00f9" );
206 parser.defineEntityReplacementText( "uacute", "\u00fa" );
207 parser.defineEntityReplacementText( "ucirc", "\u00fb" );
208 parser.defineEntityReplacementText( "uuml", "\u00fc" );
209 parser.defineEntityReplacementText( "yacute", "\u00fd" );
210 parser.defineEntityReplacementText( "thorn", "\u00fe" );
211 parser.defineEntityReplacementText( "yuml", "\u00ff" );
212
213
214
215
216
217 parser.defineEntityReplacementText( "OElig", "\u0152" );
218 parser.defineEntityReplacementText( "oelig", "\u0153" );
219 parser.defineEntityReplacementText( "Scaron", "\u0160" );
220 parser.defineEntityReplacementText( "scaron", "\u0161" );
221 parser.defineEntityReplacementText( "Yuml", "\u0178" );
222 parser.defineEntityReplacementText( "circ", "\u02c6" );
223 parser.defineEntityReplacementText( "tilde", "\u02dc" );
224 parser.defineEntityReplacementText( "ensp", "\u2002" );
225 parser.defineEntityReplacementText( "emsp", "\u2003" );
226 parser.defineEntityReplacementText( "thinsp", "\u2009" );
227 parser.defineEntityReplacementText( "zwnj", "\u200c" );
228 parser.defineEntityReplacementText( "zwj", "\u200d" );
229 parser.defineEntityReplacementText( "lrm", "\u200e" );
230 parser.defineEntityReplacementText( "rlm", "\u200f" );
231 parser.defineEntityReplacementText( "ndash", "\u2013" );
232 parser.defineEntityReplacementText( "mdash", "\u2014" );
233 parser.defineEntityReplacementText( "lsquo", "\u2018" );
234 parser.defineEntityReplacementText( "rsquo", "\u2019" );
235 parser.defineEntityReplacementText( "sbquo", "\u201a" );
236 parser.defineEntityReplacementText( "ldquo", "\u201c" );
237 parser.defineEntityReplacementText( "rdquo", "\u201d" );
238 parser.defineEntityReplacementText( "bdquo", "\u201e" );
239 parser.defineEntityReplacementText( "dagger", "\u2020" );
240 parser.defineEntityReplacementText( "Dagger", "\u2021" );
241 parser.defineEntityReplacementText( "permil", "\u2030" );
242 parser.defineEntityReplacementText( "lsaquo", "\u2039" );
243 parser.defineEntityReplacementText( "rsaquo", "\u203a" );
244 parser.defineEntityReplacementText( "euro", "\u20ac" );
245
246
247
248
249
250 parser.defineEntityReplacementText( "fnof", "\u0192" );
251 parser.defineEntityReplacementText( "Alpha", "\u0391" );
252 parser.defineEntityReplacementText( "Beta", "\u0392" );
253 parser.defineEntityReplacementText( "Gamma", "\u0393" );
254 parser.defineEntityReplacementText( "Delta", "\u0394" );
255 parser.defineEntityReplacementText( "Epsilon", "\u0395" );
256 parser.defineEntityReplacementText( "Zeta", "\u0396" );
257 parser.defineEntityReplacementText( "Eta", "\u0397" );
258 parser.defineEntityReplacementText( "Theta", "\u0398" );
259 parser.defineEntityReplacementText( "Iota", "\u0399" );
260 parser.defineEntityReplacementText( "Kappa", "\u039a" );
261 parser.defineEntityReplacementText( "Lambda", "\u039b" );
262 parser.defineEntityReplacementText( "Mu", "\u039c" );
263 parser.defineEntityReplacementText( "Nu", "\u039d" );
264 parser.defineEntityReplacementText( "Xi", "\u039e" );
265 parser.defineEntityReplacementText( "Omicron", "\u039f" );
266 parser.defineEntityReplacementText( "Pi", "\u03a0" );
267 parser.defineEntityReplacementText( "Rho", "\u03a1" );
268 parser.defineEntityReplacementText( "Sigma", "\u03a3" );
269 parser.defineEntityReplacementText( "Tau", "\u03a4" );
270 parser.defineEntityReplacementText( "Upsilon", "\u03a5" );
271 parser.defineEntityReplacementText( "Phi", "\u03a6" );
272 parser.defineEntityReplacementText( "Chi", "\u03a7" );
273 parser.defineEntityReplacementText( "Psi", "\u03a8" );
274 parser.defineEntityReplacementText( "Omega", "\u03a9" );
275 parser.defineEntityReplacementText( "alpha", "\u03b1" );
276 parser.defineEntityReplacementText( "beta", "\u03b2" );
277 parser.defineEntityReplacementText( "gamma", "\u03b3" );
278 parser.defineEntityReplacementText( "delta", "\u03b4" );
279 parser.defineEntityReplacementText( "epsilon", "\u03b5" );
280 parser.defineEntityReplacementText( "zeta", "\u03b6" );
281 parser.defineEntityReplacementText( "eta", "\u03b7" );
282 parser.defineEntityReplacementText( "theta", "\u03b8" );
283 parser.defineEntityReplacementText( "iota", "\u03b9" );
284 parser.defineEntityReplacementText( "kappa", "\u03ba" );
285 parser.defineEntityReplacementText( "lambda", "\u03bb" );
286 parser.defineEntityReplacementText( "mu", "\u03bc" );
287 parser.defineEntityReplacementText( "nu", "\u03bd" );
288 parser.defineEntityReplacementText( "xi", "\u03be" );
289 parser.defineEntityReplacementText( "omicron", "\u03bf" );
290 parser.defineEntityReplacementText( "pi", "\u03c0" );
291 parser.defineEntityReplacementText( "rho", "\u03c1" );
292 parser.defineEntityReplacementText( "sigmaf", "\u03c2" );
293 parser.defineEntityReplacementText( "sigma", "\u03c3" );
294 parser.defineEntityReplacementText( "tau", "\u03c4" );
295 parser.defineEntityReplacementText( "upsilon", "\u03c5" );
296 parser.defineEntityReplacementText( "phi", "\u03c6" );
297 parser.defineEntityReplacementText( "chi", "\u03c7" );
298 parser.defineEntityReplacementText( "psi", "\u03c8" );
299 parser.defineEntityReplacementText( "omega", "\u03c9" );
300 parser.defineEntityReplacementText( "thetasym", "\u03d1" );
301 parser.defineEntityReplacementText( "upsih", "\u03d2" );
302 parser.defineEntityReplacementText( "piv", "\u03d6" );
303 parser.defineEntityReplacementText( "bull", "\u2022" );
304 parser.defineEntityReplacementText( "hellip", "\u2026" );
305 parser.defineEntityReplacementText( "prime", "\u2032" );
306 parser.defineEntityReplacementText( "Prime", "\u2033" );
307 parser.defineEntityReplacementText( "oline", "\u203e" );
308 parser.defineEntityReplacementText( "frasl", "\u2044" );
309 parser.defineEntityReplacementText( "weierp", "\u2118" );
310 parser.defineEntityReplacementText( "image", "\u2111" );
311 parser.defineEntityReplacementText( "real", "\u211c" );
312 parser.defineEntityReplacementText( "trade", "\u2122" );
313 parser.defineEntityReplacementText( "alefsym", "\u2135" );
314 parser.defineEntityReplacementText( "larr", "\u2190" );
315 parser.defineEntityReplacementText( "uarr", "\u2191" );
316 parser.defineEntityReplacementText( "rarr", "\u2192" );
317 parser.defineEntityReplacementText( "darr", "\u2193" );
318 parser.defineEntityReplacementText( "harr", "\u2194" );
319 parser.defineEntityReplacementText( "crarr", "\u21b5" );
320 parser.defineEntityReplacementText( "lArr", "\u21d0" );
321 parser.defineEntityReplacementText( "uArr", "\u21d1" );
322 parser.defineEntityReplacementText( "rArr", "\u21d2" );
323 parser.defineEntityReplacementText( "dArr", "\u21d3" );
324 parser.defineEntityReplacementText( "hArr", "\u21d4" );
325 parser.defineEntityReplacementText( "forall", "\u2200" );
326 parser.defineEntityReplacementText( "part", "\u2202" );
327 parser.defineEntityReplacementText( "exist", "\u2203" );
328 parser.defineEntityReplacementText( "empty", "\u2205" );
329 parser.defineEntityReplacementText( "nabla", "\u2207" );
330 parser.defineEntityReplacementText( "isin", "\u2208" );
331 parser.defineEntityReplacementText( "notin", "\u2209" );
332 parser.defineEntityReplacementText( "ni", "\u220b" );
333 parser.defineEntityReplacementText( "prod", "\u220f" );
334 parser.defineEntityReplacementText( "sum", "\u2211" );
335 parser.defineEntityReplacementText( "minus", "\u2212" );
336 parser.defineEntityReplacementText( "lowast", "\u2217" );
337 parser.defineEntityReplacementText( "radic", "\u221a" );
338 parser.defineEntityReplacementText( "prop", "\u221d" );
339 parser.defineEntityReplacementText( "infin", "\u221e" );
340 parser.defineEntityReplacementText( "ang", "\u2220" );
341 parser.defineEntityReplacementText( "and", "\u2227" );
342 parser.defineEntityReplacementText( "or", "\u2228" );
343 parser.defineEntityReplacementText( "cap", "\u2229" );
344 parser.defineEntityReplacementText( "cup", "\u222a" );
345 parser.defineEntityReplacementText( "int", "\u222b" );
346 parser.defineEntityReplacementText( "there4", "\u2234" );
347 parser.defineEntityReplacementText( "sim", "\u223c" );
348 parser.defineEntityReplacementText( "cong", "\u2245" );
349 parser.defineEntityReplacementText( "asymp", "\u2248" );
350 parser.defineEntityReplacementText( "ne", "\u2260" );
351 parser.defineEntityReplacementText( "equiv", "\u2261" );
352 parser.defineEntityReplacementText( "le", "\u2264" );
353 parser.defineEntityReplacementText( "ge", "\u2265" );
354 parser.defineEntityReplacementText( "sub", "\u2282" );
355 parser.defineEntityReplacementText( "sup", "\u2283" );
356 parser.defineEntityReplacementText( "nsub", "\u2284" );
357 parser.defineEntityReplacementText( "sube", "\u2286" );
358 parser.defineEntityReplacementText( "supe", "\u2287" );
359 parser.defineEntityReplacementText( "oplus", "\u2295" );
360 parser.defineEntityReplacementText( "otimes", "\u2297" );
361 parser.defineEntityReplacementText( "perp", "\u22a5" );
362 parser.defineEntityReplacementText( "sdot", "\u22c5" );
363 parser.defineEntityReplacementText( "lceil", "\u2308" );
364 parser.defineEntityReplacementText( "rceil", "\u2309" );
365 parser.defineEntityReplacementText( "lfloor", "\u230a" );
366 parser.defineEntityReplacementText( "rfloor", "\u230b" );
367 parser.defineEntityReplacementText( "lang", "\u2329" );
368 parser.defineEntityReplacementText( "rang", "\u232a" );
369 parser.defineEntityReplacementText( "loz", "\u25ca" );
370 parser.defineEntityReplacementText( "spades", "\u2660" );
371 parser.defineEntityReplacementText( "clubs", "\u2663" );
372 parser.defineEntityReplacementText( "hearts", "\u2665" );
373 parser.defineEntityReplacementText( "diams", "\u2666" );
374 }
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397 protected boolean baseStartTag( XmlPullParser parser, Sink sink )
398 {
399 boolean visited = true;
400
401 SinkEventAttributeSet attribs = getAttributesFromParser( parser );
402
403 if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
404 {
405 handleSectionStart( sink, Sink.SECTION_LEVEL_1, attribs );
406 }
407 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
408 {
409 handleSectionStart( sink, Sink.SECTION_LEVEL_2, attribs );
410 }
411 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
412 {
413 handleSectionStart( sink, Sink.SECTION_LEVEL_3, attribs );
414 }
415 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
416 {
417 handleSectionStart( sink, Sink.SECTION_LEVEL_4, attribs );
418 }
419 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
420 {
421 handleSectionStart( sink, Sink.SECTION_LEVEL_5, attribs );
422 }
423 else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
424 {
425 attribs.addAttributes( SinkEventAttributeSet.Semantics.ANNOTATION );
426 sink.inline( attribs );
427 }
428 else if ( parser.getName().equals( HtmlMarkup.S.toString() )
429 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
430 || parser.getName().equals( "del" ) )
431 {
432 attribs.addAttributes( SinkEventAttributeSet.Semantics.LINE_THROUGH );
433 sink.inline( attribs );
434 }
435 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
436 {
437 attribs.addAttributes( SinkEventAttributeSet.Semantics.SUBSCRIPT );
438 sink.inline( attribs );
439 }
440 else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
441 {
442 attribs.addAttributes( SinkEventAttributeSet.Semantics.SUPERSCRIPT );
443 sink.inline( attribs );
444 }
445 else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
446 {
447 handlePStart( sink, attribs );
448 }
449 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
450 {
451 visited = handleDivStart( parser, attribs, sink );
452 }
453 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
454 {
455 handlePreStart( attribs, sink );
456 }
457 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
458 {
459 sink.list( attribs );
460 }
461 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
462 {
463 handleOLStart( parser, sink, attribs );
464 }
465 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
466 {
467 handleLIStart( sink, attribs );
468 }
469 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
470 {
471 sink.definitionList( attribs );
472 }
473 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
474 {
475 if ( hasDefinitionListItem )
476 {
477
478 sink.definitionListItem_();
479 }
480 sink.definitionListItem( attribs );
481 hasDefinitionListItem = true;
482 sink.definedTerm( attribs );
483 }
484 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
485 {
486 if ( !hasDefinitionListItem )
487 {
488 sink.definitionListItem( attribs );
489 }
490 sink.definition( attribs );
491 }
492 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
493 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
494 {
495 sink.inline( SinkEventAttributeSet.Semantics.BOLD );
496 }
497 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
498 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
499 {
500 handleFigureCaptionStart( sink, attribs );
501 }
502 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
503 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
504 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
505 {
506 attribs.addAttributes( SinkEventAttributeSet.Semantics.CODE );
507 sink.inline( attribs );
508 }
509 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
510 {
511 handleAStart( parser, sink, attribs );
512 }
513 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
514 {
515 handleTableStart( sink, attribs, parser );
516 }
517 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
518 {
519 sink.tableRow( attribs );
520 }
521 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
522 {
523 sink.tableHeaderCell( attribs );
524 }
525 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
526 {
527 sink.tableCell( attribs );
528 }
529 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
530 {
531 sink.tableCaption( attribs );
532 }
533 else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
534 {
535 sink.lineBreak( attribs );
536 }
537 else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
538 {
539 sink.horizontalRule( attribs );
540 }
541 else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
542 {
543 handleImgStart( parser, sink, attribs );
544 }
545 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
546 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
547 {
548 handleUnknown( parser, sink, TAG_TYPE_START );
549 scriptBlock = true;
550 }
551 else
552 {
553 visited = false;
554 }
555
556 return visited;
557 }
558
559
560
561
562
563
564
565
566
567
568
569
570
571 protected boolean baseEndTag( XmlPullParser parser, Sink sink )
572 {
573 boolean visited = true;
574
575 if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
576 {
577 if ( !inFigure )
578 {
579 sink.paragraph_();
580 }
581 }
582 else if ( parser.getName().equals( HtmlMarkup.U.toString() )
583 || parser.getName().equals( HtmlMarkup.S.toString() )
584 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
585 || parser.getName().equals( "del" ) )
586 {
587 sink.inline_();
588 }
589 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() )
590 || parser.getName().equals( HtmlMarkup.SUP.toString() ) )
591 {
592 sink.inline_();
593 }
594 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
595 {
596 if ( inFigure )
597 {
598 sink.figure_();
599 this.inFigure = false;
600 }
601 else
602 {
603 visited = false;
604 }
605 }
606 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
607 {
608 verbatim_();
609
610 sink.verbatim_();
611 }
612 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
613 {
614 sink.list_();
615 }
616 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
617 {
618 sink.numberedList_();
619 orderedListDepth--;
620 }
621 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
622 {
623 handleListItemEnd( sink );
624 }
625 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
626 {
627 if ( hasDefinitionListItem )
628 {
629 sink.definitionListItem_();
630 hasDefinitionListItem = false;
631 }
632 sink.definitionList_();
633 }
634 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
635 {
636 sink.definedTerm_();
637 }
638 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
639 {
640 sink.definition_();
641 sink.definitionListItem_();
642 hasDefinitionListItem = false;
643 }
644 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
645 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
646 {
647 sink.inline_();
648 }
649 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
650 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
651 {
652 handleFigureCaptionEnd( sink );
653 }
654 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
655 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
656 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
657 {
658 sink.inline_();
659 }
660 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
661 {
662 handleAEnd( sink );
663 }
664
665
666
667
668
669 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
670 {
671 sink.tableRows_();
672
673 sink.table_();
674 }
675 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
676 {
677 sink.tableRow_();
678 }
679 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
680 {
681 sink.tableHeaderCell_();
682 }
683 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
684 {
685 sink.tableCell_();
686 }
687 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
688 {
689 sink.tableCaption_();
690 }
691 else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
692 {
693 sink.sectionTitle1_();
694 }
695 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
696 {
697 sink.sectionTitle2_();
698 }
699 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
700 {
701 sink.sectionTitle3_();
702 }
703 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
704 {
705 sink.sectionTitle4_();
706 }
707 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
708 {
709 sink.sectionTitle5_();
710 }
711 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
712 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
713 {
714 handleUnknown( parser, sink, TAG_TYPE_END );
715
716 scriptBlock = false;
717 }
718 else
719 {
720 visited = false;
721 }
722
723 return visited;
724 }
725
726
727
728
729
730
731
732 protected void handleStartTag( XmlPullParser parser, Sink sink )
733 throws XmlPullParserException, MacroExecutionException
734 {
735 if ( !baseStartTag( parser, sink ) )
736 {
737 LOGGER.warn( "Unrecognized xml tag <{}> at [{}:{}]", parser.getName(),
738 parser.getLineNumber(), parser.getColumnNumber() );
739 }
740 }
741
742
743
744
745
746
747
748 protected void handleEndTag( XmlPullParser parser, Sink sink )
749 throws XmlPullParserException, MacroExecutionException
750 {
751 if ( !baseEndTag( parser, sink ) )
752 {
753
754 }
755 }
756
757
758 @Override
759 protected void handleText( XmlPullParser parser, Sink sink )
760 throws XmlPullParserException
761 {
762 String text = getText( parser );
763
764
765
766
767
768
769
770 if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
771 {
772 sink.text( text );
773 }
774 }
775
776
777 @Override
778 protected void handleComment( XmlPullParser parser, Sink sink )
779 throws XmlPullParserException
780 {
781 String text = getText( parser );
782
783 if ( "PB".equals( text.trim() ) )
784 {
785 sink.pageBreak();
786 }
787 else
788 {
789 if ( isEmitComments() )
790 {
791 sink.comment( text );
792 }
793 }
794 }
795
796
797 @Override
798 protected void handleCdsect( XmlPullParser parser, Sink sink )
799 throws XmlPullParserException
800 {
801 String text = getText( parser );
802
803 if ( isScriptBlock() )
804 {
805 sink.unknown( CDATA, new Object[] { CDATA_TYPE, text }, null );
806 }
807 else
808 {
809 sink.text( text );
810 }
811 }
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845 protected void consecutiveSections( int newLevel, Sink sink )
846 {
847 closeOpenSections( newLevel, sink );
848 openMissingSections( newLevel, sink );
849
850 this.sectionLevel = newLevel;
851 }
852
853
854
855
856
857
858
859 private void closeOpenSections( int newLevel, Sink sink )
860 {
861 while ( this.sectionLevel >= newLevel )
862 {
863 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
864 {
865 sink.section5_();
866 }
867 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
868 {
869 sink.section4_();
870 }
871 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
872 {
873 sink.section3_();
874 }
875 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
876 {
877 sink.section2_();
878 }
879 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
880 {
881 sink.section1_();
882 }
883
884 this.sectionLevel--;
885 }
886 }
887
888
889
890
891
892
893
894 private void openMissingSections( int newLevel, Sink sink )
895 {
896 while ( this.sectionLevel < newLevel - 1 )
897 {
898 this.sectionLevel++;
899
900 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
901 {
902 sink.section5();
903 }
904 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
905 {
906 sink.section4();
907 }
908 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
909 {
910 sink.section3();
911 }
912 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
913 {
914 sink.section2();
915 }
916 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
917 {
918 sink.section1();
919 }
920 }
921 }
922
923
924
925
926
927
928 protected int getSectionLevel()
929 {
930 return this.sectionLevel;
931 }
932
933
934
935
936
937
938 protected void setSectionLevel( int newLevel )
939 {
940 this.sectionLevel = newLevel;
941 }
942
943
944
945
946 protected void verbatim_()
947 {
948 this.inVerbatim = false;
949 }
950
951
952
953
954 protected void verbatim()
955 {
956 this.inVerbatim = true;
957 }
958
959
960
961
962
963
964 protected boolean isVerbatim()
965 {
966 return this.inVerbatim;
967 }
968
969
970
971
972
973
974
975 protected boolean isScriptBlock()
976 {
977 return this.scriptBlock;
978 }
979
980
981
982
983
984
985
986
987 protected String validAnchor( String id )
988 {
989 if ( !DoxiaUtils.isValidId( id ) )
990 {
991 String linkAnchor = DoxiaUtils.encodeId( id, true );
992
993 LOGGER.debug( "Modified invalid link '{}' to '{}'", id, linkAnchor );
994
995 return linkAnchor;
996 }
997
998 return id;
999 }
1000
1001
1002 @Override
1003 protected void init()
1004 {
1005 super.init();
1006
1007 this.scriptBlock = false;
1008 this.isLink = false;
1009 this.isAnchor = false;
1010 this.orderedListDepth = 0;
1011 this.sectionLevel = 0;
1012 this.inVerbatim = false;
1013 this.inFigure = false;
1014 }
1015
1016 private void handleAEnd( Sink sink )
1017 {
1018 if ( isLink )
1019 {
1020 sink.link_();
1021 isLink = false;
1022 }
1023 else if ( isAnchor )
1024 {
1025 sink.anchor_();
1026 isAnchor = false;
1027 }
1028 }
1029
1030 private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1031 {
1032 String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
1033
1034 if ( href != null )
1035 {
1036 int hashIndex = href.indexOf( '#' );
1037 if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
1038 {
1039 String hash = href.substring( hashIndex + 1 );
1040
1041 if ( !DoxiaUtils.isValidId( hash ) )
1042 {
1043 href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
1044
1045 LOGGER.debug( "Modified invalid link '{}' to '{}'", hash, href );
1046 }
1047 }
1048 sink.link( href, attribs );
1049 isLink = true;
1050 }
1051 else
1052 {
1053 String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
1054
1055 if ( name != null )
1056 {
1057 sink.anchor( validAnchor( name ), attribs );
1058 isAnchor = true;
1059 }
1060 else
1061 {
1062 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
1063 if ( id != null )
1064 {
1065 sink.anchor( validAnchor( id ), attribs );
1066 isAnchor = true;
1067 }
1068 }
1069 }
1070 }
1071
1072 private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
1073 {
1074 boolean visited = true;
1075
1076 String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
1077
1078 if ( "figure".equals( divclass ) )
1079 {
1080 this.inFigure = true;
1081 SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
1082 atts.removeAttribute( SinkEventAttributes.CLASS );
1083 sink.figure( atts );
1084 }
1085 else
1086 {
1087 visited = false;
1088 }
1089
1090 return visited;
1091 }
1092
1093 private void handleFigureCaptionEnd( Sink sink )
1094 {
1095 if ( inFigure )
1096 {
1097 sink.figureCaption_();
1098 }
1099 else
1100 {
1101 sink.inline_();
1102 }
1103 }
1104
1105 private void handleFigureCaptionStart( Sink sink, SinkEventAttributeSet attribs )
1106 {
1107 if ( inFigure )
1108 {
1109 sink.figureCaption( attribs );
1110 }
1111 else
1112 {
1113 sink.inline( SinkEventAttributeSet.Semantics.ITALIC );
1114 }
1115 }
1116
1117 private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1118 {
1119 String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
1120
1121 if ( src != null )
1122 {
1123 sink.figureGraphics( src, attribs );
1124 }
1125 }
1126
1127 private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
1128 {
1129 if ( orderedListDepth == 0 )
1130 {
1131 sink.listItem( attribs );
1132 }
1133 else
1134 {
1135 sink.numberedListItem( attribs );
1136 }
1137 }
1138
1139 private void handleListItemEnd( Sink sink )
1140 {
1141 if ( orderedListDepth == 0 )
1142 {
1143 sink.listItem_();
1144 }
1145 else
1146 {
1147 sink.numberedListItem_();
1148 }
1149 }
1150
1151 private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1152 {
1153 int numbering = Sink.NUMBERING_DECIMAL;
1154
1155 String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
1156
1157 if ( style != null )
1158 {
1159 switch ( style )
1160 {
1161 case "list-style-type: upper-alpha":
1162 numbering = Sink.NUMBERING_UPPER_ALPHA;
1163 break;
1164 case "list-style-type: lower-alpha":
1165 numbering = Sink.NUMBERING_LOWER_ALPHA;
1166 break;
1167 case "list-style-type: upper-roman":
1168 numbering = Sink.NUMBERING_UPPER_ROMAN;
1169 break;
1170 case "list-style-type: lower-roman":
1171 numbering = Sink.NUMBERING_LOWER_ROMAN;
1172 break;
1173 case "list-style-type: decimal":
1174 numbering = Sink.NUMBERING_DECIMAL;
1175 break;
1176 default:
1177
1178 }
1179 }
1180
1181 sink.numberedList( numbering, attribs );
1182 orderedListDepth++;
1183 }
1184
1185 private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
1186 {
1187 if ( !inFigure )
1188 {
1189 sink.paragraph( attribs );
1190 }
1191 }
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203 private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
1204 {
1205 verbatim();
1206 sink.verbatim( attribs );
1207 }
1208
1209 private void handleSectionStart( Sink sink, int level, SinkEventAttributeSet attribs )
1210 {
1211 consecutiveSections( level, sink );
1212 sink.section( level, attribs );
1213 sink.sectionTitle( level, attribs );
1214 }
1215
1216 private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
1217 {
1218 sink.table( attribs );
1219 String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
1220 boolean grid = true;
1221
1222 if ( border == null || "0".equals( border ) )
1223 {
1224 grid = false;
1225 }
1226
1227 String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
1228 int[] justif = {Sink.JUSTIFY_LEFT};
1229
1230 if ( "center".equals( align ) )
1231 {
1232 justif[0] = Sink.JUSTIFY_CENTER;
1233 }
1234 else if ( "right".equals( align ) )
1235 {
1236 justif[0] = Sink.JUSTIFY_RIGHT;
1237 }
1238
1239 sink.tableRows( justif, grid );
1240 }
1241 }