View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.doxia.parser;
20  
21  import java.util.Iterator;
22  
23  import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
24  import org.apache.maven.doxia.sink.impl.SinkEventElement;
25  import org.apache.maven.doxia.sink.impl.SinkEventTestingSink;
26  import org.junit.jupiter.api.BeforeEach;
27  import org.junit.jupiter.api.Test;
28  
29  import static org.junit.jupiter.api.Assertions.*;
30  
31  /**
32   * Test for Xhtml5BaseParser.
33   */
34  public class Xhtml5BaseParserTest extends AbstractParserTest {
35      private Xhtml5BaseParser parser;
36      private final SinkEventTestingSink sink = new SinkEventTestingSink();
37  
38      @Override
39      protected AbstractParser createParser() {
40          parser = new Xhtml5BaseParser();
41          return parser;
42      }
43  
44      @Override
45      protected String outputExtension() {
46          return "xhtml";
47      }
48  
49      @BeforeEach
50      protected void setUp() throws Exception {
51          parser = new Xhtml5BaseParser();
52          sink.reset();
53      }
54  
55      @Test
56      public void testHeadingEventsList() throws Exception {
57          String text = "<p><h1></h1><h2></h2><h3></h3><h4></h4><h5></h5><h1></h1></p>";
58  
59          parser.parse(text, sink);
60  
61          Iterator<SinkEventElement> it = sink.getEventList().iterator();
62  
63          assertEquals("paragraph", it.next().getName());
64          assertEquals("section1", it.next().getName());
65          assertEquals("sectionTitle1", it.next().getName());
66          assertEquals("sectionTitle1_", it.next().getName());
67          assertEquals("section2", it.next().getName());
68          assertEquals("sectionTitle2", it.next().getName());
69          assertEquals("sectionTitle2_", it.next().getName());
70          assertEquals("section3", it.next().getName());
71          assertEquals("sectionTitle3", it.next().getName());
72          assertEquals("sectionTitle3_", it.next().getName());
73          assertEquals("section4", it.next().getName());
74          assertEquals("sectionTitle4", it.next().getName());
75          assertEquals("sectionTitle4_", it.next().getName());
76          assertEquals("section5", it.next().getName());
77          assertEquals("sectionTitle5", it.next().getName());
78          assertEquals("sectionTitle5_", it.next().getName());
79          assertEquals("section5_", it.next().getName());
80          assertEquals("section4_", it.next().getName());
81          assertEquals("section3_", it.next().getName());
82          assertEquals("section2_", it.next().getName());
83          assertEquals("section1_", it.next().getName());
84          assertEquals("section1", it.next().getName());
85          assertEquals("sectionTitle1", it.next().getName());
86          assertEquals("sectionTitle1_", it.next().getName());
87          // this one is missing because we enclose everything in <p> which is not valid xhtml,
88          // needs to be tested in overriding parser, eg XhtmlParser, XdocParser.
89          // assertEquals("section1_", it.next().getName());
90          assertEquals("paragraph_", it.next().getName());
91          assertFalse(it.hasNext());
92      }
93  
94      @Test
95      public void testNestedHeadingEventsList() throws Exception {
96          // DOXIA-241
97          String text = "<p><h1></h1><h5></h5><h2></h2></p>";
98  
99          parser.parse(text, sink);
100 
101         Iterator<SinkEventElement> it = sink.getEventList().iterator();
102 
103         assertEquals("paragraph", it.next().getName());
104         assertEquals("section1", it.next().getName());
105         assertEquals("sectionTitle1", it.next().getName());
106         assertEquals("sectionTitle1_", it.next().getName());
107 
108         assertEquals("section2", it.next().getName());
109         assertEquals("section3", it.next().getName());
110         assertEquals("section4", it.next().getName());
111 
112         assertEquals("section5", it.next().getName());
113         assertEquals("sectionTitle5", it.next().getName());
114         assertEquals("sectionTitle5_", it.next().getName());
115         assertEquals("section5_", it.next().getName());
116 
117         assertEquals("section4_", it.next().getName());
118         assertEquals("section3_", it.next().getName());
119         assertEquals("section2_", it.next().getName());
120 
121         assertEquals("section2", it.next().getName());
122         assertEquals("sectionTitle2", it.next().getName());
123         assertEquals("sectionTitle2_", it.next().getName());
124         // these two are missing because we enclose everything in <p> which is not valid xhtml,
125         // needs to be tested in overriding parser, eg XhtmlParser, XdocParser.
126         // assertEquals("section2_", it.next().getName());
127         // assertEquals("section1_", it.next().getName());
128         assertEquals("paragraph_", it.next().getName());
129         assertFalse(it.hasNext());
130     }
131 
132     @Test
133     public void testSectionsAndHeadingsOnDifferentLevels() throws ParseException {
134         // section on higher level than heading
135         String text = "<body><section><section><h1>Headline1</h1></section></section></body>";
136         parser.parse(text, sink);
137 
138         Iterator<SinkEventElement> it = sink.getEventList().iterator();
139         assertSinkEquals(
140                 it,
141                 "section1",
142                 "section2",
143                 "section2_",
144                 "section1_",
145                 "section1",
146                 "sectionTitle1",
147                 "text",
148                 "sectionTitle1_",
149                 "section2",
150                 "section2_",
151                 "section1_");
152     }
153 
154     @Test
155     public void testSectionsAndHeadingsOnDifferentLevels2() throws ParseException {
156         // section on lower level than heading
157         String text = "<body><section><h3>Headline1</h3></section></body>";
158         parser.parse(text, sink);
159 
160         Iterator<SinkEventElement> it = sink.getEventList().iterator();
161         assertSinkEquals(
162                 it,
163                 "section1",
164                 "section2",
165                 "section3",
166                 "sectionTitle3",
167                 "text",
168                 "sectionTitle3_",
169                 "section3_",
170                 "section2_",
171                 "section1_");
172     }
173 
174     @Test
175     public void testSectionsAndHeadingsOnSameLevel() throws ParseException {
176         // heading directly following same level section doesn't need additional sections, while headings following some
177         // other element (still same level)
178         // needs an explicit new (same level) section
179         String text =
180                 "<body><section><h1>Headline1</h1><section><h2>Headline2</h2></section><h1>Headline3</h1></section></body>";
181         parser.parse(text, sink);
182 
183         Iterator<SinkEventElement> it = sink.getEventList().iterator();
184         assertSinkEquals(
185                 it,
186                 "section1",
187                 "sectionTitle1",
188                 "text",
189                 "sectionTitle1_",
190                 "section2",
191                 "sectionTitle2",
192                 "text",
193                 "sectionTitle2_",
194                 "section2_",
195                 "section1_",
196                 "section1",
197                 "sectionTitle1",
198                 "text",
199                 "sectionTitle1_",
200                 "section1_");
201     }
202 
203     @Test
204     public void testFigureEventsList() throws Exception {
205         String text = "<img src=\"source\" title=\"caption\" />";
206 
207         parser.parse(text, sink);
208 
209         Iterator<SinkEventElement> it = sink.getEventList().iterator();
210 
211         assertEquals("figureGraphics", it.next().getName());
212         assertFalse(it.hasNext());
213     }
214 
215     @Test
216     public void testTableEventsList() throws Exception {
217         // TODO: table caption, see DOXIA-177
218 
219         String text = "<table><tr><th>Header</th></tr><tr><td>cell</td></tr></table>";
220 
221         parser.parse(text, sink);
222 
223         Iterator<SinkEventElement> it = sink.getEventList().iterator();
224 
225         assertEquals("table", it.next().getName());
226         assertEquals("tableRows", it.next().getName());
227         assertEquals("tableRow", it.next().getName());
228         assertEquals("tableHeaderCell", it.next().getName());
229         assertEquals("text", it.next().getName());
230         assertEquals("tableHeaderCell_", it.next().getName());
231         assertEquals("tableRow_", it.next().getName());
232         assertEquals("tableRow", it.next().getName());
233         assertEquals("tableCell", it.next().getName());
234         assertEquals("text", it.next().getName());
235         assertEquals("tableCell_", it.next().getName());
236         assertEquals("tableRow_", it.next().getName());
237         assertEquals("tableRows_", it.next().getName());
238         assertEquals("table_", it.next().getName());
239 
240         assertFalse(it.hasNext());
241     }
242 
243     @Test
244     public void testSignificantWhiteSpace() throws Exception {
245         // NOTE significant white space
246         String text = "<p><b>word</b> <i>word</i></p>";
247 
248         parser.parse(text, sink);
249 
250         Iterator<SinkEventElement> it = sink.getEventList().iterator();
251 
252         assertEquals("paragraph", it.next().getName());
253         assertEquals("inline", it.next().getName());
254         assertEquals("text", it.next().getName());
255         assertEquals("inline_", it.next().getName());
256 
257         SinkEventElement el = it.next();
258         assertEquals("text", el.getName());
259         assertEquals(" ", (String) el.getArgs()[0]);
260 
261         assertEquals("inline", it.next().getName());
262         assertEquals("text", it.next().getName());
263         assertEquals("inline_", it.next().getName());
264         assertEquals("paragraph_", it.next().getName());
265         assertFalse(it.hasNext());
266 
267         // same test with EOL
268         String eol = System.getProperty("line.separator");
269         text = "<p><b>word</b>" + eol + "<i>word</i></p>";
270 
271         sink.reset();
272         parser.parse(text, sink);
273         it = sink.getEventList().iterator();
274 
275         assertEquals("paragraph", it.next().getName());
276         assertEquals("inline", it.next().getName());
277         assertEquals("text", it.next().getName());
278         assertEquals("inline_", it.next().getName());
279 
280         el = it.next();
281         assertEquals("text", el.getName());
282         // according to section 2.11 of the XML spec, parsers must normalize line breaks to "\n"
283         assertEquals("\n", (String) el.getArgs()[0]);
284 
285         assertEquals("inline", it.next().getName());
286         assertEquals("text", it.next().getName());
287         assertEquals("inline_", it.next().getName());
288         assertEquals("paragraph_", it.next().getName());
289         assertFalse(it.hasNext());
290 
291         // DOXIA-189: there should be no EOL after closing tag
292         text = "<p>There should be no space after the last <i>word</i>.</p>";
293 
294         sink.reset();
295         parser.parse(text, sink);
296         it = sink.getEventList().iterator();
297 
298         assertEquals("paragraph", it.next().getName());
299         assertEquals("text", it.next().getName());
300         assertEquals("inline", it.next().getName());
301         assertEquals("text", it.next().getName());
302         assertEquals("inline_", it.next().getName());
303 
304         el = it.next();
305         assertEquals("text", el.getName());
306         assertEquals(".", (String) el.getArgs()[0]);
307 
308         assertEquals("paragraph_", it.next().getName());
309         assertFalse(it.hasNext());
310     }
311 
312     @Test
313     public void testPreFormattedText() throws Exception {
314         String text = "<pre><a href=\"what.html\">what</a></pre>";
315 
316         parser.parse(text, sink);
317 
318         Iterator<SinkEventElement> it = sink.getEventList().iterator();
319         assertEquals("verbatim", it.next().getName());
320         assertEquals("link", it.next().getName());
321         assertEquals("text", it.next().getName());
322         assertEquals("link_", it.next().getName());
323         assertEquals("verbatim_", it.next().getName());
324         assertFalse(it.hasNext());
325 
326         text = "<pre><![CDATA[<a href=\"what.html\">what</a>]]></pre>";
327         sink.reset();
328         parser.parse(text, sink);
329 
330         it = sink.getEventList().iterator();
331         assertEquals("verbatim", it.next().getName());
332         assertEquals("text", it.next().getName());
333         assertEquals("verbatim_", it.next().getName());
334         assertFalse(it.hasNext());
335 
336         text = "<pre><![CDATA[<pre>what</pre>]]></pre>";
337         sink.reset();
338         parser.parse(text, sink);
339 
340         it = sink.getEventList().iterator();
341         assertEquals("verbatim", it.next().getName());
342         assertEquals("text", it.next().getName());
343         assertEquals("verbatim_", it.next().getName());
344         assertFalse(it.hasNext());
345     }
346 
347     @Test
348     public void testPreEOL() throws Exception {
349         // test EOLs within <pre>: the sink MUST receive a text event for the EOL
350         String text =
351                 "<pre><a href=\"what.html\">what</a>" + Xhtml5BaseParser.EOL + "<a href=\"what.html\">what</a></pre>";
352 
353         parser.parse(text, sink);
354 
355         Iterator<SinkEventElement> it = sink.getEventList().iterator();
356 
357         assertEquals("verbatim", it.next().getName());
358         assertEquals("link", it.next().getName());
359         assertEquals("text", it.next().getName());
360         assertEquals("link_", it.next().getName());
361         assertEquals("text", it.next().getName());
362         assertEquals("link", it.next().getName());
363         assertEquals("text", it.next().getName());
364         assertEquals("link_", it.next().getName());
365         assertEquals("verbatim_", it.next().getName());
366     }
367 
368     @Test
369     public void testDoxia250() throws Exception {
370         StringBuilder sb = new StringBuilder();
371         sb.append("<!DOCTYPE test [").append(Xhtml5BaseParser.EOL);
372         sb.append("<!ENTITY foo \"&#x159;\">").append(Xhtml5BaseParser.EOL);
373         sb.append("<!ENTITY foo1 \"&nbsp;\">").append(Xhtml5BaseParser.EOL);
374         sb.append("<!ENTITY foo2 \"&#x161;\">").append(Xhtml5BaseParser.EOL);
375         sb.append("<!ENTITY tritPos \"&#x1d7ed;\">").append(Xhtml5BaseParser.EOL);
376         sb.append("]>").append(Xhtml5BaseParser.EOL);
377         sb.append("<p>&foo;&foo1;&foo2;&tritPos;</p>");
378 
379         parser.setValidate(false);
380         parser.parse(sb.toString(), sink);
381 
382         Iterator<SinkEventElement> it = sink.getEventList().iterator();
383 
384         SinkEventElement event = it.next();
385         assertEquals("paragraph", event.getName());
386 
387         event = it.next();
388         assertEquals("text", event.getName());
389         assertEquals("\u0159", (String) event.getArgs()[0]);
390 
391         event = it.next();
392         assertEquals("text", event.getName());
393         assertEquals("\u00A0", (String) event.getArgs()[0]);
394 
395         event = it.next();
396         assertEquals("text", event.getName());
397         assertEquals("\u0161", (String) event.getArgs()[0]);
398 
399         event = it.next();
400         assertEquals("text", event.getName());
401         assertEquals("\uD835\uDFED", (String) event.getArgs()[0]);
402 
403         event = it.next();
404         assertEquals("paragraph_", event.getName());
405     }
406 
407     @Test
408     public void testEntities() throws Exception {
409         final String text =
410                 "<!DOCTYPE test [<!ENTITY flo \"&#x159;\"><!ENTITY tritPos \"&#x1d7ed;\"><!ENTITY fo \"&#65;\"><!ENTITY myCustom \"&fo;\">]>"
411                         + "<body><h1>&amp;&flo;&#x159;&tritPos;&#x1d7ed;</h1><p>&amp;&flo;&#x159;&tritPos;&#x1d7ed;&myCustom;</p></body>";
412 
413         parser.setValidate(false);
414         parser.parse(text, sink);
415 
416         Iterator<SinkEventElement> it = sink.getEventList().iterator();
417 
418         assertEquals("section1", it.next().getName());
419         assertEquals("sectionTitle1", it.next().getName());
420 
421         SinkEventElement textEvt = it.next();
422         assertEquals("text", textEvt.getName());
423         assertEquals("&", textEvt.getArgs()[0]);
424 
425         textEvt = it.next();
426         assertEquals("text", textEvt.getName());
427         assertEquals("\u0159", textEvt.getArgs()[0]);
428 
429         textEvt = it.next();
430         assertEquals("text", textEvt.getName());
431         assertEquals("\u0159", textEvt.getArgs()[0]);
432 
433         textEvt = it.next();
434         assertEquals("text", textEvt.getName());
435         assertEquals("\uD835\uDFED", (String) textEvt.getArgs()[0]);
436 
437         textEvt = it.next();
438         assertEquals("text", textEvt.getName());
439         assertEquals("\uD835\uDFED", textEvt.getArgs()[0]);
440 
441         assertEquals("sectionTitle1_", it.next().getName());
442         assertEquals("paragraph", it.next().getName());
443 
444         textEvt = it.next();
445         assertEquals("text", textEvt.getName());
446         assertEquals("&", textEvt.getArgs()[0]);
447 
448         textEvt = it.next();
449         assertEquals("text", textEvt.getName());
450         assertEquals("\u0159", textEvt.getArgs()[0]);
451 
452         textEvt = it.next();
453         assertEquals("text", textEvt.getName());
454         assertEquals("\u0159", textEvt.getArgs()[0]);
455 
456         textEvt = it.next();
457         assertEquals("text", textEvt.getName());
458         assertEquals("\uD835\uDFED", (String) textEvt.getArgs()[0]);
459 
460         textEvt = it.next();
461         assertEquals("text", textEvt.getName());
462         assertEquals("\uD835\uDFED", textEvt.getArgs()[0]);
463 
464         textEvt = it.next();
465         assertEquals("text", textEvt.getName());
466         assertEquals("A", textEvt.getArgs()[0]);
467 
468         assertEquals("paragraph_", it.next().getName());
469         // FIXME
470         //        assertEquals("section1_", it.next().getName());
471 
472         assertFalse(it.hasNext());
473     }
474 
475     @Test
476     public void testXhtmlEntities() throws Exception {
477         final String text = "<body><h1>&quot;&amp;</h1><p>&apos;&lt;&gt;</p></body>";
478 
479         parser.parse(text, sink);
480 
481         Iterator<SinkEventElement> it = sink.getEventList().iterator();
482 
483         assertEquals("section1", it.next().getName());
484         assertEquals("sectionTitle1", it.next().getName());
485 
486         SinkEventElement textEvt = it.next();
487         assertEquals("text", textEvt.getName());
488         assertEquals("\"", textEvt.getArgs()[0]);
489 
490         textEvt = it.next();
491         assertEquals("text", textEvt.getName());
492         assertEquals("&", textEvt.getArgs()[0]);
493 
494         assertEquals("sectionTitle1_", it.next().getName());
495         assertEquals("paragraph", it.next().getName());
496 
497         textEvt = it.next();
498         assertEquals("text", textEvt.getName());
499         assertEquals("\'", textEvt.getArgs()[0]);
500 
501         textEvt = it.next();
502         assertEquals("text", textEvt.getName());
503         assertEquals("<", textEvt.getArgs()[0]);
504 
505         textEvt = it.next();
506         assertEquals("text", textEvt.getName());
507         assertEquals(">", textEvt.getArgs()[0]);
508 
509         assertEquals("paragraph_", it.next().getName());
510 
511         assertFalse(it.hasNext());
512     }
513 
514     @Test
515     public void testLists() throws Exception {
516         String text = "<div><ul><li></li></ul><ol><li></li></ol><dl><dt></dt><dd></dd></dl></div>";
517         parser.parse(text, sink);
518         Iterator<SinkEventElement> it = sink.getEventList().iterator();
519 
520         assertEquals("division", it.next().getName());
521         assertEquals("list", it.next().getName());
522         assertEquals("listItem", it.next().getName());
523         assertEquals("listItem_", it.next().getName());
524         assertEquals("list_", it.next().getName());
525 
526         assertEquals("numberedList", it.next().getName());
527         assertEquals("numberedListItem", it.next().getName());
528         assertEquals("numberedListItem_", it.next().getName());
529         assertEquals("numberedList_", it.next().getName());
530 
531         assertEquals("definitionList", it.next().getName());
532         assertEquals("definitionListItem", it.next().getName());
533         assertEquals("definedTerm", it.next().getName());
534         assertEquals("definedTerm_", it.next().getName());
535         assertEquals("definition", it.next().getName());
536         assertEquals("definition_", it.next().getName());
537         assertEquals("definitionListItem_", it.next().getName());
538         assertEquals("definitionList_", it.next().getName());
539         assertEquals("division_", it.next().getName());
540     }
541 
542     @Test
543     public void testSimpleTags() throws Exception {
544         String text = "<div><br /><wbr /><hr /><img src=\"img.src\"/></div>";
545         parser.parse(text, sink);
546         Iterator<SinkEventElement> it = sink.getEventList().iterator();
547 
548         assertEquals("division", it.next().getName());
549         assertEquals("lineBreak", it.next().getName());
550         assertEquals("lineBreakOpportunity", it.next().getName());
551         assertEquals("horizontalRule", it.next().getName());
552         assertEquals("figureGraphics", it.next().getName());
553         assertEquals("division_", it.next().getName());
554     }
555 
556     @Test
557     public void testSemanticTags() throws Exception {
558         String text =
559                 "<em><strong><small><s><cite><q><dfn><abbr><i><b><code><var><samp><kbd><sup><sub><u><mark><ruby><rb><rt><rtc><rp><bdi><bdo><span><ins><del>a text &amp; &#xc6;</del></ins></span></bdo></bdi></rp></rtc></rt></rb></ruby></mark></u></sub></sup></kbd></samp></var></code></b></i></abbr></dfn></q></cite></s></small></strong></em>";
560         parser.parse(text, sink);
561         Iterator<SinkEventElement> it = sink.getEventList().iterator();
562 
563         SinkEventElement event = it.next();
564         assertEquals("inline", event.getName());
565         assertEquals("semantics=emphasis", event.getArgs()[0].toString().trim());
566 
567         event = it.next();
568         assertEquals("inline", event.getName());
569         assertEquals("semantics=strong", event.getArgs()[0].toString().trim());
570 
571         event = it.next();
572         assertEquals("inline", event.getName());
573         assertEquals("semantics=small", event.getArgs()[0].toString().trim());
574 
575         event = it.next();
576         assertEquals("inline", event.getName());
577         assertEquals("semantics=line-through", event.getArgs()[0].toString().trim());
578 
579         event = it.next();
580         assertEquals("inline", event.getName());
581         assertEquals("semantics=citation", event.getArgs()[0].toString().trim());
582 
583         event = it.next();
584         assertEquals("inline", event.getName());
585         assertEquals("semantics=quote", event.getArgs()[0].toString().trim());
586 
587         event = it.next();
588         assertEquals("inline", event.getName());
589         assertEquals("semantics=definition", event.getArgs()[0].toString().trim());
590 
591         event = it.next();
592         assertEquals("inline", event.getName());
593         assertEquals("semantics=abbreviation", event.getArgs()[0].toString().trim());
594 
595         event = it.next();
596         assertEquals("inline", event.getName());
597         assertEquals("semantics=italic", event.getArgs()[0].toString().trim());
598 
599         event = it.next();
600         assertEquals("inline", event.getName());
601         assertEquals("semantics=bold", event.getArgs()[0].toString().trim());
602 
603         event = it.next();
604         assertEquals("inline", event.getName());
605         assertEquals("semantics=code", event.getArgs()[0].toString().trim());
606 
607         event = it.next();
608         assertEquals("inline", event.getName());
609         assertEquals("semantics=variable", event.getArgs()[0].toString().trim());
610 
611         event = it.next();
612         assertEquals("inline", event.getName());
613         assertEquals("semantics=sample", event.getArgs()[0].toString().trim());
614 
615         event = it.next();
616         assertEquals("inline", event.getName());
617         assertEquals("semantics=keyboard", event.getArgs()[0].toString().trim());
618 
619         event = it.next();
620         assertEquals("inline", event.getName());
621         assertEquals("semantics=superscript", event.getArgs()[0].toString().trim());
622 
623         event = it.next();
624         assertEquals("inline", event.getName());
625         assertEquals("semantics=subscript", event.getArgs()[0].toString().trim());
626 
627         event = it.next();
628         assertEquals("inline", event.getName());
629         assertEquals("semantics=annotation", event.getArgs()[0].toString().trim());
630 
631         event = it.next();
632         assertEquals("inline", event.getName());
633         assertEquals("semantics=highlight", event.getArgs()[0].toString().trim());
634 
635         event = it.next();
636         assertEquals("inline", event.getName());
637         assertEquals("semantics=ruby", event.getArgs()[0].toString().trim());
638 
639         event = it.next();
640         assertEquals("inline", event.getName());
641         assertEquals("semantics=rubyBase", event.getArgs()[0].toString().trim());
642 
643         event = it.next();
644         assertEquals("inline", event.getName());
645         assertEquals("semantics=rubyText", event.getArgs()[0].toString().trim());
646 
647         event = it.next();
648         assertEquals("inline", event.getName());
649         assertEquals(
650                 "semantics=rubyTextContainer", event.getArgs()[0].toString().trim());
651 
652         event = it.next();
653         assertEquals("inline", event.getName());
654         assertEquals("semantics=rubyParentheses", event.getArgs()[0].toString().trim());
655 
656         event = it.next();
657         assertEquals("inline", event.getName());
658         assertEquals(
659                 "semantics=bidirectionalIsolation",
660                 event.getArgs()[0].toString().trim());
661 
662         event = it.next();
663         assertEquals("inline", event.getName());
664         assertEquals(
665                 "semantics=bidirectionalOverride", event.getArgs()[0].toString().trim());
666 
667         event = it.next();
668         assertEquals("inline", event.getName());
669         assertEquals("semantics=phrase", event.getArgs()[0].toString().trim());
670 
671         event = it.next();
672         assertEquals("inline", event.getName());
673         assertEquals("semantics=insert", event.getArgs()[0].toString().trim());
674 
675         event = it.next();
676         assertEquals("inline", event.getName());
677         assertEquals("semantics=delete", event.getArgs()[0].toString().trim());
678 
679         assertEquals("text", it.next().getName());
680         assertEquals("text", it.next().getName());
681         assertEquals("text", it.next().getName());
682         assertEquals("text", it.next().getName());
683 
684         assertEquals("inline_", it.next().getName());
685         assertEquals("inline_", it.next().getName());
686         assertEquals("inline_", it.next().getName());
687         assertEquals("inline_", it.next().getName());
688         assertEquals("inline_", it.next().getName());
689         assertEquals("inline_", it.next().getName());
690         assertEquals("inline_", it.next().getName());
691         assertEquals("inline_", it.next().getName());
692         assertEquals("inline_", it.next().getName());
693         assertEquals("inline_", it.next().getName());
694         assertEquals("inline_", it.next().getName());
695         assertEquals("inline_", it.next().getName());
696         assertEquals("inline_", it.next().getName());
697         assertEquals("inline_", it.next().getName());
698         assertEquals("inline_", it.next().getName());
699         assertEquals("inline_", it.next().getName());
700         assertEquals("inline_", it.next().getName());
701         assertEquals("inline_", it.next().getName());
702         assertEquals("inline_", it.next().getName());
703         assertEquals("inline_", it.next().getName());
704         assertEquals("inline_", it.next().getName());
705         assertEquals("inline_", it.next().getName());
706         assertEquals("inline_", it.next().getName());
707         assertEquals("inline_", it.next().getName());
708         assertEquals("inline_", it.next().getName());
709         assertEquals("inline_", it.next().getName());
710         assertEquals("inline_", it.next().getName());
711         assertEquals("inline_", it.next().getName());
712     }
713 
714     @Test
715     public void testSpecial() throws Exception {
716         String text = "<p><!-- a pagebreak: --><!-- PB -->&nbsp;&#160;<unknown /></p>";
717         parser.parse(text, sink);
718         Iterator<SinkEventElement> it = sink.getEventList().iterator();
719 
720         assertEquals("paragraph", it.next().getName());
721         assertEquals("comment", it.next().getName());
722         assertEquals("pageBreak", it.next().getName());
723         assertEquals("nonBreakingSpace", it.next().getName());
724         assertEquals("nonBreakingSpace", it.next().getName());
725         // unknown events are not reported by the base parser
726         assertEquals("paragraph_", it.next().getName());
727     }
728 
729     @Test
730     public void testTable() throws Exception {
731         String text = "<table><caption></caption><tr><th></th></tr><tr><td></td></tr></table>";
732         parser.parse(text, sink);
733         Iterator<SinkEventElement> it = sink.getEventList().iterator();
734 
735         assertEquals("table", it.next().getName());
736 
737         // DOXIA-374
738         SinkEventElement el = it.next();
739         assertEquals("tableRows", el.getName());
740         assertFalse((Boolean) el.getArgs()[1]);
741 
742         assertEquals("tableCaption", it.next().getName());
743         assertEquals("tableCaption_", it.next().getName());
744         assertEquals("tableRow", it.next().getName());
745         assertEquals("tableHeaderCell", it.next().getName());
746         assertEquals("tableHeaderCell_", it.next().getName());
747         assertEquals("tableRow_", it.next().getName());
748         assertEquals("tableRow", it.next().getName());
749         assertEquals("tableCell", it.next().getName());
750         assertEquals("tableCell_", it.next().getName());
751         assertEquals("tableRow_", it.next().getName());
752         assertEquals("tableRows_", it.next().getName());
753         assertEquals("table_", it.next().getName());
754     }
755 
756     @Test
757     public void testFigure() throws Exception {
758         String text = "<figure><img src=\"src.jpg\"/><figcaption></figcaption></figure>";
759         parser.parse(text, sink);
760         Iterator<SinkEventElement> it = sink.getEventList().iterator();
761 
762         assertEquals("figure", it.next().getName());
763         assertEquals("figureGraphics", it.next().getName());
764         assertEquals("figureCaption", it.next().getName());
765         assertEquals("figureCaption_", it.next().getName());
766         assertEquals("figure_", it.next().getName());
767     }
768 
769     @Test
770     public void testLink() throws Exception {
771         // param1 value = "/&ΓΌ" URL encoded twice!
772         String text = "<div><a href=\"http://www.fo.com/index.html&amp;param1=%252F%2526%25C3%25BC\"></a></div>";
773 
774         parser.parse(text, sink);
775         Iterator<SinkEventElement> it = sink.getEventList().iterator();
776 
777         SinkEventElement element = it.next();
778         assertEquals("division", element.getName());
779 
780         element = it.next();
781         assertEquals("link", element.getName());
782         assertEquals("http://www.fo.com/index.html&param1=%252F%2526%25C3%25BC", element.getArgs()[0]);
783         assertEquals("link_", it.next().getName());
784 
785         element = it.next();
786         assertEquals("division_", element.getName());
787     }
788 
789     @Test
790     public void testAnchorLink() throws Exception {
791         String text = "<div><a href=\"\"></a>" + "<a href=\"valid\"></a>"
792                 + "<a href=\"#1invalid\"></a>"
793                 + "<a href=\"http://www.fo.com/index.html#1invalid\"></a>"
794                 + "<a id=\"valid\"></a>"
795                 + "<a id=\"1invalid\"></a>"
796                 + "<a id=\"1invalid\"></a></div>";
797 
798         parser.parse(text, sink);
799         Iterator<SinkEventElement> it = sink.getEventList().iterator();
800 
801         SinkEventElement element = it.next();
802         assertEquals("division", element.getName());
803 
804         element = it.next();
805         assertEquals("link", element.getName());
806         assertEquals("", element.getArgs()[0]);
807         assertEquals("link_", it.next().getName());
808 
809         element = it.next();
810         assertEquals("link", element.getName());
811         assertEquals("valid", element.getArgs()[0]);
812         assertEquals("link_", it.next().getName());
813 
814         element = it.next();
815         assertEquals("link", element.getName());
816         assertEquals("#a1invalid", element.getArgs()[0]);
817         assertEquals("link_", it.next().getName());
818 
819         element = it.next();
820         assertEquals("link", element.getName());
821         assertEquals("http://www.fo.com/index.html#1invalid", element.getArgs()[0]);
822         assertEquals("link_", it.next().getName());
823 
824         element = it.next();
825         assertEquals("anchor", element.getName());
826         assertEquals("valid", element.getArgs()[0]);
827         assertEquals("anchor_", it.next().getName());
828 
829         element = it.next();
830         assertEquals("anchor", element.getName());
831         assertEquals("a1invalid", element.getArgs()[0]);
832         assertEquals("anchor_", it.next().getName());
833 
834         element = it.next();
835         assertEquals("anchor", element.getName());
836         assertEquals("a1invalid", element.getArgs()[0]);
837         assertEquals("anchor_", it.next().getName());
838 
839         element = it.next();
840         assertEquals("division_", element.getName());
841     }
842 
843     /**
844      * Test entities in attributes.
845      *
846      * @throws java.lang.Exception if any.
847      */
848     @Test
849     public void testAttributeEntities() throws Exception {
850         String text = "<script src=\"http://ex.com/ex.js?v=l&amp;l=e\"></script>";
851 
852         parser.parse(text, sink);
853 
854         Iterator<SinkEventElement> it = sink.getEventList().iterator();
855 
856         SinkEventElement event = it.next();
857 
858         assertEquals("unknown", event.getName());
859         assertEquals("script", event.getArgs()[0]);
860         SinkEventAttributeSet attribs = (SinkEventAttributeSet) event.getArgs()[2];
861         // ampersand should be un-escaped
862         assertEquals("http://ex.com/ex.js?v=l&l=e", attribs.getAttribute("src"));
863         assertEquals("unknown", it.next().getName());
864         assertFalse(it.hasNext());
865 
866         sink.reset();
867         text = "<img src=\"http://ex.com/ex.jpg?v=l&amp;l=e\" alt=\"image\"/>";
868         parser.parse(text, sink);
869 
870         it = sink.getEventList().iterator();
871         event = it.next();
872         assertEquals("figureGraphics", event.getName());
873         attribs = (SinkEventAttributeSet) event.getArgs()[1];
874         // ampersand should be un-escaped
875         assertEquals("http://ex.com/ex.jpg?v=l&l=e", attribs.getAttribute("src"));
876     }
877 
878     @Test
879     public void testUnbalancedDefinitionListItem() throws Exception {
880         String text = "<body><dl><dt>key</dt><dd>value</dd></dl>" + "<dl><dd>value</dd></dl>"
881                 + "<dl><dt>key</dt></dl>"
882                 + "<dl></dl>"
883                 + "<dl><dd>value</dd><dt>key</dt></dl></body>";
884 
885         parser.parse(text, sink);
886 
887         Iterator<SinkEventElement> it = sink.getEventList().iterator();
888         assertSinkStartsWith(
889                 it,
890                 "definitionList",
891                 "definitionListItem",
892                 "definedTerm",
893                 "text",
894                 "definedTerm_",
895                 "definition",
896                 "text",
897                 "definition_",
898                 "definitionListItem_",
899                 "definitionList_");
900         assertSinkStartsWith(
901                 it,
902                 "definitionList",
903                 "definitionListItem",
904                 "definition",
905                 "text",
906                 "definition_",
907                 "definitionListItem_",
908                 "definitionList_");
909         assertSinkStartsWith(
910                 it,
911                 "definitionList",
912                 "definitionListItem",
913                 "definedTerm",
914                 "text",
915                 "definedTerm_",
916                 "definitionListItem_",
917                 "definitionList_");
918         assertSinkStartsWith(it, "definitionList", "definitionList_");
919         assertSinkEquals(
920                 it,
921                 "definitionList",
922                 "definitionListItem",
923                 "definition",
924                 "text",
925                 "definition_",
926                 "definitionListItem_",
927                 "definitionListItem",
928                 "definedTerm",
929                 "text",
930                 "definedTerm_",
931                 "definitionListItem_",
932                 "definitionList_");
933     }
934 
935     @Override
936     protected String getVerbatimSource() {
937         return "<pre>&lt;&gt;{}=#*</pre>";
938     }
939 
940     @Override
941     protected String getVerbatimCodeSource() {
942         return "<pre><code>&lt;&gt;{}=#*</code></pre>";
943     }
944 }