View Javadoc

1   package org.apache.maven.doxia.parser;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.Reader;
23  import java.util.HashMap;
24  import java.util.Map;
25  import java.util.Set;
26  import java.util.TreeSet;
27  
28  import javax.swing.text.html.HTML.Attribute;
29  
30  import org.apache.maven.doxia.macro.MacroExecutionException;
31  import org.apache.maven.doxia.markup.HtmlMarkup;
32  import org.apache.maven.doxia.sink.Sink;
33  import org.apache.maven.doxia.sink.SinkEventAttributeSet;
34  import org.apache.maven.doxia.sink.SinkEventAttributes;
35  import org.apache.maven.doxia.util.DoxiaUtils;
36  
37  import org.codehaus.plexus.util.StringUtils;
38  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
39  import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
40  
41  /**
42   * Common base parser for xhtml events.
43   *
44   * @author <a href="mailto:jason@maven.org">Jason van Zyl</a>
45   * @author ltheussl
46   * @version $Id: XhtmlBaseParser.java 1185112 2011-10-17 11:33:00Z ltheussl $
47   * @since 1.1
48   */
49  public class XhtmlBaseParser
50      extends AbstractXmlParser
51          implements HtmlMarkup
52  {
53      /** True if a &lt;script&gt;&lt;/script&gt; block is read. CDATA sections within are handled as rawText. */
54      private boolean scriptBlock;
55  
56      /** Used to distinguish &lt;a href=""&gt; from &lt;a name=""&gt;. */
57      private boolean isLink;
58  
59      /** Used to distinguish &lt;a href=""&gt; from &lt;a name=""&gt;. */
60      private boolean isAnchor;
61  
62      /** Used for nested lists. */
63      private int orderedListDepth = 0;
64  
65      /** Counts section level. */
66      private int sectionLevel;
67  
68      /** Verbatim flag, true whenever we are inside a &lt;pre&gt; tag. */
69      private boolean inVerbatim;
70  
71      /** Used to recognize the case of img inside figure. */
72      private boolean inFigure;
73  
74      /** Decoration properties, eg for texts. */
75      private final SinkEventAttributeSet decoration = new SinkEventAttributeSet();
76  
77      /** Map of warn messages, keyed by a String describing the error type, with the Set of messages as value.
78       * Used to avoid logging the same warn message more than once. */
79      private Map<String, Set<String>> warnMessages;
80  
81      /** {@inheritDoc} */
82      @Override
83      public void parse( Reader source, Sink sink )
84          throws ParseException
85      {
86          init();
87  
88          try
89          {
90              super.parse( source, sink );
91          }
92          finally
93          {
94              logWarnings();
95  
96              setSecondParsing( false );
97              init();
98          }
99      }
100 
101     /**
102      * <p>
103      *   Goes through a common list of possible html start tags. These include only tags that can go into
104      *   the body of a xhtml document and so should be re-usable by different xhtml-based parsers.
105      * </p>
106      * <p>
107      *   The currently handled tags are:
108      * </p>
109      * <p>
110      *   <code>
111      *      &lt;h2&gt;, &lt;h3&gt;, &lt;h4&gt;, &lt;h5&gt;, &lt;h6&gt;, &lt;p&gt;, &lt;pre&gt;,
112      *      &lt;ul&gt;, &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;b&gt;, &lt;strong&gt;,
113      *      &lt;i&gt;, &lt;em&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;tt&gt;, &lt;a&gt;, &lt;table&gt;, &lt;tr&gt;,
114      *      &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;hr/&gt;, &lt;img/&gt;.
115      *   </code>
116      * </p>
117      *
118      * @param parser A parser.
119      * @param sink the sink to receive the events.
120      * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
121      */
122     protected boolean baseStartTag( XmlPullParser parser, Sink sink )
123     {
124         boolean visited = true;
125 
126         SinkEventAttributeSet attribs = getAttributesFromParser( parser );
127 
128         if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
129         {
130             handleSectionStart( sink, Sink.SECTION_LEVEL_1, attribs );
131         }
132         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
133         {
134             handleSectionStart( sink, Sink.SECTION_LEVEL_2, attribs );
135         }
136         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
137         {
138             handleSectionStart( sink, Sink.SECTION_LEVEL_3, attribs );
139         }
140         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
141         {
142             handleSectionStart( sink, Sink.SECTION_LEVEL_4, attribs );
143         }
144         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
145         {
146             handleSectionStart( sink, Sink.SECTION_LEVEL_5, attribs );
147         }
148         else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
149         {
150             decoration.addAttribute( SinkEventAttributes.DECORATION, "underline" );
151         }
152         else if ( parser.getName().equals( HtmlMarkup.S.toString() )
153                 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
154                 || parser.getName().equals( "del" ) )
155         {
156             decoration.addAttribute( SinkEventAttributes.DECORATION, "line-through" );
157         }
158         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
159         {
160             decoration.addAttribute( SinkEventAttributes.VALIGN, "sub" );
161         }
162         else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
163         {
164             decoration.addAttribute( SinkEventAttributes.VALIGN, "sup" );
165         }
166         else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
167         {
168             handlePStart( sink, attribs );
169         }
170         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
171         {
172             visited = handleDivStart( parser, attribs, sink );
173         }
174         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
175         {
176             handlePreStart( attribs, sink );
177         }
178         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
179         {
180             sink.list( attribs );
181         }
182         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
183         {
184             handleOLStart( parser, sink, attribs );
185         }
186         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
187         {
188             handleLIStart( sink, attribs );
189         }
190         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
191         {
192             sink.definitionList( attribs );
193         }
194         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
195         {
196             sink.definitionListItem( attribs );
197             sink.definedTerm( attribs );
198         }
199         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
200         {
201             sink.definition( attribs );
202         }
203         else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
204                 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
205         {
206             sink.bold();
207         }
208         else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
209                 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
210         {
211             handleFigureCaptionStart( sink, attribs );
212         }
213         else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
214                 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
215                 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
216         {
217             sink.monospaced();
218         }
219         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
220         {
221             handleAStart( parser, sink, attribs );
222         }
223         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
224         {
225             handleTableStart( sink, attribs, parser );
226         }
227         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
228         {
229             sink.tableRow( attribs );
230         }
231         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
232         {
233             sink.tableHeaderCell( attribs );
234         }
235         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
236         {
237             sink.tableCell( attribs );
238         }
239         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
240         {
241             sink.tableCaption( attribs );
242         }
243         else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
244         {
245             sink.lineBreak( attribs );
246         }
247         else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
248         {
249             sink.horizontalRule( attribs );
250         }
251         else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
252         {
253             handleImgStart( parser, sink, attribs );
254         }
255         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() ) )
256         {
257             handleUnknown( parser, sink, TAG_TYPE_START );
258             scriptBlock = true;
259         }
260         else
261         {
262             visited = false;
263         }
264 
265         return visited;
266     }
267 
268     /**
269      * <p>
270      *   Goes through a common list of possible html end tags.
271      *   These should be re-usable by different xhtml-based parsers.
272      *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
273      *   except for the empty elements (<code>&lt;br/&gt;, &lt;hr/&gt;, &lt;img/&gt;<code>).
274      * </p>
275      *
276      * @param parser A parser.
277      * @param sink the sink to receive the events.
278      * @return True if the event has been handled by this method, false otherwise.
279      */
280     protected boolean baseEndTag( XmlPullParser parser, Sink sink )
281     {
282         boolean visited = true;
283 
284         if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
285         {
286             if ( !inFigure )
287             {
288                 sink.paragraph_();
289             }
290         }
291         else if ( parser.getName().equals( HtmlMarkup.U.toString() )
292                 || parser.getName().equals( HtmlMarkup.S.toString() )
293                 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
294                 || parser.getName().equals( "del" ) )
295         {
296             decoration.removeAttribute( SinkEventAttributes.DECORATION );
297         }
298         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() )
299                 || parser.getName().equals( HtmlMarkup.SUP.toString() ) )
300         {
301             decoration.removeAttribute( SinkEventAttributes.VALIGN );
302         }
303         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
304         {
305             if ( inFigure )
306             {
307                 sink.figure_();
308                 this.inFigure = false;
309             }
310             else
311             {
312                 visited = false;
313             }
314         }
315         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
316         {
317             verbatim_();
318 
319             sink.verbatim_();
320         }
321         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
322         {
323             sink.list_();
324         }
325         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
326         {
327             sink.numberedList_();
328             orderedListDepth--;
329         }
330         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
331         {
332             handleListItemEnd( sink );
333         }
334         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
335         {
336             sink.definitionList_();
337         }
338         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
339         {
340             sink.definedTerm_();
341         }
342         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
343         {
344             sink.definition_();
345             sink.definitionListItem_();
346         }
347         else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
348                 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
349         {
350             sink.bold_();
351         }
352         else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
353                 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
354         {
355             handleFigureCaptionEnd( sink );
356         }
357         else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
358                 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
359                 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
360         {
361             sink.monospaced_();
362         }
363         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
364         {
365             handleAEnd( sink );
366         }
367 
368         // ----------------------------------------------------------------------
369         // Tables
370         // ----------------------------------------------------------------------
371 
372         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
373         {
374             sink.tableRows_();
375 
376             sink.table_();
377         }
378         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
379         {
380             sink.tableRow_();
381         }
382         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
383         {
384             sink.tableHeaderCell_();
385         }
386         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
387         {
388             sink.tableCell_();
389         }
390         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
391         {
392             sink.tableCaption_();
393         }
394         else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
395         {
396             sink.sectionTitle1_();
397         }
398         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
399         {
400             sink.sectionTitle2_();
401         }
402         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
403         {
404             sink.sectionTitle3_();
405         }
406         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
407         {
408             sink.sectionTitle4_();
409         }
410         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
411         {
412             sink.sectionTitle5_();
413         }
414         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() ) )
415         {
416             handleUnknown( parser, sink, TAG_TYPE_END );
417 
418             scriptBlock = false;
419         }
420         else
421         {
422             visited = false;
423         }
424 
425         return visited;
426     }
427 
428     /**
429      * {@inheritDoc}
430      *
431      * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
432      * overridden by implementing parsers to include additional tags.
433      */
434     protected void handleStartTag( XmlPullParser parser, Sink sink )
435         throws XmlPullParserException, MacroExecutionException
436     {
437         if ( !baseStartTag( parser, sink ) )
438         {
439             if ( getLog().isWarnEnabled() )
440             {
441                 String position = "[" + parser.getLineNumber() + ":"
442                     + parser.getColumnNumber() + "]";
443                 String tag = "<" + parser.getName() + ">";
444 
445                 getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
446             }
447         }
448     }
449 
450     /**
451      * {@inheritDoc}
452      *
453      * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
454      * overridden by implementing parsers to include additional tags.
455      */
456     protected void handleEndTag( XmlPullParser parser, Sink sink )
457         throws XmlPullParserException, MacroExecutionException
458     {
459         if ( !baseEndTag( parser, sink ) )
460         {
461             // unrecognized tag is already logged in StartTag
462         }
463     }
464 
465     /** {@inheritDoc} */
466     @Override
467     protected void handleText( XmlPullParser parser, Sink sink )
468         throws XmlPullParserException
469     {
470         String text = getText( parser );
471 
472         /*
473          * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
474          * parser so any whitespace that makes it here is significant.
475          *
476          * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
477          */
478         if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
479         {
480             sink.text( text, decoration );
481         }
482     }
483 
484     /** {@inheritDoc} */
485     @Override
486     protected void handleComment( XmlPullParser parser, Sink sink )
487         throws XmlPullParserException
488     {
489         String text = getText( parser ).trim();
490 
491         if ( "PB".equals( text ) )
492         {
493             sink.pageBreak();
494         }
495         else
496         {
497             sink.comment( text );
498         }
499     }
500 
501     /** {@inheritDoc} */
502     @Override
503     protected void handleCdsect( XmlPullParser parser, Sink sink )
504         throws XmlPullParserException
505     {
506         String text = getText( parser );
507 
508         if ( isScriptBlock() )
509         {
510             sink.unknown( CDATA, new Object[] {new Integer( CDATA_TYPE ), text}, null );
511         }
512         else
513         {
514             sink.text( text );
515         }
516     }
517 
518     /**
519      * Make sure sections are nested consecutively.
520      *
521      * <p>
522      * HTML doesn't have any sections, only sectionTitles (&lt;h2&gt; etc), that means we have to
523      * open close any sections that are missing in between.
524      * </p>
525      *
526      * <p>
527      * For instance, if the following sequence is parsed:
528      * <pre>
529      * &lt;h3&gt;&lt;/h3&gt;
530      * &lt;h6&gt;&lt;/h6&gt;
531      * </pre>
532      * we have to insert two section starts before we open the <code>&lt;h6&gt;</code>.
533      * In the following sequence
534      * <pre>
535      * &lt;h6&gt;&lt;/h6&gt;
536      * &lt;h3&gt;&lt;/h3&gt;
537      * </pre>
538      * we have to close two sections before we open the <code>&lt;h3&gt;</code>.
539      * </p>
540      *
541      * <p>The current level is set to newLevel afterwards.</p>
542      *
543      * @param newLevel the new section level, all upper levels have to be closed.
544      * @param sink the sink to receive the events.
545      */
546     protected void consecutiveSections( int newLevel, Sink sink )
547     {
548         closeOpenSections( newLevel, sink );
549         openMissingSections( newLevel, sink );
550 
551         this.sectionLevel = newLevel;
552     }
553 
554     /**
555      * Close open sections.
556      *
557      * @param newLevel the new section level, all upper levels have to be closed.
558      * @param sink the sink to receive the events.
559      */
560     private void closeOpenSections( int newLevel, Sink sink )
561     {
562         while ( this.sectionLevel >= newLevel )
563         {
564             if ( sectionLevel == Sink.SECTION_LEVEL_5 )
565             {
566                 sink.section5_();
567             }
568             else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
569             {
570                 sink.section4_();
571             }
572             else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
573             {
574                 sink.section3_();
575             }
576             else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
577             {
578                 sink.section2_();
579             }
580             else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
581             {
582                 sink.section1_();
583             }
584 
585             this.sectionLevel--;
586         }
587     }
588 
589     /**
590      * Open missing sections.
591      *
592      * @param newLevel the new section level, all lower levels have to be opened.
593      * @param sink the sink to receive the events.
594      */
595     private void openMissingSections( int newLevel, Sink sink )
596     {
597         while ( this.sectionLevel < newLevel - 1 )
598         {
599             this.sectionLevel++;
600 
601             if ( sectionLevel == Sink.SECTION_LEVEL_5 )
602             {
603                 sink.section5();
604             }
605             else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
606             {
607                 sink.section4();
608             }
609             else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
610             {
611                 sink.section3();
612             }
613             else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
614             {
615                 sink.section2();
616             }
617             else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
618             {
619                 sink.section1();
620             }
621         }
622     }
623 
624     /**
625      * Return the current section level.
626      *
627      * @return the current section level.
628      */
629     protected int getSectionLevel()
630     {
631         return this.sectionLevel;
632     }
633 
634     /**
635      * Set the current section level.
636      *
637      * @param newLevel the new section level.
638      */
639     protected void setSectionLevel( int newLevel )
640     {
641         this.sectionLevel = newLevel;
642     }
643 
644     /**
645      * Stop verbatim mode.
646      */
647     protected void verbatim_()
648     {
649         this.inVerbatim = false;
650     }
651 
652     /**
653      * Start verbatim mode.
654      */
655     protected void verbatim()
656     {
657         this.inVerbatim = true;
658     }
659 
660     /**
661      * Checks if we are currently inside a &lt;pre&gt; tag.
662      *
663      * @return true if we are currently in verbatim mode.
664      */
665     protected boolean isVerbatim()
666     {
667         return this.inVerbatim;
668     }
669 
670     /**
671      * Checks if we are currently inside a &lt;script&gt; tag.
672      *
673      * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
674      *
675      * @since 1.1.1.
676      */
677     protected boolean isScriptBlock()
678     {
679         return this.scriptBlock;
680     }
681 
682     /**
683      * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
684      *
685      * @param id The id to validate.
686      * @return A transformed id or the original id if it was already valid.
687      * @see DoxiaUtils#encodeId(String)
688      */
689     protected String validAnchor( String id )
690     {
691         if ( !DoxiaUtils.isValidId( id ) )
692         {
693             String linkAnchor = DoxiaUtils.encodeId( id, true );
694 
695             String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
696             logMessage( "modifiedLink", msg );
697 
698             return linkAnchor;
699         }
700 
701         return id;
702     }
703 
704     /** {@inheritDoc} */
705     @Override
706     protected void init()
707     {
708         super.init();
709 
710         this.scriptBlock = false;
711         this.isLink = false;
712         this.isAnchor = false;
713         this.orderedListDepth = 0;
714         this.sectionLevel = 0;
715         this.inVerbatim = false;
716         this.inFigure = false;
717         while ( this.decoration.getAttributeNames().hasMoreElements() )
718         {
719             this.decoration.removeAttribute( this.decoration.getAttributeNames().nextElement() );
720         }
721         this.warnMessages = null;
722     }
723 
724     private void handleAEnd( Sink sink )
725     {
726         if ( isLink )
727         {
728             sink.link_();
729             isLink = false;
730         }
731         else if ( isAnchor )
732         {
733             sink.anchor_();
734             isAnchor = false;
735         }
736     }
737 
738     private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
739     {
740         String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
741 
742         if ( href != null )
743         {
744             int hashIndex = href.indexOf( '#');
745             if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
746             {
747                 String hash = href.substring( hashIndex + 1 );
748 
749                 if ( !DoxiaUtils.isValidId( hash ) )
750                 {
751                     href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
752 
753                     String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
754                     logMessage( "modifiedLink", msg );
755                 }
756             }
757             sink.link( href, attribs );
758             isLink = true;
759         }
760         else
761         {
762             String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
763 
764             if ( name != null )
765             {
766                 sink.anchor( validAnchor( name ), attribs );
767                 isAnchor = true;
768             }
769             else
770             {
771                 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
772                 if ( id != null )
773                 {
774                     sink.anchor( validAnchor( id ), attribs );
775                     isAnchor = true;
776                 }
777             }
778         }
779     }
780 
781     private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
782     {
783         boolean visited = true;
784 
785         String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
786 
787         if ( "figure".equals( divclass ) )
788         {
789             this.inFigure = true;
790             SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
791             atts.removeAttribute( SinkEventAttributes.CLASS );
792             sink.figure( atts );
793         }
794         else
795         {
796             visited = false;
797         }
798 
799         return visited;
800     }
801 
802     private void handleFigureCaptionEnd( Sink sink )
803     {
804         if ( inFigure )
805         {
806             sink.figureCaption_();
807         }
808         else
809         {
810             sink.italic_();
811         }
812     }
813 
814     private void handleFigureCaptionStart( Sink sink, SinkEventAttributeSet attribs )
815     {
816         if ( inFigure )
817         {
818             sink.figureCaption( attribs );
819         }
820         else
821         {
822             sink.italic();
823         }
824     }
825 
826     private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
827     {
828         String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
829 
830         if ( src != null )
831         {
832             sink.figureGraphics( src, attribs );
833         }
834     }
835 
836     private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
837     {
838         if ( orderedListDepth == 0 )
839         {
840             sink.listItem( attribs );
841         }
842         else
843         {
844             sink.numberedListItem( attribs );
845         }
846     }
847 
848     private void handleListItemEnd( Sink sink )
849     {
850         if ( orderedListDepth == 0 )
851         {
852             sink.listItem_();
853         }
854         else
855         {
856             sink.numberedListItem_();
857         }
858     }
859 
860     private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
861     {
862         int numbering = Sink.NUMBERING_DECIMAL;
863         // this will have to be generalized if we handle styles
864         String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
865 
866         if ( style != null )
867         {
868             if ( "list-style-type: upper-alpha".equals( style ) )
869             {
870                 numbering = Sink.NUMBERING_UPPER_ALPHA;
871             }
872             else if ( "list-style-type: lower-alpha".equals( style ) )
873             {
874                 numbering = Sink.NUMBERING_LOWER_ALPHA;
875             }
876             else if ( "list-style-type: upper-roman".equals( style ) )
877             {
878                 numbering = Sink.NUMBERING_UPPER_ROMAN;
879             }
880             else if ( "list-style-type: lower-roman".equals( style ) )
881             {
882                 numbering = Sink.NUMBERING_LOWER_ROMAN;
883             }
884             else if ( "list-style-type: decimal".equals( style ) )
885             {
886                 numbering = Sink.NUMBERING_DECIMAL;
887             }
888         }
889 
890         sink.numberedList( numbering, attribs );
891         orderedListDepth++;
892     }
893 
894     private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
895     {
896         if ( !inFigure )
897         {
898             sink.paragraph( attribs );
899         }
900     }
901 
902     /*
903      * The PRE element tells visual user agents that the enclosed text is
904      * "preformatted". When handling preformatted text, visual user agents:
905      * - May leave white space intact.
906      * - May render text with a fixed-pitch font.
907      * - May disable automatic word wrap.
908      * - Must not disable bidirectional processing.
909      * Non-visual user agents are not required to respect extra white space
910      * in the content of a PRE element.
911      */
912     private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
913     {
914         verbatim();
915         attribs.removeAttribute( SinkEventAttributes.DECORATION );
916         sink.verbatim( attribs );
917     }
918 
919     private void handleSectionStart( Sink sink, int level, SinkEventAttributeSet attribs )
920     {
921         consecutiveSections( level, sink );
922         sink.section( level, attribs );
923         sink.sectionTitle( level, attribs );
924     }
925 
926     private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
927     {
928         sink.table( attribs );
929         String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
930         boolean grid = true;
931 
932         if ( border == null || "0".equals( border ) )
933         {
934             grid = false;
935         }
936 
937         String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
938         int[] justif = {Sink.JUSTIFY_LEFT};
939 
940         if ( "center".equals( align ) )
941         {
942             justif[0] = Sink.JUSTIFY_CENTER;
943         }
944         else if ( "right".equals( align ) )
945         {
946             justif[0] = Sink.JUSTIFY_RIGHT;
947         }
948 
949         sink.tableRows( justif, grid );
950     }
951 
952     /**
953      * If debug mode is enabled, log the <code>msg</code> as is, otherwise add unique msg in <code>warnMessages</code>.
954      *
955      * @param key not null
956      * @param msg not null
957      * @see #parse(Reader, Sink)
958      * @since 1.1.1
959      */
960     private void logMessage( String key, String msg )
961     {
962         final String log = "[XHTML Parser] " + msg;
963         if ( getLog().isDebugEnabled() )
964         {
965             getLog().debug( log );
966 
967             return;
968         }
969 
970         if ( warnMessages == null )
971         {
972             warnMessages = new HashMap<String, Set<String>>();
973         }
974 
975         Set<String> set = warnMessages.get( key );
976         if ( set == null )
977         {
978             set = new TreeSet<String>();
979         }
980         set.add( log );
981         warnMessages.put( key, set );
982     }
983 
984     /**
985      * @since 1.1.1
986      */
987     private void logWarnings()
988     {
989         if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
990         {
991             for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() )
992             {
993                 for ( String msg : entry.getValue() )
994                 {
995                     getLog().warn( msg );
996                 }
997             }
998 
999             this.warnMessages = null;
1000         }
1001     }
1002 }