View Javadoc

1   package org.apache.maven.doxia.parser;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.Reader;
23  import java.util.HashMap;
24  import java.util.Map;
25  import java.util.Set;
26  import java.util.TreeSet;
27  
28  import javax.swing.text.html.HTML.Attribute;
29  
30  import org.apache.maven.doxia.macro.MacroExecutionException;
31  import org.apache.maven.doxia.markup.HtmlMarkup;
32  import org.apache.maven.doxia.sink.Sink;
33  import org.apache.maven.doxia.sink.SinkEventAttributeSet;
34  import org.apache.maven.doxia.sink.SinkEventAttributes;
35  import org.apache.maven.doxia.util.DoxiaUtils;
36  
37  import org.codehaus.plexus.util.StringUtils;
38  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
39  import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
40  
41  /**
42   * Common base parser for xhtml events.
43   *
44   * @author <a href="mailto:jason@maven.org">Jason van Zyl</a>
45   * @author ltheussl
46   * @version $Id: XhtmlBaseParser.java 1090706 2011-04-09 23:15:28Z hboutemy $
47   * @since 1.1
48   */
49  public class XhtmlBaseParser
50      extends AbstractXmlParser
51          implements HtmlMarkup
52  {
    /** True if a &lt;script&gt;&lt;/script&gt; block is read. CDATA sections within are handled as rawText. */
    private boolean scriptBlock;

    /** True while a link opened by an &lt;a href=""&gt; start tag is waiting for its end tag. */
    private boolean isLink;

    /** True while an anchor opened by an &lt;a name=""&gt; (or &lt;a id=""&gt;) start tag is waiting for its end tag. */
    private boolean isAnchor;

    /** Used for nested lists: incremented for every &lt;ol&gt; start tag and decremented for every end tag. */
    private int orderedListDepth = 0;

    /** Counts section level. */
    private int sectionLevel;

    /** Verbatim flag, true whenever we are inside a &lt;pre&gt; tag. */
    private boolean inVerbatim;

    /** Used to recognize the case of img inside figure. */
    private boolean inFigure;

    /** Decoration properties, eg for texts. */
    private final SinkEventAttributeSet decoration = new SinkEventAttributeSet();

    /** Map of warn messages with a String as key to describe the error type and a Set as value.
     * Used to reduce the number of warn messages: identical messages are only kept once.
     * Created lazily, so it may be <code>null</code>. */
    private Map<String, Set<String>> warnMessages;
80  
81      /** {@inheritDoc} */
82      public void parse( Reader source, Sink sink )
83          throws ParseException
84      {
85          init();
86  
87          try
88          {
89              super.parse( source, sink );
90          }
91          finally
92          {
93              logWarnings();
94  
95              setSecondParsing( false );
96              init();
97          }
98      }
99  
100     /**
101      * <p>
102      *   Goes through a common list of possible html start tags. These include only tags that can go into
103      *   the body of a xhtml document and so should be re-usable by different xhtml-based parsers.
104      * </p>
105      * <p>
106      *   The currently handled tags are:
107      * </p>
108      * <p>
109      *   <code>
110      *      &lt;h2&gt;, &lt;h3&gt;, &lt;h4&gt;, &lt;h5&gt;, &lt;h6&gt;, &lt;p&gt;, &lt;pre&gt;,
111      *      &lt;ul&gt;, &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;b&gt;, &lt;strong&gt;,
112      *      &lt;i&gt;, &lt;em&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;tt&gt;, &lt;a&gt;, &lt;table&gt;, &lt;tr&gt;,
113      *      &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;hr/&gt;, &lt;img/&gt;.
114      *   </code>
115      * </p>
116      *
117      * @param parser A parser.
118      * @param sink the sink to receive the events.
119      * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
120      */
121     protected boolean baseStartTag( XmlPullParser parser, Sink sink )
122     {
123         boolean visited = true;
124 
125         SinkEventAttributeSet attribs = getAttributesFromParser( parser );
126 
127         if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
128         {
129             handleSectionStart( sink, Sink.SECTION_LEVEL_1, attribs );
130         }
131         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
132         {
133             handleSectionStart( sink, Sink.SECTION_LEVEL_2, attribs );
134         }
135         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
136         {
137             handleSectionStart( sink, Sink.SECTION_LEVEL_3, attribs );
138         }
139         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
140         {
141             handleSectionStart( sink, Sink.SECTION_LEVEL_4, attribs );
142         }
143         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
144         {
145             handleSectionStart( sink, Sink.SECTION_LEVEL_5, attribs );
146         }
147         else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
148         {
149             decoration.addAttribute( SinkEventAttributes.DECORATION, "underline" );
150         }
151         else if ( parser.getName().equals( HtmlMarkup.S.toString() )
152                 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
153                 || parser.getName().equals( "del" ) )
154         {
155             decoration.addAttribute( SinkEventAttributes.DECORATION, "line-through" );
156         }
157         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
158         {
159             decoration.addAttribute( SinkEventAttributes.VALIGN, "sub" );
160         }
161         else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
162         {
163             decoration.addAttribute( SinkEventAttributes.VALIGN, "sup" );
164         }
165         else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
166         {
167             handlePStart( sink, attribs );
168         }
169         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
170         {
171             visited = handleDivStart( parser, attribs, sink );
172         }
173         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
174         {
175             handlePreStart( attribs, sink );
176         }
177         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
178         {
179             sink.list( attribs );
180         }
181         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
182         {
183             handleOLStart( parser, sink, attribs );
184         }
185         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
186         {
187             handleLIStart( sink, attribs );
188         }
189         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
190         {
191             sink.definitionList( attribs );
192         }
193         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
194         {
195             sink.definitionListItem( attribs );
196             sink.definedTerm( attribs );
197         }
198         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
199         {
200             sink.definition( attribs );
201         }
202         else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
203                 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
204         {
205             sink.bold();
206         }
207         else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
208                 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
209         {
210             handleFigureCaptionStart( sink, attribs );
211         }
212         else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
213                 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
214                 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
215         {
216             sink.monospaced();
217         }
218         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
219         {
220             handleAStart( parser, sink, attribs );
221         }
222         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
223         {
224             handleTableStart( sink, attribs, parser );
225         }
226         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
227         {
228             sink.tableRow( attribs );
229         }
230         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
231         {
232             sink.tableHeaderCell( attribs );
233         }
234         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
235         {
236             sink.tableCell( attribs );
237         }
238         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
239         {
240             sink.tableCaption( attribs );
241         }
242         else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
243         {
244             sink.lineBreak( attribs );
245         }
246         else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
247         {
248             sink.horizontalRule( attribs );
249         }
250         else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
251         {
252             handleImgStart( parser, sink, attribs );
253         }
254         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() ) )
255         {
256             handleUnknown( parser, sink, TAG_TYPE_START );
257             scriptBlock = true;
258         }
259         else
260         {
261             visited = false;
262         }
263 
264         return visited;
265     }
266 
267     /**
268      * <p>
269      *   Goes through a common list of possible html end tags.
270      *   These should be re-usable by different xhtml-based parsers.
271      *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
272      *   except for the empty elements (<code>&lt;br/&gt;, &lt;hr/&gt;, &lt;img/&gt;<code>).
273      * </p>
274      *
275      * @param parser A parser.
276      * @param sink the sink to receive the events.
277      * @return True if the event has been handled by this method, false otherwise.
278      */
279     protected boolean baseEndTag( XmlPullParser parser, Sink sink )
280     {
281         boolean visited = true;
282 
283         if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
284         {
285             if ( !inFigure )
286             {
287                 sink.paragraph_();
288             }
289         }
290         else if ( parser.getName().equals( HtmlMarkup.U.toString() )
291                 || parser.getName().equals( HtmlMarkup.S.toString() )
292                 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
293                 || parser.getName().equals( "del" ) )
294         {
295             decoration.removeAttribute( SinkEventAttributes.DECORATION );
296         }
297         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() )
298                 || parser.getName().equals( HtmlMarkup.SUP.toString() ) )
299         {
300             decoration.removeAttribute( SinkEventAttributes.VALIGN );
301         }
302         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
303         {
304             if ( inFigure )
305             {
306                 sink.figure_();
307                 this.inFigure = false;
308             }
309             else
310             {
311                 visited = false;
312             }
313         }
314         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
315         {
316             verbatim_();
317 
318             sink.verbatim_();
319         }
320         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
321         {
322             sink.list_();
323         }
324         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
325         {
326             sink.numberedList_();
327             orderedListDepth--;
328         }
329         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
330         {
331             handleListItemEnd( sink );
332         }
333         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
334         {
335             sink.definitionList_();
336         }
337         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
338         {
339             sink.definedTerm_();
340         }
341         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
342         {
343             sink.definition_();
344             sink.definitionListItem_();
345         }
346         else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
347                 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
348         {
349             sink.bold_();
350         }
351         else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
352                 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
353         {
354             handleFigureCaptionEnd( sink );
355         }
356         else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
357                 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
358                 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
359         {
360             sink.monospaced_();
361         }
362         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
363         {
364             handleAEnd( sink );
365         }
366 
367         // ----------------------------------------------------------------------
368         // Tables
369         // ----------------------------------------------------------------------
370 
371         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
372         {
373             sink.tableRows_();
374 
375             sink.table_();
376         }
377         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
378         {
379             sink.tableRow_();
380         }
381         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
382         {
383             sink.tableHeaderCell_();
384         }
385         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
386         {
387             sink.tableCell_();
388         }
389         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
390         {
391             sink.tableCaption_();
392         }
393         else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
394         {
395             sink.sectionTitle1_();
396         }
397         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
398         {
399             sink.sectionTitle2_();
400         }
401         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
402         {
403             sink.sectionTitle3_();
404         }
405         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
406         {
407             sink.sectionTitle4_();
408         }
409         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
410         {
411             sink.sectionTitle5_();
412         }
413         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() ) )
414         {
415             handleUnknown( parser, sink, TAG_TYPE_END );
416 
417             scriptBlock = false;
418         }
419         else
420         {
421             visited = false;
422         }
423 
424         return visited;
425     }
426 
427     /**
428      * {@inheritDoc}
429      *
430      * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
431      * overridden by implementing parsers to include additional tags.
432      */
433     protected void handleStartTag( XmlPullParser parser, Sink sink )
434         throws XmlPullParserException, MacroExecutionException
435     {
436         if ( !baseStartTag( parser, sink ) )
437         {
438             if ( getLog().isWarnEnabled() )
439             {
440                 String position = "[" + parser.getLineNumber() + ":"
441                     + parser.getColumnNumber() + "]";
442                 String tag = "<" + parser.getName() + ">";
443 
444                 getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
445             }
446         }
447     }
448 
449     /**
450      * {@inheritDoc}
451      *
452      * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
453      * overridden by implementing parsers to include additional tags.
454      */
455     protected void handleEndTag( XmlPullParser parser, Sink sink )
456         throws XmlPullParserException, MacroExecutionException
457     {
458         if ( !baseEndTag( parser, sink ) )
459         {
460             // unrecognized tag is already logged in StartTag
461         }
462     }
463 
464     /** {@inheritDoc} */
465     protected void handleText( XmlPullParser parser, Sink sink )
466         throws XmlPullParserException
467     {
468         String text = getText( parser );
469 
470         /*
471          * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
472          * parser so any whitespace that makes it here is significant.
473          *
474          * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
475          */
476         if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
477         {
478             sink.text( text, decoration );
479         }
480     }
481 
482     /** {@inheritDoc} */
483     protected void handleComment( XmlPullParser parser, Sink sink )
484         throws XmlPullParserException
485     {
486         String text = getText( parser ).trim();
487 
488         if ( "PB".equals( text ) )
489         {
490             sink.pageBreak();
491         }
492         else
493         {
494             sink.comment( text );
495         }
496     }
497 
498     /** {@inheritDoc} */
499     protected void handleCdsect( XmlPullParser parser, Sink sink )
500         throws XmlPullParserException
501     {
502         String text = getText( parser );
503 
504         if ( isScriptBlock() )
505         {
506             sink.unknown( CDATA, new Object[] {new Integer( CDATA_TYPE ), text}, null );
507         }
508         else
509         {
510             sink.text( text );
511         }
512     }
513 
514     /**
515      * Make sure sections are nested consecutively.
516      *
517      * <p>
518      * HTML doesn't have any sections, only sectionTitles (&lt;h2&gt; etc), that means we have to
519      * open close any sections that are missing in between.
520      * </p>
521      *
522      * <p>
523      * For instance, if the following sequence is parsed:
524      * <pre>
525      * &lt;h3&gt;&lt;/h3&gt;
526      * &lt;h6&gt;&lt;/h6&gt;
527      * </pre>
528      * we have to insert two section starts before we open the <code>&lt;h6&gt;</code>.
529      * In the following sequence
530      * <pre>
531      * &lt;h6&gt;&lt;/h6&gt;
532      * &lt;h3&gt;&lt;/h3&gt;
533      * </pre>
534      * we have to close two sections before we open the <code>&lt;h3&gt;</code>.
535      * </p>
536      *
537      * <p>The current level is set to newLevel afterwards.</p>
538      *
539      * @param newLevel the new section level, all upper levels have to be closed.
540      * @param sink the sink to receive the events.
541      */
542     protected void consecutiveSections( int newLevel, Sink sink )
543     {
544         closeOpenSections( newLevel, sink );
545         openMissingSections( newLevel, sink );
546 
547         this.sectionLevel = newLevel;
548     }
549 
550     /**
551      * Close open sections.
552      *
553      * @param newLevel the new section level, all upper levels have to be closed.
554      * @param sink the sink to receive the events.
555      */
556     private void closeOpenSections( int newLevel, Sink sink )
557     {
558         while ( this.sectionLevel >= newLevel )
559         {
560             if ( sectionLevel == Sink.SECTION_LEVEL_5 )
561             {
562                 sink.section5_();
563             }
564             else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
565             {
566                 sink.section4_();
567             }
568             else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
569             {
570                 sink.section3_();
571             }
572             else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
573             {
574                 sink.section2_();
575             }
576             else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
577             {
578                 sink.section1_();
579             }
580 
581             this.sectionLevel--;
582         }
583     }
584 
585     /**
586      * Open missing sections.
587      *
588      * @param newLevel the new section level, all lower levels have to be opened.
589      * @param sink the sink to receive the events.
590      */
591     private void openMissingSections( int newLevel, Sink sink )
592     {
593         while ( this.sectionLevel < newLevel - 1 )
594         {
595             this.sectionLevel++;
596 
597             if ( sectionLevel == Sink.SECTION_LEVEL_5 )
598             {
599                 sink.section5();
600             }
601             else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
602             {
603                 sink.section4();
604             }
605             else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
606             {
607                 sink.section3();
608             }
609             else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
610             {
611                 sink.section2();
612             }
613             else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
614             {
615                 sink.section1();
616             }
617         }
618     }
619 
620     /**
621      * Return the current section level.
622      *
623      * @return the current section level.
624      */
625     protected int getSectionLevel()
626     {
627         return this.sectionLevel;
628     }
629 
630     /**
631      * Set the current section level.
632      *
633      * @param newLevel the new section level.
634      */
635     protected void setSectionLevel( int newLevel )
636     {
637         this.sectionLevel = newLevel;
638     }
639 
640     /**
641      * Stop verbatim mode.
642      */
643     protected void verbatim_()
644     {
645         this.inVerbatim = false;
646     }
647 
648     /**
649      * Start verbatim mode.
650      */
651     protected void verbatim()
652     {
653         this.inVerbatim = true;
654     }
655 
656     /**
657      * Checks if we are currently inside a &lt;pre&gt; tag.
658      *
659      * @return true if we are currently in verbatim mode.
660      */
661     protected boolean isVerbatim()
662     {
663         return this.inVerbatim;
664     }
665 
666     /**
667      * Checks if we are currently inside a &lt;script&gt; tag.
668      *
669      * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
670      *
671      * @since 1.1.1.
672      */
673     protected boolean isScriptBlock()
674     {
675         return this.scriptBlock;
676     }
677 
678     /**
679      * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
680      *
681      * @param id The id to validate.
682      * @return A transformed id or the original id if it was already valid.
683      * @see DoxiaUtils#encodeId(String)
684      */
685     protected String validAnchor( String id )
686     {
687         if ( !DoxiaUtils.isValidId( id ) )
688         {
689             String linkAnchor = DoxiaUtils.encodeId( id, true );
690 
691             String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
692             logMessage( "modifiedLink", msg );
693 
694             return linkAnchor;
695         }
696 
697         return id;
698     }
699 
700     /** {@inheritDoc} */
701     protected void init()
702     {
703         super.init();
704 
705         this.scriptBlock = false;
706         this.isLink = false;
707         this.isAnchor = false;
708         this.orderedListDepth = 0;
709         this.sectionLevel = 0;
710         this.inVerbatim = false;
711         this.inFigure = false;
712         while ( this.decoration.getAttributeNames().hasMoreElements() )
713         {
714             this.decoration.removeAttribute( this.decoration.getAttributeNames().nextElement() );
715         }
716         this.warnMessages = null;
717     }
718 
719     private void handleAEnd( Sink sink )
720     {
721         if ( isLink )
722         {
723             sink.link_();
724             isLink = false;
725         }
726         else if ( isAnchor )
727         {
728             sink.anchor_();
729             isAnchor = false;
730         }
731     }
732 
733     private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
734     {
735         String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
736 
737         if ( href != null )
738         {
739             int hashIndex = href.indexOf( "#" );
740             if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
741             {
742                 String hash = href.substring( hashIndex + 1 );
743 
744                 if ( !DoxiaUtils.isValidId( hash ) )
745                 {
746                     href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
747 
748                     String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
749                     logMessage( "modifiedLink", msg );
750                 }
751             }
752             sink.link( href, attribs );
753             isLink = true;
754         }
755         else
756         {
757             String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
758 
759             if ( name != null )
760             {
761                 sink.anchor( validAnchor( name ), attribs );
762                 isAnchor = true;
763             }
764             else
765             {
766                 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
767                 if ( id != null )
768                 {
769                     sink.anchor( validAnchor( id ), attribs );
770                     isAnchor = true;
771                 }
772             }
773         }
774     }
775 
776     private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
777     {
778         boolean visited = true;
779 
780         String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
781 
782         if ( "figure".equals( divclass ) )
783         {
784             this.inFigure = true;
785             SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
786             atts.removeAttribute( SinkEventAttributes.CLASS );
787             sink.figure( atts );
788         }
789         else
790         {
791             visited = false;
792         }
793 
794         return visited;
795     }
796 
797     private void handleFigureCaptionEnd( Sink sink )
798     {
799         if ( inFigure )
800         {
801             sink.figureCaption_();
802         }
803         else
804         {
805             sink.italic_();
806         }
807     }
808 
809     private void handleFigureCaptionStart( Sink sink, SinkEventAttributeSet attribs )
810     {
811         if ( inFigure )
812         {
813             sink.figureCaption( attribs );
814         }
815         else
816         {
817             sink.italic();
818         }
819     }
820 
821     private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
822     {
823         String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
824 
825         if ( src != null )
826         {
827             sink.figureGraphics( src, attribs );
828         }
829     }
830 
831     private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
832     {
833         if ( orderedListDepth == 0 )
834         {
835             sink.listItem( attribs );
836         }
837         else
838         {
839             sink.numberedListItem( attribs );
840         }
841     }
842 
843     private void handleListItemEnd( Sink sink )
844     {
845         if ( orderedListDepth == 0 )
846         {
847             sink.listItem_();
848         }
849         else
850         {
851             sink.numberedListItem_();
852         }
853     }
854 
855     private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
856     {
857         int numbering = Sink.NUMBERING_DECIMAL;
858         // this will have to be generalized if we handle styles
859         String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
860 
861         if ( style != null )
862         {
863             if ( "list-style-type: upper-alpha".equals( style ) )
864             {
865                 numbering = Sink.NUMBERING_UPPER_ALPHA;
866             }
867             else if ( "list-style-type: lower-alpha".equals( style ) )
868             {
869                 numbering = Sink.NUMBERING_LOWER_ALPHA;
870             }
871             else if ( "list-style-type: upper-roman".equals( style ) )
872             {
873                 numbering = Sink.NUMBERING_UPPER_ROMAN;
874             }
875             else if ( "list-style-type: lower-roman".equals( style ) )
876             {
877                 numbering = Sink.NUMBERING_LOWER_ROMAN;
878             }
879             else if ( "list-style-type: decimal".equals( style ) )
880             {
881                 numbering = Sink.NUMBERING_DECIMAL;
882             }
883         }
884 
885         sink.numberedList( numbering, attribs );
886         orderedListDepth++;
887     }
888 
889     private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
890     {
891         if ( !inFigure )
892         {
893             sink.paragraph( attribs );
894         }
895     }
896 
897     /*
898      * The PRE element tells visual user agents that the enclosed text is
899      * "preformatted". When handling preformatted text, visual user agents:
900      * - May leave white space intact.
901      * - May render text with a fixed-pitch font.
902      * - May disable automatic word wrap.
903      * - Must not disable bidirectional processing.
904      * Non-visual user agents are not required to respect extra white space
905      * in the content of a PRE element.
906      */
907     private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
908     {
909         verbatim();
910         attribs.removeAttribute( SinkEventAttributes.DECORATION );
911         sink.verbatim( attribs );
912     }
913 
    /**
     * Handles the start of a heading element (&lt;h2&gt; to &lt;h6&gt;): brings the section nesting
     * in line with the new level, then opens the section and its title.
     *
     * @param sink the sink to receive the events.
     * @param level the section level of the heading.
     * @param attribs the attributes of the start tag, passed to both the section and the title event.
     */
    private void handleSectionStart( Sink sink, int level, SinkEventAttributeSet attribs )
    {
        // close deeper/sibling sections and open skipped levels before the new section starts
        consecutiveSections( level, sink );
        sink.section( level, attribs );
        sink.sectionTitle( level, attribs );
    }
920 
921     private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
922     {
923         sink.table( attribs );
924         String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
925         boolean grid = true;
926 
927         if ( border == null || "0".equals( border ) )
928         {
929             grid = false;
930         }
931 
932         String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
933         int[] justif = {Sink.JUSTIFY_LEFT};
934 
935         if ( "center".equals( align ) )
936         {
937             justif[0] = Sink.JUSTIFY_CENTER;
938         }
939         else if ( "right".equals( align ) )
940         {
941             justif[0] = Sink.JUSTIFY_RIGHT;
942         }
943 
944         sink.tableRows( justif, grid );
945     }
946 
947     /**
948      * If debug mode is enabled, log the <code>msg</code> as is, otherwise add unique msg in <code>warnMessages</code>.
949      *
950      * @param key not null
951      * @param msg not null
952      * @see #parse(Reader, Sink)
953      * @since 1.1.1
954      */
955     private void logMessage( String key, String msg )
956     {
957         msg = "[XHTML Parser] " + msg;
958         if ( getLog().isDebugEnabled() )
959         {
960             getLog().debug( msg );
961 
962             return;
963         }
964 
965         if ( warnMessages == null )
966         {
967             warnMessages = new HashMap<String, Set<String>>();
968         }
969 
970         Set<String> set = warnMessages.get( key );
971         if ( set == null )
972         {
973             set = new TreeSet<String>();
974         }
975         set.add( msg );
976         warnMessages.put( key, set );
977     }
978 
979     /**
980      * @since 1.1.1
981      */
982     private void logWarnings()
983     {
984         if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
985         {
986             for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() )
987             {
988                 for ( String msg : entry.getValue() )
989                 {
990                     getLog().warn( msg );
991                 }
992             }
993 
994             this.warnMessages = null;
995         }
996     }
997 }