View Javadoc

1   package org.apache.maven.html2xdoc;
2   
3   /* ====================================================================
4    *   Copyright 2001-2004 The Apache Software Foundation.
5    *
6    *   Licensed under the Apache License, Version 2.0 (the "License");
7    *   you may not use this file except in compliance with the License.
8    *   You may obtain a copy of the License at
9    *
10   *       http://www.apache.org/licenses/LICENSE-2.0
11   *
12   *   Unless required by applicable law or agreed to in writing, software
13   *   distributed under the License is distributed on an "AS IS" BASIS,
14   *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   *   See the License for the specific language governing permissions and
16   *   limitations under the License.
17   * ====================================================================
18   */
19  import java.util.Iterator;
20  import java.util.LinkedList;
21  import java.util.List;
22  
23  import org.apache.commons.logging.Log;
24  import org.apache.commons.logging.LogFactory;
25  import org.dom4j.CharacterData;
26  import org.dom4j.Comment;
27  import org.dom4j.Document;
28  import org.dom4j.DocumentFactory;
29  import org.dom4j.Element;
30  import org.dom4j.Node;
31  
32  /**
33   * A simple bean for converting a HTML document into an XDoc compliant XML
34   * document.
35   * This could be done via XSLT but is a little more complex than it might first
36   * appear so its done via Java code instead.
37   *
38   * @author <a href="mailto:jstrachan@apache.org">James Strachan</a>
39   */
40  public class Html2XdocBean
41  {
42      /** The Log to which logging calls will be made. */
43      private static final Log log = LogFactory.getLog( Html2XdocBean.class );
44  
45      /**
46       * Used to create the output document
47       */
48      private DocumentFactory factory = new DocumentFactory(  );
49  
50      /**
51       * The current node to attach the sub-nodes.
52       */
53      private Element currentNode = null;
54  
55      /**
56       * The current 'root' section node. This is used to keep
57       * track of the root section so that when a subsection is
58       * found it can be associated correctly.
59       */
60      private Element currentSectionNode = null;
61  
62      /**
63       * The current section heading level. If a subsequent level
64       * lower or equal, then create a new section.
65       */
66      private int currentSectionHeadingLevel = Integer.MIN_VALUE;
67  
68      /**
69       * The current paragraph node. This is used to associate text
70       * and formatting nodes to the current paragraph node.
71       */
72      private Element currentParaNode = null;
73  
74      /**
75       * Converts the given HTML document into the corresponding XDoc format
76       * of XML
77       *
78       * @param html the input html document
79       * @return Document
80       */
81      public Document convert( Document html )
82      {
83          Document doc = factory.createDocument(  );
84          Element root = doc.addElement( "document" );
85          Element properties = root.addElement( "properties" );
86          Element title = properties.addElement( "title" );
87  
88          title.setText( html.valueOf( "/html/head/title" ) );
89  
90          Element body = root.addElement( "body" );
91  
92          Element htmlContent = (Element) html.selectSingleNode( "/html/body" );
93  
94          if ( htmlContent == null )
95          {
96              log.info( "No body element found for HTML document: "
97                  + html.asXML(  ) );
98          }
99          else
100         {
101             addSections( body, htmlContent );
102         }
103 
104         return doc;
105     }
106 
107     /**
108      * Iterates thorugh the given body looking for h1, h2, h3 nodes and
109      * creating the associated section elements. Any text nodes
110      * contained inside the body are wrapped in a &lt;p&gt; element
111      *
112      * @param output the output destination
113      * @param body the block of HTML markup to convert
114      */
115     protected void addSections( Element output, Element body )
116     {
117         List content = getBodyContent( body.content(  ) );
118 
119         for ( Iterator iter = content.iterator(  ); iter.hasNext(  ); )
120         {
121             Node node = (Node) iter.next(  );
122 
123             if ( isHeading( node ) )
124             {
125                 makeSection( output, node );
126             }
127             else
128             {
129                 guaranteeHasSection( output );
130                 processNode( node );
131             }
132         }
133     }
134 
135     /**
136      * main algorithm which represents the iteration contract.
137      * Use the protected methods to change the behavior.
138      *
139      * @param node the node to process
140      */
141     private void processNode( Node node )
142     {
143         if ( isCharacterData( node ) )
144         {
145             addTextNode( node );
146         }
147         else if ( isTextFormatting( node ) )
148         {
149             addFormattingNode( node );
150         }
151         else
152         {
153             addNode( node );
154         }
155     }
156 
157     /**
158      * Specifies whether the node is a text modifying construct that should be
159      * passed as is to the resultant html. Such as an anchor '&lt;a&gt;'.
160      *
161      * @param node the node to check
162      * @return true if the node is used to modify the formatting of the
163      *         text; otherwise, false
164      */
165     protected boolean isTextFormatting( Node node )
166     {
167         // Ultimately this needs bold, italic, and so on
168         return ( node.getName(  ) != null ) && node.getName(  ).equals( "a" );
169     }
170 
171     /**
172      * Specifies whether the node is character data and should be passed as
173      * straight text to the resultant html.
174      *
175      * @param node the node to check
176      * @return true if the node is a text node; otherwise, false.
177      */
178     protected boolean isCharacterData( Node node )
179     {
180         return node instanceof CharacterData
181         && ( ( node instanceof Comment ) == false );
182     }
183 
184     /**
185      * Specifies whether the node is a heading node.
186      *
187      * @param node the node to check
188      * @return true if the given node is a heading element
189      *         (h1, h2, h3 etc); otherwise, false
190      */
191     protected boolean isHeading( Node node )
192     {
193         String name = node.getName(  );
194 
195         return ( name != null ) && name.startsWith( "h" );
196     }
197 
198     /**
199      * Determines the heading level of the node.
200      *
201      * @param node the node to check
202      * @return the integer level of the heading
203      */
204     protected int determineHeadingLevel( Node node )
205     {
206         try
207         {
208             String name = node.getName(  ).substring( 1 );
209 
210             return Integer.parseInt( name );
211         }
212         catch ( NumberFormatException nfe )
213         {
214             return 1;
215         }
216     }
217 
218     /**
219      * Creates a section or subsection as necessary based on the node
220      * for the output document.
221      *
222      * @param output the output document to attach the section
223      * @param node the node to base making a section on
224      */
225     protected void makeSection( Element output, Node node )
226     {
227         int level = determineHeadingLevel( node );
228 
229         if ( needsNewSection( node ) )
230         {
231             currentNode = output.addElement( "section" );
232             currentSectionHeadingLevel = level;
233             currentSectionNode = currentNode;
234         }
235         else
236         {
237             currentNode = currentSectionNode.addElement( "subsection" );
238         }
239 
240         currentNode.addAttribute( "name", getSectionText( node ) );
241         currentParaNode = null;
242     }
243 
244     /**
245      * @return the section text for the given node. If the node
246      * contains an embedded element (such as an &lt;a&gt; element)
247      * then return its text
248      */
249     protected String getSectionText( Node node )
250     {
251         String text = node.getText(  ).trim(  );
252 
253         if ( ( text.length(  ) <= 0 ) && node instanceof Element )
254         {
255             Element element = (Element) node;
256 
257             // maybe we contain a hypertext link
258             List childElements = element.elements(  );
259 
260             if ( !childElements.isEmpty(  ) )
261             {
262                 Node child = (Node) childElements.get( 0 );
263 
264                 return child.getText(  );
265             }
266         }
267 
268         return text;
269     }
270 
271     /**
272      * Determines if a new section is needed which is based on whether
273      * the node's a heading level and equal to or less than the current
274      * section's heading level.
275      *
276      * @param node the node to check
277      * @return true if the current node's information means for a new
278      *         section; otherwise, false
279      */
280     protected boolean needsNewSection( Node node )
281     {
282         int level = determineHeadingLevel( node );
283 
284         return ( level <= currentSectionHeadingLevel )
285         || ( currentSectionNode == null );
286     }
287 
288     /**
289      * Determines if a paragraph node is needed.
290      */
291     private void guaranteeHasParaNode(  )
292     {
293         if ( currentParaNode == null )
294         {
295             currentParaNode = currentNode.addElement( "p" );
296         }
297     }
298 
299     /**
300      * Makes sure the current node is section, if necessary.
301      * @param output the output element to add the section to
302      */
303     private void guaranteeHasSection( Element output )
304     {
305         if ( currentNode == null )
306         {
307             // we have a section with no name
308             // should we default it to be the same as the document title?
309             currentNode = output.addElement( "section" );
310         }
311     }
312 
313     /**
314      * Add the node to the current node.
315      * @param node the node to add
316      */
317     private void addNode( Node node )
318     {
319         if ( ( currentParaNode != null ) && !shouldBreakPara( node ) )
320         {
321             currentParaNode.add( cloneNode( node ) );
322         }
323         else
324         {
325             currentNode.add( cloneNode( node ) );
326             currentParaNode = null;
327         }
328     }
329 
330     /**
331      * @return true if the paragraph should be split, such as for a br or p
332      * tag
333      */
334     protected boolean shouldBreakPara( Node node )
335     {
336         String name = node.getName(  );
337 
338         return "p".equals( name ) || "br".equals( name );
339     }
340 
341     /**
342      * Adds the text of the node to the current paragraph.
343      * @param node the node to add
344      */
345     private void addTextNode( Node node )
346     {
347         guaranteeHasParaNode(  );
348         currentParaNode.addText( node.getText(  ) );
349     }
350 
351     /**
352      * Adds the node to the current paragraph.
353      * @param node the node to add
354      */
355     private void addFormattingNode( Node node )
356     {
357         guaranteeHasParaNode(  );
358         currentParaNode.add( cloneNode( node ) );
359     }
360 
361     /**
362      * Returns a copy of the body content, removing any whitespace from
363      * the beginning and end.
364      *
365      * @param content the content node list to obtain body content from
366      * @return List
367      */
368     protected List getBodyContent( List content )
369     {
370         // lets turn <pre> into <source> and concatenate consective entries
371         Element lastPre = null;
372         LinkedList list = new LinkedList(  );
373         boolean lastWasElement = true;
374 
375         for ( Iterator iter = content.iterator(  ); iter.hasNext(  ); )
376         {
377             Node node = (Node) iter.next(  );
378 
379             if ( isPre( node ) )
380             {
381                 if ( lastPre == null )
382                 {
383                     lastPre = factory.createElement( "source" );
384                     list.add( lastPre );
385                 }
386 
387                 lastPre.addText( node.getText(  ) );
388             }
389             else
390             {
391                 if ( isWhitespace( node ) && lastWasElement )
392                 {
393                     if ( lastPre != null )
394                     {
395                         lastPre.addText( node.getText(  ) );
396                     }
397                 }
398                 else
399                 {
400                     lastWasElement = node instanceof Element;
401 
402                     if ( lastWasElement )
403                     {
404                         lastPre = null;
405                     }
406 
407                     list.add( node );
408                 }
409             }
410         }
411 
412         if ( list.size(  ) == 0 )
413         {
414             return list;
415         }
416 
417         // now lets remove any whitespace text nodes at the beginning and end
418         while ( true )
419         {
420             Node node = (Node) list.getFirst(  );
421 
422             if ( isWhitespace( node ) )
423             {
424                 list.removeFirst(  );
425 
426                 continue;
427             }
428 
429             break;
430         }
431 
432         while ( true )
433         {
434             Node node = (Node) list.getLast(  );
435 
436             if ( isWhitespace( node ) )
437             {
438                 list.removeLast(  );
439 
440                 continue;
441             }
442 
443             break;
444         }
445 
446         return list;
447     }
448 
449     /**
450      * @param node the node to check
451      * @return true if the node is a pre tag; otherwise false.
452      */
453     protected boolean isPre( Node node )
454     {
455         if ( node instanceof Element )
456         {
457             Element element = (Element) node;
458 
459             return element.getName(  ).equals( "pre" );
460         }
461 
462         return false;
463     }
464 
465     /**
466      * @param node the node to check
467      * @return true if the given node is a whitespace text node
468      */
469     protected boolean isWhitespace( Node node )
470     {
471         if ( node instanceof CharacterData )
472         {
473             String text = node.getText(  );
474 
475             return text.trim(  ).length(  ) <= 0;
476         }
477 
478         //        if (node instanceof Element) {
479         //            String name = node.getName();
480         //            if (name.equals("p")) {
481         //                String text = node.getText();
482         //                return text.trim().length() <= 0;
483         //            }
484         //            if (name.equals("br")) {
485         //                return true;
486         //            }
487         //        }
488         return false;
489     }
490 
491     /**
492      * Normalizes the whitespace of any Elements
493      *
494      * @param node the node to clone
495      * @return Node the cloned node
496      */
497     protected Node cloneNode( Node node )
498     {
499         Node answer = (Node) node.clone(  );
500 
501         if ( answer instanceof Element )
502         {
503             Element element = (Element) answer;
504 
505             element.normalize(  );
506         }
507 
508         return answer;
509     }
510 }