1 package org.apache.maven.html2xdoc;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 import java.util.Iterator;
20 import java.util.LinkedList;
21 import java.util.List;
22
23 import org.apache.commons.logging.Log;
24 import org.apache.commons.logging.LogFactory;
25 import org.dom4j.CharacterData;
26 import org.dom4j.Comment;
27 import org.dom4j.Document;
28 import org.dom4j.DocumentFactory;
29 import org.dom4j.Element;
30 import org.dom4j.Node;
31
32 /**
33 * A simple bean for converting a HTML document into an XDoc compliant XML
34 * document.
35 * This could be done via XSLT but is a little more complex than it might first
36 * appear so its done via Java code instead.
37 *
38 * @author <a href="mailto:jstrachan@apache.org">James Strachan</a>
39 */
40 public class Html2XdocBean
41 {
42 /** The Log to which logging calls will be made. */
43 private static final Log log = LogFactory.getLog( Html2XdocBean.class );
44
45 /**
46 * Used to create the output document
47 */
48 private DocumentFactory factory = new DocumentFactory( );
49
50 /**
51 * The current node to attach the sub-nodes.
52 */
53 private Element currentNode = null;
54
55 /**
56 * The current 'root' section node. This is used to keep
57 * track of the root section so that when a subsection is
58 * found it can be associated correctly.
59 */
60 private Element currentSectionNode = null;
61
62 /**
63 * The current section heading level. If a subsequent level
64 * lower or equal, then create a new section.
65 */
66 private int currentSectionHeadingLevel = Integer.MIN_VALUE;
67
68 /**
69 * The current paragraph node. This is used to associate text
70 * and formatting nodes to the current paragraph node.
71 */
72 private Element currentParaNode = null;
73
74 /**
75 * Converts the given HTML document into the corresponding XDoc format
76 * of XML
77 *
78 * @param html the input html document
79 * @return Document
80 */
81 public Document convert( Document html )
82 {
83 Document doc = factory.createDocument( );
84 Element root = doc.addElement( "document" );
85 Element properties = root.addElement( "properties" );
86 Element title = properties.addElement( "title" );
87
88 title.setText( html.valueOf( "/html/head/title" ) );
89
90 Element body = root.addElement( "body" );
91
92 Element htmlContent = (Element) html.selectSingleNode( "/html/body" );
93
94 if ( htmlContent == null )
95 {
96 log.info( "No body element found for HTML document: "
97 + html.asXML( ) );
98 }
99 else
100 {
101 addSections( body, htmlContent );
102 }
103
104 return doc;
105 }
106
107 /**
108 * Iterates thorugh the given body looking for h1, h2, h3 nodes and
109 * creating the associated section elements. Any text nodes
110 * contained inside the body are wrapped in a <p> element
111 *
112 * @param output the output destination
113 * @param body the block of HTML markup to convert
114 */
115 protected void addSections( Element output, Element body )
116 {
117 List content = getBodyContent( body.content( ) );
118
119 for ( Iterator iter = content.iterator( ); iter.hasNext( ); )
120 {
121 Node node = (Node) iter.next( );
122
123 if ( isHeading( node ) )
124 {
125 makeSection( output, node );
126 }
127 else
128 {
129 guaranteeHasSection( output );
130 processNode( node );
131 }
132 }
133 }
134
135 /**
136 * main algorithm which represents the iteration contract.
137 * Use the protected methods to change the behavior.
138 *
139 * @param node the node to process
140 */
141 private void processNode( Node node )
142 {
143 if ( isCharacterData( node ) )
144 {
145 addTextNode( node );
146 }
147 else if ( isTextFormatting( node ) )
148 {
149 addFormattingNode( node );
150 }
151 else
152 {
153 addNode( node );
154 }
155 }
156
157 /**
158 * Specifies whether the node is a text modifying construct that should be
159 * passed as is to the resultant html. Such as an anchor '<a>'.
160 *
161 * @param node the node to check
162 * @return true if the node is used to modify the formatting of the
163 * text; otherwise, false
164 */
165 protected boolean isTextFormatting( Node node )
166 {
167
168 return ( node.getName( ) != null ) && node.getName( ).equals( "a" );
169 }
170
171 /**
172 * Specifies whether the node is character data and should be passed as
173 * straight text to the resultant html.
174 *
175 * @param node the node to check
176 * @return true if the node is a text node; otherwise, false.
177 */
178 protected boolean isCharacterData( Node node )
179 {
180 return node instanceof CharacterData
181 && ( ( node instanceof Comment ) == false );
182 }
183
184 /**
185 * Specifies whether the node is a heading node.
186 *
187 * @param node the node to check
188 * @return true if the given node is a heading element
189 * (h1, h2, h3 etc); otherwise, false
190 */
191 protected boolean isHeading( Node node )
192 {
193 String name = node.getName( );
194
195 return ( name != null ) && name.startsWith( "h" );
196 }
197
198 /**
199 * Determines the heading level of the node.
200 *
201 * @param node the node to check
202 * @return the integer level of the heading
203 */
204 protected int determineHeadingLevel( Node node )
205 {
206 try
207 {
208 String name = node.getName( ).substring( 1 );
209
210 return Integer.parseInt( name );
211 }
212 catch ( NumberFormatException nfe )
213 {
214 return 1;
215 }
216 }
217
218 /**
219 * Creates a section or subsection as necessary based on the node
220 * for the output document.
221 *
222 * @param output the output document to attach the section
223 * @param node the node to base making a section on
224 */
225 protected void makeSection( Element output, Node node )
226 {
227 int level = determineHeadingLevel( node );
228
229 if ( needsNewSection( node ) )
230 {
231 currentNode = output.addElement( "section" );
232 currentSectionHeadingLevel = level;
233 currentSectionNode = currentNode;
234 }
235 else
236 {
237 currentNode = currentSectionNode.addElement( "subsection" );
238 }
239
240 currentNode.addAttribute( "name", getSectionText( node ) );
241 currentParaNode = null;
242 }
243
244 /**
245 * @return the section text for the given node. If the node
246 * contains an embedded element (such as an <a> element)
247 * then return its text
248 */
249 protected String getSectionText( Node node )
250 {
251 String text = node.getText( ).trim( );
252
253 if ( ( text.length( ) <= 0 ) && node instanceof Element )
254 {
255 Element element = (Element) node;
256
257
258 List childElements = element.elements( );
259
260 if ( !childElements.isEmpty( ) )
261 {
262 Node child = (Node) childElements.get( 0 );
263
264 return child.getText( );
265 }
266 }
267
268 return text;
269 }
270
271 /**
272 * Determines if a new section is needed which is based on whether
273 * the node's a heading level and equal to or less than the current
274 * section's heading level.
275 *
276 * @param node the node to check
277 * @return true if the current node's information means for a new
278 * section; otherwise, false
279 */
280 protected boolean needsNewSection( Node node )
281 {
282 int level = determineHeadingLevel( node );
283
284 return ( level <= currentSectionHeadingLevel )
285 || ( currentSectionNode == null );
286 }
287
288 /**
289 * Determines if a paragraph node is needed.
290 */
291 private void guaranteeHasParaNode( )
292 {
293 if ( currentParaNode == null )
294 {
295 currentParaNode = currentNode.addElement( "p" );
296 }
297 }
298
299 /**
300 * Makes sure the current node is section, if necessary.
301 * @param output the output element to add the section to
302 */
303 private void guaranteeHasSection( Element output )
304 {
305 if ( currentNode == null )
306 {
307
308
309 currentNode = output.addElement( "section" );
310 }
311 }
312
313 /**
314 * Add the node to the current node.
315 * @param node the node to add
316 */
317 private void addNode( Node node )
318 {
319 if ( ( currentParaNode != null ) && !shouldBreakPara( node ) )
320 {
321 currentParaNode.add( cloneNode( node ) );
322 }
323 else
324 {
325 currentNode.add( cloneNode( node ) );
326 currentParaNode = null;
327 }
328 }
329
330 /**
331 * @return true if the paragraph should be split, such as for a br or p
332 * tag
333 */
334 protected boolean shouldBreakPara( Node node )
335 {
336 String name = node.getName( );
337
338 return "p".equals( name ) || "br".equals( name );
339 }
340
341 /**
342 * Adds the text of the node to the current paragraph.
343 * @param node the node to add
344 */
345 private void addTextNode( Node node )
346 {
347 guaranteeHasParaNode( );
348 currentParaNode.addText( node.getText( ) );
349 }
350
351 /**
352 * Adds the node to the current paragraph.
353 * @param node the node to add
354 */
355 private void addFormattingNode( Node node )
356 {
357 guaranteeHasParaNode( );
358 currentParaNode.add( cloneNode( node ) );
359 }
360
361 /**
362 * Returns a copy of the body content, removing any whitespace from
363 * the beginning and end.
364 *
365 * @param content the content node list to obtain body content from
366 * @return List
367 */
368 protected List getBodyContent( List content )
369 {
370
371 Element lastPre = null;
372 LinkedList list = new LinkedList( );
373 boolean lastWasElement = true;
374
375 for ( Iterator iter = content.iterator( ); iter.hasNext( ); )
376 {
377 Node node = (Node) iter.next( );
378
379 if ( isPre( node ) )
380 {
381 if ( lastPre == null )
382 {
383 lastPre = factory.createElement( "source" );
384 list.add( lastPre );
385 }
386
387 lastPre.addText( node.getText( ) );
388 }
389 else
390 {
391 if ( isWhitespace( node ) && lastWasElement )
392 {
393 if ( lastPre != null )
394 {
395 lastPre.addText( node.getText( ) );
396 }
397 }
398 else
399 {
400 lastWasElement = node instanceof Element;
401
402 if ( lastWasElement )
403 {
404 lastPre = null;
405 }
406
407 list.add( node );
408 }
409 }
410 }
411
412 if ( list.size( ) == 0 )
413 {
414 return list;
415 }
416
417
418 while ( true )
419 {
420 Node node = (Node) list.getFirst( );
421
422 if ( isWhitespace( node ) )
423 {
424 list.removeFirst( );
425
426 continue;
427 }
428
429 break;
430 }
431
432 while ( true )
433 {
434 Node node = (Node) list.getLast( );
435
436 if ( isWhitespace( node ) )
437 {
438 list.removeLast( );
439
440 continue;
441 }
442
443 break;
444 }
445
446 return list;
447 }
448
449 /**
450 * @param node the node to check
451 * @return true if the node is a pre tag; otherwise false.
452 */
453 protected boolean isPre( Node node )
454 {
455 if ( node instanceof Element )
456 {
457 Element element = (Element) node;
458
459 return element.getName( ).equals( "pre" );
460 }
461
462 return false;
463 }
464
465 /**
466 * @param node the node to check
467 * @return true if the given node is a whitespace text node
468 */
469 protected boolean isWhitespace( Node node )
470 {
471 if ( node instanceof CharacterData )
472 {
473 String text = node.getText( );
474
475 return text.trim( ).length( ) <= 0;
476 }
477
478
479
480
481
482
483
484
485
486
487
488 return false;
489 }
490
491 /**
492 * Normalizes the whitespace of any Elements
493 *
494 * @param node the node to clone
495 * @return Node the cloned node
496 */
497 protected Node cloneNode( Node node )
498 {
499 Node answer = (Node) node.clone( );
500
501 if ( answer instanceof Element )
502 {
503 Element element = (Element) answer;
504
505 element.normalize( );
506 }
507
508 return answer;
509 }
510 }