1 package org.apache.maven.tools.plugin.generator;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22 import org.codehaus.plexus.util.StringUtils;
23 import org.jsoup.Jsoup;
24 import org.jsoup.internal.StringUtil;
25 import org.jsoup.nodes.Document;
26 import org.jsoup.nodes.Element;
27 import org.jsoup.nodes.Node;
28 import org.jsoup.nodes.TextNode;
29 import org.jsoup.select.NodeTraversor;
30 import org.jsoup.select.NodeVisitor;
31
32 /**
33 * Replaces (X)HTML content by plain text equivalent.
34 * Based on work from
35 * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java">
36 * JSoup Example: HtmlToPlainText</a>.
37 */
38 public class HtmlToPlainTextConverter implements Converter
39 {
40 @Override
41 public String convert( String text )
42 {
43 if ( StringUtils.isBlank( text ) )
44 {
45 return text;
46 }
47 Document document = Jsoup.parse( text );
48 return getPlainText( document );
49 }
50
51 /**
52 * Format an Element to plain-text
53 *
54 * @param element the root element to format
55 * @return formatted text
56 */
57 private String getPlainText( Element element )
58 {
59 FormattingVisitor formatter = new FormattingVisitor();
60 NodeTraversor.traverse( formatter, element ); // walk the DOM, and call .head() and .tail() for each node
61
62 return formatter.toString();
63 }
64
65 // the formatting rules, implemented in a breadth-first DOM traverse
66 private static class FormattingVisitor
67 implements NodeVisitor
68 {
69 private StringBuilder accum = new StringBuilder(); // holds the accumulated text
70
71 // hit when the node is first seen
72 public void head( Node node, int depth )
73 {
74 String name = node.nodeName();
75 if ( node instanceof TextNode )
76 {
77 accum.append( ( (TextNode) node ).text() ); // TextNodes carry all user-readable text in the DOM.
78 }
79 else if ( name.equals( "li" ) )
80 {
81 accum.append( "\n * " );
82 }
83 else if ( name.equals( "dt" ) )
84 {
85 accum.append( " " );
86 }
87 else if ( StringUtil.in( name, "p", "h1", "h2", "h3", "h4", "h5", "tr" ) )
88 {
89 accum.append( "\n" );
90 }
91 }
92
93 // hit when all of the node's children (if any) have been visited
94 public void tail( Node node, int depth )
95 {
96 String name = node.nodeName();
97 if ( StringUtil.in( name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5" ) )
98 {
99 accum.append( "\n" );
100 }
101 else if ( name.equals( "a" ) )
102 {
103 // link is empty if it cannot be made absolute
104 String link = node.absUrl( "href" );
105 if ( !link.isEmpty() )
106 {
107 accum.append( String.format( " <%s>", link ) );
108 }
109 }
110 }
111
112 @Override
113 public String toString()
114 {
115 // collate multiple consecutive spaces
116 return accum.toString().replaceAll( " +", " " ).replace( "\n ", "\n" );
117 }
118 }
119 }