View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.tools.plugin.generator;
20  
21  import org.codehaus.plexus.util.StringUtils;
22  import org.jsoup.Jsoup;
23  import org.jsoup.internal.StringUtil;
24  import org.jsoup.nodes.Document;
25  import org.jsoup.nodes.Element;
26  import org.jsoup.nodes.Node;
27  import org.jsoup.nodes.TextNode;
28  import org.jsoup.select.NodeTraversor;
29  import org.jsoup.select.NodeVisitor;
30  
31  /**
32   * Replaces (X)HTML content by plain text equivalent.
33   * Based on work from
34   * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java">
35   * JSoup Example: HtmlToPlainText</a>.
36   */
37  public class HtmlToPlainTextConverter implements Converter {
38      @Override
39      public String convert(String text) {
40          if (StringUtils.isBlank(text)) {
41              return text;
42          }
43          Document document = Jsoup.parse(text);
44          return getPlainText(document);
45      }
46  
47      /**
48       * Format an Element to plain-text
49       *
50       * @param element the root element to format
51       * @return formatted text
52       */
53      private String getPlainText(Element element) {
54          FormattingVisitor formatter = new FormattingVisitor();
55          NodeTraversor.traverse(formatter, element); // walk the DOM, and call .head() and .tail() for each node
56  
57          return formatter.toString();
58      }
59  
60      // the formatting rules, implemented in a breadth-first DOM traverse
61      private static class FormattingVisitor implements NodeVisitor {
62          private StringBuilder accum = new StringBuilder(); // holds the accumulated text
63  
64          // hit when the node is first seen
65          public void head(Node node, int depth) {
66              String name = node.nodeName();
67              if (node instanceof TextNode) {
68                  accum.append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
69              } else if (name.equals("li")) {
70                  accum.append("\n * ");
71              } else if (name.equals("dt")) {
72                  accum.append("  ");
73              } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
74                  accum.append("\n");
75              }
76          }
77  
78          // hit when all of the node's children (if any) have been visited
79          public void tail(Node node, int depth) {
80              String name = node.nodeName();
81              if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) {
82                  accum.append("\n");
83              } else if (name.equals("a")) {
84                  // link is empty if it cannot be made absolute
85                  String link = node.absUrl("href");
86                  if (!link.isEmpty()) {
87                      accum.append(String.format(" <%s>", link));
88                  }
89              }
90          }
91  
92          @Override
93          public String toString() {
94              // collate multiple consecutive spaces
95              return accum.toString().replaceAll(" +", " ").replace("\n ", "\n");
96          }
97      }
98  }