1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.maven.tools.plugin.generator;
20
21 import org.codehaus.plexus.util.StringUtils;
22 import org.jsoup.Jsoup;
23 import org.jsoup.internal.StringUtil;
24 import org.jsoup.nodes.Document;
25 import org.jsoup.nodes.Element;
26 import org.jsoup.nodes.Node;
27 import org.jsoup.nodes.TextNode;
28 import org.jsoup.select.NodeTraversor;
29 import org.jsoup.select.NodeVisitor;
30
31 /**
32 * Replaces (X)HTML content by plain text equivalent.
33 * Based on work from
34 * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java">
35 * JSoup Example: HtmlToPlainText</a>.
36 */
37 public class HtmlToPlainTextConverter implements Converter {
38 @Override
39 public String convert(String text) {
40 if (StringUtils.isBlank(text)) {
41 return text;
42 }
43 Document document = Jsoup.parse(text);
44 return getPlainText(document);
45 }
46
47 /**
48 * Format an Element to plain-text
49 *
50 * @param element the root element to format
51 * @return formatted text
52 */
53 private String getPlainText(Element element) {
54 FormattingVisitor formatter = new FormattingVisitor();
55 NodeTraversor.traverse(formatter, element); // walk the DOM, and call .head() and .tail() for each node
56
57 return formatter.toString();
58 }
59
60 // the formatting rules, implemented in a breadth-first DOM traverse
61 private static class FormattingVisitor implements NodeVisitor {
62 private StringBuilder accum = new StringBuilder(); // holds the accumulated text
63
64 // hit when the node is first seen
65 public void head(Node node, int depth) {
66 String name = node.nodeName();
67 if (node instanceof TextNode) {
68 accum.append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
69 } else if (name.equals("li")) {
70 accum.append("\n * ");
71 } else if (name.equals("dt")) {
72 accum.append(" ");
73 } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
74 accum.append("\n");
75 }
76 }
77
78 // hit when all of the node's children (if any) have been visited
79 public void tail(Node node, int depth) {
80 String name = node.nodeName();
81 if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) {
82 accum.append("\n");
83 } else if (name.equals("a")) {
84 // link is empty if it cannot be made absolute
85 String link = node.absUrl("href");
86 if (!link.isEmpty()) {
87 accum.append(String.format(" <%s>", link));
88 }
89 }
90 }
91
92 @Override
93 public String toString() {
94 // collate multiple consecutive spaces
95 return accum.toString().replaceAll(" +", " ").replace("\n ", "\n");
96 }
97 }
98 }