001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.maven.tools.plugin.generator;
020
021import org.codehaus.plexus.util.StringUtils;
022import org.jsoup.Jsoup;
023import org.jsoup.internal.StringUtil;
024import org.jsoup.nodes.Document;
025import org.jsoup.nodes.Element;
026import org.jsoup.nodes.Node;
027import org.jsoup.nodes.TextNode;
028import org.jsoup.select.NodeTraversor;
029import org.jsoup.select.NodeVisitor;
030
031/**
032 * Replaces (X)HTML content by plain text equivalent.
033 * Based on work from
034 * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java">
035 * JSoup Example: HtmlToPlainText</a>.
036 */
037public class HtmlToPlainTextConverter implements Converter {
038    @Override
039    public String convert(String text) {
040        if (StringUtils.isBlank(text)) {
041            return text;
042        }
043        Document document = Jsoup.parse(text);
044        return getPlainText(document);
045    }
046
047    /**
048     * Format an Element to plain-text
049     *
050     * @param element the root element to format
051     * @return formatted text
052     */
053    private String getPlainText(Element element) {
054        FormattingVisitor formatter = new FormattingVisitor();
055        NodeTraversor.traverse(formatter, element); // walk the DOM, and call .head() and .tail() for each node
056
057        return formatter.toString();
058    }
059
060    // the formatting rules, implemented in a breadth-first DOM traverse
061    private static class FormattingVisitor implements NodeVisitor {
062        private StringBuilder accum = new StringBuilder(); // holds the accumulated text
063
064        // hit when the node is first seen
065        public void head(Node node, int depth) {
066            String name = node.nodeName();
067            if (node instanceof TextNode) {
068                accum.append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM.
069            } else if (name.equals("li")) {
070                accum.append("\n * ");
071            } else if (name.equals("dt")) {
072                accum.append("  ");
073            } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
074                accum.append("\n");
075            }
076        }
077
078        // hit when all of the node's children (if any) have been visited
079        public void tail(Node node, int depth) {
080            String name = node.nodeName();
081            if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) {
082                accum.append("\n");
083            } else if (name.equals("a")) {
084                // link is empty if it cannot be made absolute
085                String link = node.absUrl("href");
086                if (!link.isEmpty()) {
087                    accum.append(String.format(" <%s>", link));
088                }
089            }
090        }
091
092        @Override
093        public String toString() {
094            // collate multiple consecutive spaces
095            return accum.toString().replaceAll(" +", " ").replace("\n ", "\n");
096        }
097    }
098}