001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.maven.tools.plugin.generator; 020 021import org.codehaus.plexus.util.StringUtils; 022import org.jsoup.Jsoup; 023import org.jsoup.internal.StringUtil; 024import org.jsoup.nodes.Document; 025import org.jsoup.nodes.Element; 026import org.jsoup.nodes.Node; 027import org.jsoup.nodes.TextNode; 028import org.jsoup.select.NodeTraversor; 029import org.jsoup.select.NodeVisitor; 030 031/** 032 * Replaces (X)HTML content by plain text equivalent. 033 * Based on work from 034 * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java"> 035 * JSoup Example: HtmlToPlainText</a>. 036 */ 037public class HtmlToPlainTextConverter implements Converter { 038 @Override 039 public String convert(String text) { 040 if (StringUtils.isBlank(text)) { 041 return text; 042 } 043 Document document = Jsoup.parse(text); 044 return getPlainText(document); 045 } 046 047 /** 048 * Format an Element to plain-text 049 * 050 * @param element the root element to format 051 * @return formatted text 052 */ 053 private String getPlainText(Element element) { 054 FormattingVisitor formatter = new FormattingVisitor(); 055 NodeTraversor.traverse(formatter, element); // walk the DOM, and call .head() and .tail() for each node 056 057 return formatter.toString(); 058 } 059 060 // the formatting rules, implemented in a breadth-first DOM traverse 061 private static class FormattingVisitor implements NodeVisitor { 062 private StringBuilder accum = new StringBuilder(); // holds the accumulated text 063 064 // hit when the node is first seen 065 public void head(Node node, int depth) { 066 String name = node.nodeName(); 067 if (node instanceof TextNode) { 068 accum.append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. 069 } else if (name.equals("li")) { 070 accum.append("\n * "); 071 } else if (name.equals("dt")) { 072 accum.append(" "); 073 } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) { 074 accum.append("\n"); 075 } 076 } 077 078 // hit when all of the node's children (if any) have been visited 079 public void tail(Node node, int depth) { 080 String name = node.nodeName(); 081 if (StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5")) { 082 accum.append("\n"); 083 } else if (name.equals("a")) { 084 // link is empty if it cannot be made absolute 085 String link = node.absUrl("href"); 086 if (!link.isEmpty()) { 087 accum.append(String.format(" <%s>", link)); 088 } 089 } 090 } 091 092 @Override 093 public String toString() { 094 // collate multiple consecutive spaces 095 return accum.toString().replaceAll(" +", " ").replace("\n ", "\n"); 096 } 097 } 098}