/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
19  package org.apache.maven.doxia.util;
20  
21  import javax.swing.text.html.HTML.Tag;
22  
23  import java.nio.charset.StandardCharsets;
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  
29  import org.apache.commons.lang3.StringUtils;
30  import org.apache.commons.text.StringEscapeUtils;
31  import org.apache.maven.doxia.markup.HtmlMarkup;
32  
33  /**
34   * The <code>HtmlTools</code> class defines methods to HTML handling.
35   *
36   * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
37   * @since 1.0
38   */
39  public class HtmlTools {
40      private static final Tag[] ALL_TAGS = {
41          HtmlMarkup.A,
42          HtmlMarkup.ABBR,
43          HtmlMarkup.ADDRESS,
44          HtmlMarkup.AREA,
45          HtmlMarkup.ARTICLE,
46          HtmlMarkup.ASIDE,
47          HtmlMarkup.AUDIO,
48          HtmlMarkup.B,
49          HtmlMarkup.BASE,
50          HtmlMarkup.BDI,
51          HtmlMarkup.BDO,
52          HtmlMarkup.BLOCKQUOTE,
53          HtmlMarkup.BODY,
54          HtmlMarkup.BR,
55          HtmlMarkup.BUTTON,
56          HtmlMarkup.CANVAS,
57          HtmlMarkup.CAPTION,
58          HtmlMarkup.CITE,
59          HtmlMarkup.CODE,
60          HtmlMarkup.COL,
61          HtmlMarkup.COLGROUP,
62          HtmlMarkup.COMMAND,
63          HtmlMarkup.DATA,
64          HtmlMarkup.DATALIST,
65          HtmlMarkup.DD,
66          HtmlMarkup.DEL,
67          HtmlMarkup.DETAILS,
68          HtmlMarkup.DFN,
69          HtmlMarkup.DIALOG,
70          HtmlMarkup.DIV,
71          HtmlMarkup.DL,
72          HtmlMarkup.DT,
73          HtmlMarkup.EM,
74          HtmlMarkup.EMBED,
75          HtmlMarkup.FIELDSET,
76          HtmlMarkup.FIGCAPTION,
77          HtmlMarkup.FIGURE,
78          HtmlMarkup.FOOTER,
79          HtmlMarkup.FORM,
80          HtmlMarkup.H1,
81          HtmlMarkup.H2,
82          HtmlMarkup.H3,
83          HtmlMarkup.H4,
84          HtmlMarkup.H5,
85          HtmlMarkup.HEAD,
86          HtmlMarkup.HEADER,
87          HtmlMarkup.HGROUP,
88          HtmlMarkup.HR,
89          HtmlMarkup.HTML,
90          HtmlMarkup.I,
91          HtmlMarkup.IFRAME,
92          HtmlMarkup.IMG,
93          HtmlMarkup.INPUT,
94          HtmlMarkup.INS,
95          HtmlMarkup.KBD,
96          HtmlMarkup.KEYGEN,
97          HtmlMarkup.LABEL,
98          HtmlMarkup.LEGEND,
99          HtmlMarkup.LI,
100         HtmlMarkup.LINK,
101         HtmlMarkup.MAIN,
102         HtmlMarkup.MAP,
103         HtmlMarkup.MARK,
104         HtmlMarkup.MENU,
105         HtmlMarkup.MENUITEM,
106         HtmlMarkup.META,
107         HtmlMarkup.METER,
108         HtmlMarkup.NAV,
109         HtmlMarkup.NOSCRIPT,
110         HtmlMarkup.OBJECT,
111         HtmlMarkup.OL,
112         HtmlMarkup.OPTGROUP,
113         HtmlMarkup.OPTION,
114         HtmlMarkup.OUTPUT,
115         HtmlMarkup.P,
116         HtmlMarkup.PARAM,
117         HtmlMarkup.PICTURE,
118         HtmlMarkup.PRE,
119         HtmlMarkup.PROGRESS,
120         HtmlMarkup.Q,
121         HtmlMarkup.RP,
122         HtmlMarkup.RT,
123         HtmlMarkup.RUBY,
124         HtmlMarkup.S,
125         HtmlMarkup.SAMP,
126         HtmlMarkup.SECTION,
127         HtmlMarkup.SCRIPT,
128         HtmlMarkup.SELECT,
129         HtmlMarkup.SMALL,
130         HtmlMarkup.SOURCE,
131         HtmlMarkup.SPAN,
132         HtmlMarkup.STRONG,
133         HtmlMarkup.STYLE,
134         HtmlMarkup.SUB,
135         HtmlMarkup.SUMMARY,
136         HtmlMarkup.SUP,
137         HtmlMarkup.SVG,
138         HtmlMarkup.TABLE,
139         HtmlMarkup.TBODY,
140         HtmlMarkup.TD,
141         HtmlMarkup.TEMPLATE,
142         HtmlMarkup.TEXTAREA,
143         HtmlMarkup.TFOOT,
144         HtmlMarkup.TH,
145         HtmlMarkup.THEAD,
146         HtmlMarkup.TIME,
147         HtmlMarkup.TITLE,
148         HtmlMarkup.TR,
149         HtmlMarkup.TRACK,
150         HtmlMarkup.U,
151         HtmlMarkup.UL,
152         HtmlMarkup.VAR,
153         HtmlMarkup.VIDEO,
154         HtmlMarkup.WBR
155     };
156 
157     private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length);
158 
159     private static final int ASCII = 0x7E;
160 
161     static {
162         for (Tag tag : ALL_TAGS) {
163             TAG_MAP.put(tag.toString(), tag);
164         }
165     }
166 
167     /**
168      * Returns a tag for a defined HTML tag name. This is one of
169      * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
170      * If the given name does not represent one of the defined tags, then
171      * <code>null</code> will be returned.
172      *
173      * @param tagName the <code>String</code> name requested.
174      * @return a tag constant corresponding to the <code>tagName</code>,
175      *    or <code>null</code> if not found.
176      * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
177      * @since 1.1
178      */
179     public static Tag getHtmlTag(String tagName) {
180         return TAG_MAP.get(tagName);
181     }
182 
183     /**
184      * Escape special HTML characters in a String in <code>xml</code> mode.
185      *
186      * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
187      *
188      * @param text the String to escape, may be null.
189      * @return The escaped text or the empty string if text == null.
190      * @see #escapeHTML(String,boolean)
191      */
192     public static String escapeHTML(String text) {
193         return escapeHTML(text, true);
194     }
195 
196     /**
197      * Escape special HTML characters in a String.
198      *
199      * <pre>
200      * &lt; becomes <code>&#38;lt;</code>
201      * &gt; becomes <code>&#38;gt;</code>
202      * &amp; becomes <code>&#38;amp;</code>
203      * " becomes <code>&#38;quot;</code>
204      * ' becomes <code>&#38;apos;</code> if xmlMode = true
205      * </pre>
206      *
207      * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
208      * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
209      *
210      * <b>Note</b>: all characters are encoded, i.e.:
211      * <pre>
212      * \u0159       = &#38;#x159;
213      * \uD835\uDFED = &#38;#x1d7ed;
214      * </pre>
215      *
216      * @param text The String to escape, may be null.
217      * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
218      * characters by numeric characters references.
219      * @return The escaped text or the empty string if text == null.
220      * @since 1.1
221      * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
222      * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
223      */
224     public static String escapeHTML(final String text, final boolean xmlMode) {
225         if (text == null) {
226             return "";
227         }
228 
229         int length = text.length();
230         StringBuilder buffer = new StringBuilder(length);
231 
232         for (int i = 0; i < length; ++i) {
233             char c = text.charAt(i);
234             switch (c) {
235                 case '<':
236                     buffer.append("&lt;");
237                     break;
238                 case '>':
239                     buffer.append("&gt;");
240                     break;
241                 case '&':
242                     buffer.append("&amp;");
243                     break;
244                 case '\"':
245                     buffer.append("&quot;");
246                     break;
247                 default:
248                     if (xmlMode) {
249                         if (c == '\'') {
250                             buffer.append("&apos;");
251                         } else {
252                             buffer.append(c);
253                         }
254                     } else {
255                         if (c <= ASCII) {
256                             // ASCII.
257                             buffer.append(c);
258                         } else {
259                             buffer.append("&#x");
260                             if (isHighSurrogate(c)) {
261                                 buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i))));
262                             } else {
263                                 buffer.append(Integer.toHexString(c));
264                             }
265                             buffer.append(';');
266                         }
267                     }
268             }
269         }
270 
271         return buffer.toString();
272     }
273 
274     /**
275      * Unescapes HTML entities in a string in non xml mode.
276      *
277      * @param text the <code>String</code> to unescape, may be null.
278      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
279      * @since 1.1.1.
280      * @see #unescapeHTML(String, boolean)
281      */
282     public static String unescapeHTML(String text) {
283         return unescapeHTML(text, false);
284     }
285 
286     /**
287      * Unescapes HTML entities in a string.
288      *
289      * <p> Unescapes a string containing entity escapes to a string
290      * containing the actual Unicode characters corresponding to the
291      * escapes. Supports HTML 4.0 entities.</p>
292      *
293      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
294      * will become "&lt;Fran&ccedil;ais&gt;".</p>
295      *
296      * <b>Note</b>: all unicode entities are decoded, i.e.:
297      * <pre>
298      * &#38;#x159;   = \u0159
299      * &#38;#x1d7ed; = \uD835\uDFED
300      * </pre>
301      *
302      * @param text the <code>String</code> to unescape, may be null.
303      * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
304      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
305      * @since 1.1.1.
306      */
307     public static String unescapeHTML(String text, boolean xmlMode) {
308         if (text == null) {
309             return null;
310         }
311 
312         String unescaped;
313         if (xmlMode) {
314             unescaped = StringEscapeUtils.unescapeXml(text);
315         } else {
316             // StringEscapeUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
317             unescaped = StringEscapeUtils.unescapeHtml4(text);
318         }
319 
320         String tmp = unescaped;
321         List<String> entities = new ArrayList<>();
322         while (true) {
323             int i = tmp.indexOf("&#x");
324             if (i == -1) {
325                 break;
326             }
327 
328             tmp = tmp.substring(i + 3);
329             if (tmp.indexOf(';') != -1) {
330                 String entity = tmp.substring(0, tmp.indexOf(';'));
331                 try {
332                     Integer.parseInt(entity, 16);
333                     entities.add(entity);
334                 } catch (NumberFormatException e) {
335                     // nop
336                 }
337             }
338         }
339 
340         for (String entity : entities) {
341             int codePoint = Integer.parseInt(entity, 16);
342             unescaped = StringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint)));
343         }
344 
345         return unescaped;
346     }
347 
348     /**
349      * Encode an url
350      *
351      * @param url the String to encode, may be null
352      * @return the text encoded, null if null String input
353      */
354     public static String encodeURL(String url) {
355         if (url == null) {
356             return null;
357         }
358 
359         StringBuilder encoded = new StringBuilder();
360         int length = url.length();
361 
362         char[] unicode = new char[1];
363 
364         for (int i = 0; i < length; ++i) {
365             char c = url.charAt(i);
366 
367             switch (c) {
368                 case ';':
369                 case '/':
370                 case '?':
371                 case ':':
372                 case '@':
373                 case '&':
374                 case '=':
375                 case '+':
376                 case '$':
377                 case ',':
378                 case '[':
379                 case ']': // RFC 2732 (IPV6)
380                 case '-':
381                 case '_':
382                 case '.':
383                 case '!':
384                 case '~':
385                 case '*':
386                 case '\'':
387                 case '(':
388                 case ')':
389                 case '#': // XLink mark
390                     encoded.append(c);
391                     break;
392                 default:
393                     if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
394                         encoded.append(c);
395                     } else {
396                         byte[] bytes;
397 
398                         if (isHighSurrogate(c)) {
399                             int codePoint = toCodePoint(c, url.charAt(++i));
400                             unicode = toChars(codePoint);
401                             bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8);
402                         } else {
403                             unicode[0] = c;
404                             bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8);
405                         }
406 
407                         for (byte aByte : bytes) {
408                             encoded.append('%');
409                             encoded.append(String.format("%02X", aByte));
410                         }
411                     }
412             }
413         }
414 
415         return encoded.toString();
416     }
417 
418     private HtmlTools() {
419         // utility class
420     }
421 
422     //
423     // Imported code from ASF Harmony project rev 770909
424     // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
425     //
426 
427     private static final char LUNATE_SIGMA = 0x3FF;
428     private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
429     private static final char LOW_SURROGATE = 0xDC00;
430 
431     private static int toCodePoint(char high, char low) {
432         // See RFC 2781, Section 2.2
433         // http://www.faqs.org/rfcs/rfc2781.html
434         int h = (high & LUNATE_SIGMA) << 10;
435         int l = low & LUNATE_SIGMA;
436         return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT;
437     }
438 
439     private static final char MIN_HIGH_SURROGATE = '\uD800';
440     private static final char MAX_HIGH_SURROGATE = '\uDBFF';
441 
442     private static boolean isHighSurrogate(char ch) {
443         return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch);
444     }
445 
446     private static final int MIN_CODE_POINT = 0x000000;
447     private static final int MAX_CODE_POINT = 0x10FFFF;
448     private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
449 
450     private static boolean isValidCodePoint(int codePoint) {
451         return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
452     }
453 
454     private static boolean isSupplementaryCodePoint(int codePoint) {
455         return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
456     }
457 
458     /**
459      * Converts the given code point to an equivalent character array.
460      *
461      * @param codePoint the code point to convert.
462      * @return If codePoint is a supplementary code point, returns a character array of length 2,
463      * otherwise a character array of length 1 containing only the original int as a char.
464      */
465     public static char[] toChars(int codePoint) {
466         if (!isValidCodePoint(codePoint)) {
467             throw new IllegalArgumentException("Code point " + codePoint + " is not valid");
468         }
469 
470         if (isSupplementaryCodePoint(codePoint)) {
471             int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
472             int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA);
473             int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA);
474             return new char[] {(char) high, (char) low};
475         }
476         return new char[] {(char) codePoint};
477     }
478 }