View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.maven.doxia.util;
20  
21  import javax.swing.text.html.HTML.Tag;
22  
23  import java.nio.charset.StandardCharsets;
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  
29  import org.apache.maven.doxia.markup.HtmlMarkup;
30  
31  /**
32   * The <code>HtmlTools</code> class defines methods to HTML handling.
33   *
34   * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
35   * @since 1.0
36   */
37  public class HtmlTools {
38      private static final Tag[] ALL_TAGS = {
39          HtmlMarkup.A,
40          HtmlMarkup.ABBR,
41          HtmlMarkup.ADDRESS,
42          HtmlMarkup.AREA,
43          HtmlMarkup.ARTICLE,
44          HtmlMarkup.ASIDE,
45          HtmlMarkup.AUDIO,
46          HtmlMarkup.B,
47          HtmlMarkup.BASE,
48          HtmlMarkup.BDI,
49          HtmlMarkup.BDO,
50          HtmlMarkup.BLOCKQUOTE,
51          HtmlMarkup.BODY,
52          HtmlMarkup.BR,
53          HtmlMarkup.BUTTON,
54          HtmlMarkup.CANVAS,
55          HtmlMarkup.CAPTION,
56          HtmlMarkup.CITE,
57          HtmlMarkup.CODE,
58          HtmlMarkup.COL,
59          HtmlMarkup.COLGROUP,
60          HtmlMarkup.COMMAND,
61          HtmlMarkup.DATA,
62          HtmlMarkup.DATALIST,
63          HtmlMarkup.DD,
64          HtmlMarkup.DEL,
65          HtmlMarkup.DETAILS,
66          HtmlMarkup.DFN,
67          HtmlMarkup.DIALOG,
68          HtmlMarkup.DIV,
69          HtmlMarkup.DL,
70          HtmlMarkup.DT,
71          HtmlMarkup.EM,
72          HtmlMarkup.EMBED,
73          HtmlMarkup.FIELDSET,
74          HtmlMarkup.FIGCAPTION,
75          HtmlMarkup.FIGURE,
76          HtmlMarkup.FOOTER,
77          HtmlMarkup.FORM,
78          HtmlMarkup.H1,
79          HtmlMarkup.H2,
80          HtmlMarkup.H3,
81          HtmlMarkup.H4,
82          HtmlMarkup.H5,
83          HtmlMarkup.HEAD,
84          HtmlMarkup.HEADER,
85          HtmlMarkup.HGROUP,
86          HtmlMarkup.HR,
87          HtmlMarkup.HTML,
88          HtmlMarkup.I,
89          HtmlMarkup.IFRAME,
90          HtmlMarkup.IMG,
91          HtmlMarkup.INPUT,
92          HtmlMarkup.INS,
93          HtmlMarkup.KBD,
94          HtmlMarkup.KEYGEN,
95          HtmlMarkup.LABEL,
96          HtmlMarkup.LEGEND,
97          HtmlMarkup.LI,
98          HtmlMarkup.LINK,
99          HtmlMarkup.MAIN,
100         HtmlMarkup.MAP,
101         HtmlMarkup.MARK,
102         HtmlMarkup.MENU,
103         HtmlMarkup.MENUITEM,
104         HtmlMarkup.META,
105         HtmlMarkup.METER,
106         HtmlMarkup.NAV,
107         HtmlMarkup.NOSCRIPT,
108         HtmlMarkup.OBJECT,
109         HtmlMarkup.OL,
110         HtmlMarkup.OPTGROUP,
111         HtmlMarkup.OPTION,
112         HtmlMarkup.OUTPUT,
113         HtmlMarkup.P,
114         HtmlMarkup.PARAM,
115         HtmlMarkup.PICTURE,
116         HtmlMarkup.PRE,
117         HtmlMarkup.PROGRESS,
118         HtmlMarkup.Q,
119         HtmlMarkup.RP,
120         HtmlMarkup.RT,
121         HtmlMarkup.RUBY,
122         HtmlMarkup.S,
123         HtmlMarkup.SAMP,
124         HtmlMarkup.SECTION,
125         HtmlMarkup.SCRIPT,
126         HtmlMarkup.SELECT,
127         HtmlMarkup.SMALL,
128         HtmlMarkup.SOURCE,
129         HtmlMarkup.SPAN,
130         HtmlMarkup.STRONG,
131         HtmlMarkup.STYLE,
132         HtmlMarkup.SUB,
133         HtmlMarkup.SUMMARY,
134         HtmlMarkup.SUP,
135         HtmlMarkup.SVG,
136         HtmlMarkup.TABLE,
137         HtmlMarkup.TBODY,
138         HtmlMarkup.TD,
139         HtmlMarkup.TEMPLATE,
140         HtmlMarkup.TEXTAREA,
141         HtmlMarkup.TFOOT,
142         HtmlMarkup.TH,
143         HtmlMarkup.THEAD,
144         HtmlMarkup.TIME,
145         HtmlMarkup.TITLE,
146         HtmlMarkup.TR,
147         HtmlMarkup.TRACK,
148         HtmlMarkup.U,
149         HtmlMarkup.UL,
150         HtmlMarkup.VAR,
151         HtmlMarkup.VIDEO,
152         HtmlMarkup.WBR
153     };
154 
155     private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length);
156 
157     private static final int ASCII = 0x7E;
158 
159     static {
160         for (Tag tag : ALL_TAGS) {
161             TAG_MAP.put(tag.toString(), tag);
162         }
163     }
164 
165     /**
166      * Returns a tag for a defined HTML tag name. This is one of
167      * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
168      * If the given name does not represent one of the defined tags, then
169      * <code>null</code> will be returned.
170      *
171      * @param tagName the <code>String</code> name requested.
172      * @return a tag constant corresponding to the <code>tagName</code>,
173      *    or <code>null</code> if not found.
174      * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
175      * @since 1.1
176      */
177     public static Tag getHtmlTag(String tagName) {
178         return TAG_MAP.get(tagName);
179     }
180 
181     /**
182      * Escape special HTML characters in a String in <code>xml</code> mode.
183      *
184      * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
185      *
186      * @param text the String to escape, may be null.
187      * @return The escaped text or the empty string if text == null.
188      * @see #escapeHTML(String,boolean)
189      */
190     public static String escapeHTML(String text) {
191         return escapeHTML(text, true);
192     }
193 
194     /**
195      * Escape special HTML characters in a String.
196      *
197      * <pre>
198      * &lt; becomes <code>&#38;lt;</code>
199      * &gt; becomes <code>&#38;gt;</code>
200      * &amp; becomes <code>&#38;amp;</code>
201      * " becomes <code>&#38;quot;</code>
202      * ' becomes <code>&#38;apos;</code> if xmlMode = true
203      * </pre>
204      *
205      * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
206      * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
207      *
208      * <b>Note</b>: all characters are encoded, i.e.:
209      * <pre>
210      * \u0159       = &#38;#x159;
211      * \uD835\uDFED = &#38;#x1d7ed;
212      * </pre>
213      *
214      * @param text The String to escape, may be null.
215      * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
216      * characters by numeric characters references.
217      * @return The escaped text or the empty string if text == null.
218      * @since 1.1
219      * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
220      * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
221      */
222     public static String escapeHTML(final String text, final boolean xmlMode) {
223         if (text == null) {
224             return "";
225         }
226 
227         int length = text.length();
228         StringBuilder buffer = new StringBuilder(length);
229 
230         for (int i = 0; i < length; ++i) {
231             char c = text.charAt(i);
232             switch (c) {
233                 case '<':
234                     buffer.append("&lt;");
235                     break;
236                 case '>':
237                     buffer.append("&gt;");
238                     break;
239                 case '&':
240                     buffer.append("&amp;");
241                     break;
242                 case '\"':
243                     buffer.append("&quot;");
244                     break;
245                 default:
246                     if (xmlMode) {
247                         if (c == '\'') {
248                             buffer.append("&apos;");
249                         } else {
250                             buffer.append(c);
251                         }
252                     } else {
253                         if (c <= ASCII) {
254                             // ASCII.
255                             buffer.append(c);
256                         } else {
257                             buffer.append("&#x");
258                             if (isHighSurrogate(c)) {
259                                 buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i))));
260                             } else {
261                                 buffer.append(Integer.toHexString(c));
262                             }
263                             buffer.append(';');
264                         }
265                     }
266             }
267         }
268 
269         return buffer.toString();
270     }
271 
272     /**
273      * Unescapes HTML entities in a string in non xml mode.
274      *
275      * @param text the <code>String</code> to unescape, may be null.
276      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
277      * @since 1.1.1.
278      * @see #unescapeHTML(String, boolean)
279      */
280     public static String unescapeHTML(String text) {
281         return unescapeHTML(text, false);
282     }
283 
284     /**
285      * Unescapes HTML entities in a string.
286      *
287      * <p> Unescapes a string containing entity escapes to a string
288      * containing the actual Unicode characters corresponding to the
289      * escapes. Supports HTML 4.0 entities.</p>
290      *
291      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
292      * will become "&lt;Fran&ccedil;ais&gt;".</p>
293      *
294      * <b>Note</b>: all unicode entities are decoded, i.e.:
295      * <pre>
296      * &#38;#x159;   = \u0159
297      * &#38;#x1d7ed; = \uD835\uDFED
298      * </pre>
299      *
300      * @param text the <code>String</code> to unescape, may be null.
301      * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
302      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
303      * @since 1.1.1.
304      */
305     public static String unescapeHTML(String text, boolean xmlMode) {
306         if (text == null) {
307             return null;
308         }
309 
310         String unescaped;
311         if (xmlMode) {
312             unescaped = HtmlEntityUtils.unescapeXml(text);
313         } else {
314             // HtmlEntityUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
315             unescaped = HtmlEntityUtils.unescapeHtml4(text);
316         }
317 
318         String tmp = unescaped;
319         List<String> entities = new ArrayList<>();
320         while (true) {
321             int i = tmp.indexOf("&#x");
322             if (i == -1) {
323                 break;
324             }
325 
326             tmp = tmp.substring(i + 3);
327             if (tmp.indexOf(';') != -1) {
328                 String entity = tmp.substring(0, tmp.indexOf(';'));
329                 try {
330                     Integer.parseInt(entity, 16);
331                     entities.add(entity);
332                 } catch (NumberFormatException e) {
333                     // nop
334                 }
335             }
336         }
337 
338         for (String entity : entities) {
339             int codePoint = Integer.parseInt(entity, 16);
340             unescaped = DoxiaStringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint)));
341         }
342 
343         return unescaped;
344     }
345 
346     /**
347      * Encode an url
348      *
349      * @param url the String to encode, may be null
350      * @return the text encoded, null if null String input
351      */
352     public static String encodeURL(String url) {
353         if (url == null) {
354             return null;
355         }
356 
357         StringBuilder encoded = new StringBuilder();
358         int length = url.length();
359 
360         char[] unicode = new char[1];
361 
362         for (int i = 0; i < length; ++i) {
363             char c = url.charAt(i);
364 
365             switch (c) {
366                 case ';':
367                 case '/':
368                 case '?':
369                 case ':':
370                 case '@':
371                 case '&':
372                 case '=':
373                 case '+':
374                 case '$':
375                 case ',':
376                 case '[':
377                 case ']': // RFC 2732 (IPV6)
378                 case '-':
379                 case '_':
380                 case '.':
381                 case '!':
382                 case '~':
383                 case '*':
384                 case '\'':
385                 case '(':
386                 case ')':
387                 case '#': // XLink mark
388                     encoded.append(c);
389                     break;
390                 default:
391                     if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
392                         encoded.append(c);
393                     } else {
394                         byte[] bytes;
395 
396                         if (isHighSurrogate(c)) {
397                             int codePoint = toCodePoint(c, url.charAt(++i));
398                             unicode = toChars(codePoint);
399                             bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8);
400                         } else {
401                             unicode[0] = c;
402                             bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8);
403                         }
404 
405                         for (byte aByte : bytes) {
406                             encoded.append('%');
407                             encoded.append(String.format("%02X", aByte));
408                         }
409                     }
410             }
411         }
412 
413         return encoded.toString();
414     }
415 
416     private HtmlTools() {
417         // utility class
418     }
419 
420     //
421     // Imported code from ASF Harmony project rev 770909
422     // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
423     //
424 
425     private static final char LUNATE_SIGMA = 0x3FF;
426     private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
427     private static final char LOW_SURROGATE = 0xDC00;
428 
429     private static int toCodePoint(char high, char low) {
430         // See RFC 2781, Section 2.2
431         // http://www.faqs.org/rfcs/rfc2781.html
432         int h = (high & LUNATE_SIGMA) << 10;
433         int l = low & LUNATE_SIGMA;
434         return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT;
435     }
436 
437     private static final char MIN_HIGH_SURROGATE = '\uD800';
438     private static final char MAX_HIGH_SURROGATE = '\uDBFF';
439 
440     private static boolean isHighSurrogate(char ch) {
441         return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch);
442     }
443 
444     private static final int MIN_CODE_POINT = 0x000000;
445     private static final int MAX_CODE_POINT = 0x10FFFF;
446     private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
447 
448     private static boolean isValidCodePoint(int codePoint) {
449         return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
450     }
451 
452     private static boolean isSupplementaryCodePoint(int codePoint) {
453         return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
454     }
455 
456     /**
457      * Converts the given code point to an equivalent character array.
458      *
459      * @param codePoint the code point to convert.
460      * @return If codePoint is a supplementary code point, returns a character array of length 2,
461      * otherwise a character array of length 1 containing only the original int as a char.
462      */
463     public static char[] toChars(int codePoint) {
464         if (!isValidCodePoint(codePoint)) {
465             throw new IllegalArgumentException("Code point " + codePoint + " is not valid");
466         }
467 
468         if (isSupplementaryCodePoint(codePoint)) {
469             int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
470             int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA);
471             int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA);
472             return new char[] {(char) high, (char) low};
473         }
474         return new char[] {(char) codePoint};
475     }
476 }