001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.maven.doxia.util;
020
021import javax.swing.text.html.HTML.Tag;
022
023import java.nio.charset.StandardCharsets;
024import java.util.ArrayList;
025import java.util.HashMap;
026import java.util.List;
027import java.util.Map;
028
029import org.apache.commons.lang3.StringUtils;
030import org.apache.commons.text.StringEscapeUtils;
031import org.apache.maven.doxia.markup.HtmlMarkup;
032
033/**
034 * The <code>HtmlTools</code> class defines methods to HTML handling.
035 *
036 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
037 * @since 1.0
038 */
039public class HtmlTools {
040    private static final Tag[] ALL_TAGS = {
041        HtmlMarkup.A,
042        HtmlMarkup.ABBR,
043        HtmlMarkup.ADDRESS,
044        HtmlMarkup.AREA,
045        HtmlMarkup.ARTICLE,
046        HtmlMarkup.ASIDE,
047        HtmlMarkup.AUDIO,
048        HtmlMarkup.B,
049        HtmlMarkup.BASE,
050        HtmlMarkup.BDI,
051        HtmlMarkup.BDO,
052        HtmlMarkup.BLOCKQUOTE,
053        HtmlMarkup.BODY,
054        HtmlMarkup.BR,
055        HtmlMarkup.BUTTON,
056        HtmlMarkup.CANVAS,
057        HtmlMarkup.CAPTION,
058        HtmlMarkup.CITE,
059        HtmlMarkup.CODE,
060        HtmlMarkup.COL,
061        HtmlMarkup.COLGROUP,
062        HtmlMarkup.COMMAND,
063        HtmlMarkup.DATA,
064        HtmlMarkup.DATALIST,
065        HtmlMarkup.DD,
066        HtmlMarkup.DEL,
067        HtmlMarkup.DETAILS,
068        HtmlMarkup.DFN,
069        HtmlMarkup.DIALOG,
070        HtmlMarkup.DIV,
071        HtmlMarkup.DL,
072        HtmlMarkup.DT,
073        HtmlMarkup.EM,
074        HtmlMarkup.EMBED,
075        HtmlMarkup.FIELDSET,
076        HtmlMarkup.FIGCAPTION,
077        HtmlMarkup.FIGURE,
078        HtmlMarkup.FOOTER,
079        HtmlMarkup.FORM,
080        HtmlMarkup.H1,
081        HtmlMarkup.H2,
082        HtmlMarkup.H3,
083        HtmlMarkup.H4,
084        HtmlMarkup.H5,
085        HtmlMarkup.HEAD,
086        HtmlMarkup.HEADER,
087        HtmlMarkup.HGROUP,
088        HtmlMarkup.HR,
089        HtmlMarkup.HTML,
090        HtmlMarkup.I,
091        HtmlMarkup.IFRAME,
092        HtmlMarkup.IMG,
093        HtmlMarkup.INPUT,
094        HtmlMarkup.INS,
095        HtmlMarkup.KBD,
096        HtmlMarkup.KEYGEN,
097        HtmlMarkup.LABEL,
098        HtmlMarkup.LEGEND,
099        HtmlMarkup.LI,
100        HtmlMarkup.LINK,
101        HtmlMarkup.MAIN,
102        HtmlMarkup.MAP,
103        HtmlMarkup.MARK,
104        HtmlMarkup.MENU,
105        HtmlMarkup.MENUITEM,
106        HtmlMarkup.META,
107        HtmlMarkup.METER,
108        HtmlMarkup.NAV,
109        HtmlMarkup.NOSCRIPT,
110        HtmlMarkup.OBJECT,
111        HtmlMarkup.OL,
112        HtmlMarkup.OPTGROUP,
113        HtmlMarkup.OPTION,
114        HtmlMarkup.OUTPUT,
115        HtmlMarkup.P,
116        HtmlMarkup.PARAM,
117        HtmlMarkup.PICTURE,
118        HtmlMarkup.PRE,
119        HtmlMarkup.PROGRESS,
120        HtmlMarkup.Q,
121        HtmlMarkup.RP,
122        HtmlMarkup.RT,
123        HtmlMarkup.RUBY,
124        HtmlMarkup.S,
125        HtmlMarkup.SAMP,
126        HtmlMarkup.SECTION,
127        HtmlMarkup.SCRIPT,
128        HtmlMarkup.SELECT,
129        HtmlMarkup.SMALL,
130        HtmlMarkup.SOURCE,
131        HtmlMarkup.SPAN,
132        HtmlMarkup.STRONG,
133        HtmlMarkup.STYLE,
134        HtmlMarkup.SUB,
135        HtmlMarkup.SUMMARY,
136        HtmlMarkup.SUP,
137        HtmlMarkup.SVG,
138        HtmlMarkup.TABLE,
139        HtmlMarkup.TBODY,
140        HtmlMarkup.TD,
141        HtmlMarkup.TEMPLATE,
142        HtmlMarkup.TEXTAREA,
143        HtmlMarkup.TFOOT,
144        HtmlMarkup.TH,
145        HtmlMarkup.THEAD,
146        HtmlMarkup.TIME,
147        HtmlMarkup.TITLE,
148        HtmlMarkup.TR,
149        HtmlMarkup.TRACK,
150        HtmlMarkup.U,
151        HtmlMarkup.UL,
152        HtmlMarkup.VAR,
153        HtmlMarkup.VIDEO,
154        HtmlMarkup.WBR
155    };
156
157    private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length);
158
159    private static final int ASCII = 0x7E;
160
161    static {
162        for (Tag tag : ALL_TAGS) {
163            TAG_MAP.put(tag.toString(), tag);
164        }
165    }
166
167    /**
168     * Returns a tag for a defined HTML tag name. This is one of
169     * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
170     * If the given name does not represent one of the defined tags, then
171     * <code>null</code> will be returned.
172     *
173     * @param tagName the <code>String</code> name requested.
174     * @return a tag constant corresponding to the <code>tagName</code>,
175     *    or <code>null</code> if not found.
176     * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
177     * @since 1.1
178     */
179    public static Tag getHtmlTag(String tagName) {
180        return TAG_MAP.get(tagName);
181    }
182
183    /**
184     * Escape special HTML characters in a String in <code>xml</code> mode.
185     *
186     * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
187     *
188     * @param text the String to escape, may be null.
189     * @return The escaped text or the empty string if text == null.
190     * @see #escapeHTML(String,boolean)
191     */
192    public static String escapeHTML(String text) {
193        return escapeHTML(text, true);
194    }
195
196    /**
197     * Escape special HTML characters in a String.
198     *
199     * <pre>
200     * &lt; becomes <code>&#38;lt;</code>
201     * &gt; becomes <code>&#38;gt;</code>
202     * &amp; becomes <code>&#38;amp;</code>
203     * " becomes <code>&#38;quot;</code>
204     * ' becomes <code>&#38;apos;</code> if xmlMode = true
205     * </pre>
206     *
207     * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
208     * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
209     *
210     * <b>Note</b>: all characters are encoded, i.e.:
211     * <pre>
212     * \u0159       = &#38;#x159;
213     * \uD835\uDFED = &#38;#x1d7ed;
214     * </pre>
215     *
216     * @param text The String to escape, may be null.
217     * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
218     * characters by numeric characters references.
219     * @return The escaped text or the empty string if text == null.
220     * @since 1.1
221     * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
222     * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
223     */
224    public static String escapeHTML(final String text, final boolean xmlMode) {
225        if (text == null) {
226            return "";
227        }
228
229        int length = text.length();
230        StringBuilder buffer = new StringBuilder(length);
231
232        for (int i = 0; i < length; ++i) {
233            char c = text.charAt(i);
234            switch (c) {
235                case '<':
236                    buffer.append("&lt;");
237                    break;
238                case '>':
239                    buffer.append("&gt;");
240                    break;
241                case '&':
242                    buffer.append("&amp;");
243                    break;
244                case '\"':
245                    buffer.append("&quot;");
246                    break;
247                default:
248                    if (xmlMode) {
249                        if (c == '\'') {
250                            buffer.append("&apos;");
251                        } else {
252                            buffer.append(c);
253                        }
254                    } else {
255                        if (c <= ASCII) {
256                            // ASCII.
257                            buffer.append(c);
258                        } else {
259                            buffer.append("&#x");
260                            if (isHighSurrogate(c)) {
261                                buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i))));
262                            } else {
263                                buffer.append(Integer.toHexString(c));
264                            }
265                            buffer.append(';');
266                        }
267                    }
268            }
269        }
270
271        return buffer.toString();
272    }
273
274    /**
275     * Unescapes HTML entities in a string in non xml mode.
276     *
277     * @param text the <code>String</code> to unescape, may be null.
278     * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
279     * @since 1.1.1.
280     * @see #unescapeHTML(String, boolean)
281     */
282    public static String unescapeHTML(String text) {
283        return unescapeHTML(text, false);
284    }
285
286    /**
287     * Unescapes HTML entities in a string.
288     *
289     * <p> Unescapes a string containing entity escapes to a string
290     * containing the actual Unicode characters corresponding to the
291     * escapes. Supports HTML 4.0 entities.</p>
292     *
293     * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
294     * will become "&lt;Fran&ccedil;ais&gt;".</p>
295     *
296     * <b>Note</b>: all unicode entities are decoded, i.e.:
297     * <pre>
298     * &#38;#x159;   = \u0159
299     * &#38;#x1d7ed; = \uD835\uDFED
300     * </pre>
301     *
302     * @param text the <code>String</code> to unescape, may be null.
303     * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
304     * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
305     * @since 1.1.1.
306     */
307    public static String unescapeHTML(String text, boolean xmlMode) {
308        if (text == null) {
309            return null;
310        }
311
312        String unescaped;
313        if (xmlMode) {
314            unescaped = StringEscapeUtils.unescapeXml(text);
315        } else {
316            // StringEscapeUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
317            unescaped = StringEscapeUtils.unescapeHtml4(text);
318        }
319
320        String tmp = unescaped;
321        List<String> entities = new ArrayList<>();
322        while (true) {
323            int i = tmp.indexOf("&#x");
324            if (i == -1) {
325                break;
326            }
327
328            tmp = tmp.substring(i + 3);
329            if (tmp.indexOf(';') != -1) {
330                String entity = tmp.substring(0, tmp.indexOf(';'));
331                try {
332                    Integer.parseInt(entity, 16);
333                    entities.add(entity);
334                } catch (NumberFormatException e) {
335                    // nop
336                }
337            }
338        }
339
340        for (String entity : entities) {
341            int codePoint = Integer.parseInt(entity, 16);
342            unescaped = StringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint)));
343        }
344
345        return unescaped;
346    }
347
348    /**
349     * Encode an url
350     *
351     * @param url the String to encode, may be null
352     * @return the text encoded, null if null String input
353     */
354    public static String encodeURL(String url) {
355        if (url == null) {
356            return null;
357        }
358
359        StringBuilder encoded = new StringBuilder();
360        int length = url.length();
361
362        char[] unicode = new char[1];
363
364        for (int i = 0; i < length; ++i) {
365            char c = url.charAt(i);
366
367            switch (c) {
368                case ';':
369                case '/':
370                case '?':
371                case ':':
372                case '@':
373                case '&':
374                case '=':
375                case '+':
376                case '$':
377                case ',':
378                case '[':
379                case ']': // RFC 2732 (IPV6)
380                case '-':
381                case '_':
382                case '.':
383                case '!':
384                case '~':
385                case '*':
386                case '\'':
387                case '(':
388                case ')':
389                case '#': // XLink mark
390                    encoded.append(c);
391                    break;
392                default:
393                    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
394                        encoded.append(c);
395                    } else {
396                        byte[] bytes;
397
398                        if (isHighSurrogate(c)) {
399                            int codePoint = toCodePoint(c, url.charAt(++i));
400                            unicode = toChars(codePoint);
401                            bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8);
402                        } else {
403                            unicode[0] = c;
404                            bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8);
405                        }
406
407                        for (byte aByte : bytes) {
408                            encoded.append('%');
409                            encoded.append(String.format("%02X", aByte));
410                        }
411                    }
412            }
413        }
414
415        return encoded.toString();
416    }
417
418    private HtmlTools() {
419        // utility class
420    }
421
422    //
423    // Imported code from ASF Harmony project rev 770909
424    // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
425    //
426
427    private static final char LUNATE_SIGMA = 0x3FF;
428    private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
429    private static final char LOW_SURROGATE = 0xDC00;
430
431    private static int toCodePoint(char high, char low) {
432        // See RFC 2781, Section 2.2
433        // http://www.faqs.org/rfcs/rfc2781.html
434        int h = (high & LUNATE_SIGMA) << 10;
435        int l = low & LUNATE_SIGMA;
436        return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT;
437    }
438
439    private static final char MIN_HIGH_SURROGATE = '\uD800';
440    private static final char MAX_HIGH_SURROGATE = '\uDBFF';
441
442    private static boolean isHighSurrogate(char ch) {
443        return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch);
444    }
445
446    private static final int MIN_CODE_POINT = 0x000000;
447    private static final int MAX_CODE_POINT = 0x10FFFF;
448    private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
449
450    private static boolean isValidCodePoint(int codePoint) {
451        return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
452    }
453
454    private static boolean isSupplementaryCodePoint(int codePoint) {
455        return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
456    }
457
458    /**
459     * Converts the given code point to an equivalent character array.
460     *
461     * @param codePoint the code point to convert.
462     * @return If codePoint is a supplementary code point, returns a character array of length 2,
463     * otherwise a character array of length 1 containing only the original int as a char.
464     */
465    public static char[] toChars(int codePoint) {
466        if (!isValidCodePoint(codePoint)) {
467            throw new IllegalArgumentException("Code point " + codePoint + " is not valid");
468        }
469
470        if (isSupplementaryCodePoint(codePoint)) {
471            int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
472            int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA);
473            int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA);
474            return new char[] {(char) high, (char) low};
475        }
476        return new char[] {(char) codePoint};
477    }
478}