001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.maven.doxia.util;
020
021import javax.swing.text.html.HTML.Tag;
022
023import java.nio.charset.StandardCharsets;
024import java.util.ArrayList;
025import java.util.HashMap;
026import java.util.List;
027import java.util.Map;
028
029import org.apache.maven.doxia.markup.HtmlMarkup;
030
031/**
032 * The <code>HtmlTools</code> class defines methods to HTML handling.
033 *
034 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
035 * @since 1.0
036 */
037public class HtmlTools {
038    private static final Tag[] ALL_TAGS = {
039        HtmlMarkup.A,
040        HtmlMarkup.ABBR,
041        HtmlMarkup.ADDRESS,
042        HtmlMarkup.AREA,
043        HtmlMarkup.ARTICLE,
044        HtmlMarkup.ASIDE,
045        HtmlMarkup.AUDIO,
046        HtmlMarkup.B,
047        HtmlMarkup.BASE,
048        HtmlMarkup.BDI,
049        HtmlMarkup.BDO,
050        HtmlMarkup.BLOCKQUOTE,
051        HtmlMarkup.BODY,
052        HtmlMarkup.BR,
053        HtmlMarkup.BUTTON,
054        HtmlMarkup.CANVAS,
055        HtmlMarkup.CAPTION,
056        HtmlMarkup.CITE,
057        HtmlMarkup.CODE,
058        HtmlMarkup.COL,
059        HtmlMarkup.COLGROUP,
060        HtmlMarkup.COMMAND,
061        HtmlMarkup.DATA,
062        HtmlMarkup.DATALIST,
063        HtmlMarkup.DD,
064        HtmlMarkup.DEL,
065        HtmlMarkup.DETAILS,
066        HtmlMarkup.DFN,
067        HtmlMarkup.DIALOG,
068        HtmlMarkup.DIV,
069        HtmlMarkup.DL,
070        HtmlMarkup.DT,
071        HtmlMarkup.EM,
072        HtmlMarkup.EMBED,
073        HtmlMarkup.FIELDSET,
074        HtmlMarkup.FIGCAPTION,
075        HtmlMarkup.FIGURE,
076        HtmlMarkup.FOOTER,
077        HtmlMarkup.FORM,
078        HtmlMarkup.H1,
079        HtmlMarkup.H2,
080        HtmlMarkup.H3,
081        HtmlMarkup.H4,
082        HtmlMarkup.H5,
083        HtmlMarkup.HEAD,
084        HtmlMarkup.HEADER,
085        HtmlMarkup.HGROUP,
086        HtmlMarkup.HR,
087        HtmlMarkup.HTML,
088        HtmlMarkup.I,
089        HtmlMarkup.IFRAME,
090        HtmlMarkup.IMG,
091        HtmlMarkup.INPUT,
092        HtmlMarkup.INS,
093        HtmlMarkup.KBD,
094        HtmlMarkup.KEYGEN,
095        HtmlMarkup.LABEL,
096        HtmlMarkup.LEGEND,
097        HtmlMarkup.LI,
098        HtmlMarkup.LINK,
099        HtmlMarkup.MAIN,
100        HtmlMarkup.MAP,
101        HtmlMarkup.MARK,
102        HtmlMarkup.MENU,
103        HtmlMarkup.MENUITEM,
104        HtmlMarkup.META,
105        HtmlMarkup.METER,
106        HtmlMarkup.NAV,
107        HtmlMarkup.NOSCRIPT,
108        HtmlMarkup.OBJECT,
109        HtmlMarkup.OL,
110        HtmlMarkup.OPTGROUP,
111        HtmlMarkup.OPTION,
112        HtmlMarkup.OUTPUT,
113        HtmlMarkup.P,
114        HtmlMarkup.PARAM,
115        HtmlMarkup.PICTURE,
116        HtmlMarkup.PRE,
117        HtmlMarkup.PROGRESS,
118        HtmlMarkup.Q,
119        HtmlMarkup.RP,
120        HtmlMarkup.RT,
121        HtmlMarkup.RUBY,
122        HtmlMarkup.S,
123        HtmlMarkup.SAMP,
124        HtmlMarkup.SECTION,
125        HtmlMarkup.SCRIPT,
126        HtmlMarkup.SELECT,
127        HtmlMarkup.SMALL,
128        HtmlMarkup.SOURCE,
129        HtmlMarkup.SPAN,
130        HtmlMarkup.STRONG,
131        HtmlMarkup.STYLE,
132        HtmlMarkup.SUB,
133        HtmlMarkup.SUMMARY,
134        HtmlMarkup.SUP,
135        HtmlMarkup.SVG,
136        HtmlMarkup.TABLE,
137        HtmlMarkup.TBODY,
138        HtmlMarkup.TD,
139        HtmlMarkup.TEMPLATE,
140        HtmlMarkup.TEXTAREA,
141        HtmlMarkup.TFOOT,
142        HtmlMarkup.TH,
143        HtmlMarkup.THEAD,
144        HtmlMarkup.TIME,
145        HtmlMarkup.TITLE,
146        HtmlMarkup.TR,
147        HtmlMarkup.TRACK,
148        HtmlMarkup.U,
149        HtmlMarkup.UL,
150        HtmlMarkup.VAR,
151        HtmlMarkup.VIDEO,
152        HtmlMarkup.WBR
153    };
154
155    private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length);
156
157    private static final int ASCII = 0x7E;
158
159    static {
160        for (Tag tag : ALL_TAGS) {
161            TAG_MAP.put(tag.toString(), tag);
162        }
163    }
164
165    /**
166     * Returns a tag for a defined HTML tag name. This is one of
167     * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
168     * If the given name does not represent one of the defined tags, then
169     * <code>null</code> will be returned.
170     *
171     * @param tagName the <code>String</code> name requested.
172     * @return a tag constant corresponding to the <code>tagName</code>,
173     *    or <code>null</code> if not found.
174     * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
175     * @since 1.1
176     */
177    public static Tag getHtmlTag(String tagName) {
178        return TAG_MAP.get(tagName);
179    }
180
181    /**
182     * Escape special HTML characters in a String in <code>xml</code> mode.
183     *
184     * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
185     *
186     * @param text the String to escape, may be null.
187     * @return The escaped text or the empty string if text == null.
188     * @see #escapeHTML(String,boolean)
189     */
190    public static String escapeHTML(String text) {
191        return escapeHTML(text, true);
192    }
193
194    /**
195     * Escape special HTML characters in a String.
196     *
197     * <pre>
198     * &lt; becomes <code>&#38;lt;</code>
199     * &gt; becomes <code>&#38;gt;</code>
200     * &amp; becomes <code>&#38;amp;</code>
201     * " becomes <code>&#38;quot;</code>
202     * ' becomes <code>&#38;apos;</code> if xmlMode = true
203     * </pre>
204     *
205     * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
206     * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
207     *
208     * <b>Note</b>: all characters are encoded, i.e.:
209     * <pre>
210     * \u0159       = &#38;#x159;
211     * \uD835\uDFED = &#38;#x1d7ed;
212     * </pre>
213     *
214     * @param text The String to escape, may be null.
215     * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
216     * characters by numeric characters references.
217     * @return The escaped text or the empty string if text == null.
218     * @since 1.1
219     * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
220     * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
221     */
222    public static String escapeHTML(final String text, final boolean xmlMode) {
223        if (text == null) {
224            return "";
225        }
226
227        int length = text.length();
228        StringBuilder buffer = new StringBuilder(length);
229
230        for (int i = 0; i < length; ++i) {
231            char c = text.charAt(i);
232            switch (c) {
233                case '<':
234                    buffer.append("&lt;");
235                    break;
236                case '>':
237                    buffer.append("&gt;");
238                    break;
239                case '&':
240                    buffer.append("&amp;");
241                    break;
242                case '\"':
243                    buffer.append("&quot;");
244                    break;
245                default:
246                    if (xmlMode) {
247                        if (c == '\'') {
248                            buffer.append("&apos;");
249                        } else {
250                            buffer.append(c);
251                        }
252                    } else {
253                        if (c <= ASCII) {
254                            // ASCII.
255                            buffer.append(c);
256                        } else {
257                            buffer.append("&#x");
258                            if (isHighSurrogate(c)) {
259                                buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i))));
260                            } else {
261                                buffer.append(Integer.toHexString(c));
262                            }
263                            buffer.append(';');
264                        }
265                    }
266            }
267        }
268
269        return buffer.toString();
270    }
271
272    /**
273     * Unescapes HTML entities in a string in non xml mode.
274     *
275     * @param text the <code>String</code> to unescape, may be null.
276     * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
277     * @since 1.1.1.
278     * @see #unescapeHTML(String, boolean)
279     */
280    public static String unescapeHTML(String text) {
281        return unescapeHTML(text, false);
282    }
283
284    /**
285     * Unescapes HTML entities in a string.
286     *
287     * <p> Unescapes a string containing entity escapes to a string
288     * containing the actual Unicode characters corresponding to the
289     * escapes. Supports HTML 4.0 entities.</p>
290     *
291     * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
292     * will become "&lt;Fran&ccedil;ais&gt;".</p>
293     *
294     * <b>Note</b>: all unicode entities are decoded, i.e.:
295     * <pre>
296     * &#38;#x159;   = \u0159
297     * &#38;#x1d7ed; = \uD835\uDFED
298     * </pre>
299     *
300     * @param text the <code>String</code> to unescape, may be null.
301     * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
302     * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
303     * @since 1.1.1.
304     */
305    public static String unescapeHTML(String text, boolean xmlMode) {
306        if (text == null) {
307            return null;
308        }
309
310        String unescaped;
311        if (xmlMode) {
312            unescaped = HtmlEntityUtils.unescapeXml(text);
313        } else {
314            // HtmlEntityUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
315            unescaped = HtmlEntityUtils.unescapeHtml4(text);
316        }
317
318        String tmp = unescaped;
319        List<String> entities = new ArrayList<>();
320        while (true) {
321            int i = tmp.indexOf("&#x");
322            if (i == -1) {
323                break;
324            }
325
326            tmp = tmp.substring(i + 3);
327            if (tmp.indexOf(';') != -1) {
328                String entity = tmp.substring(0, tmp.indexOf(';'));
329                try {
330                    Integer.parseInt(entity, 16);
331                    entities.add(entity);
332                } catch (NumberFormatException e) {
333                    // nop
334                }
335            }
336        }
337
338        for (String entity : entities) {
339            int codePoint = Integer.parseInt(entity, 16);
340            unescaped = DoxiaStringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint)));
341        }
342
343        return unescaped;
344    }
345
346    /**
347     * Encode an url
348     *
349     * @param url the String to encode, may be null
350     * @return the text encoded, null if null String input
351     */
352    public static String encodeURL(String url) {
353        if (url == null) {
354            return null;
355        }
356
357        StringBuilder encoded = new StringBuilder();
358        int length = url.length();
359
360        char[] unicode = new char[1];
361
362        for (int i = 0; i < length; ++i) {
363            char c = url.charAt(i);
364
365            switch (c) {
366                case ';':
367                case '/':
368                case '?':
369                case ':':
370                case '@':
371                case '&':
372                case '=':
373                case '+':
374                case '$':
375                case ',':
376                case '[':
377                case ']': // RFC 2732 (IPV6)
378                case '-':
379                case '_':
380                case '.':
381                case '!':
382                case '~':
383                case '*':
384                case '\'':
385                case '(':
386                case ')':
387                case '#': // XLink mark
388                    encoded.append(c);
389                    break;
390                default:
391                    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
392                        encoded.append(c);
393                    } else {
394                        byte[] bytes;
395
396                        if (isHighSurrogate(c)) {
397                            int codePoint = toCodePoint(c, url.charAt(++i));
398                            unicode = toChars(codePoint);
399                            bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8);
400                        } else {
401                            unicode[0] = c;
402                            bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8);
403                        }
404
405                        for (byte aByte : bytes) {
406                            encoded.append('%');
407                            encoded.append(String.format("%02X", aByte));
408                        }
409                    }
410            }
411        }
412
413        return encoded.toString();
414    }
415
416    private HtmlTools() {
417        // utility class
418    }
419
420    //
421    // Imported code from ASF Harmony project rev 770909
422    // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
423    //
424
425    private static final char LUNATE_SIGMA = 0x3FF;
426    private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
427    private static final char LOW_SURROGATE = 0xDC00;
428
429    private static int toCodePoint(char high, char low) {
430        // See RFC 2781, Section 2.2
431        // http://www.faqs.org/rfcs/rfc2781.html
432        int h = (high & LUNATE_SIGMA) << 10;
433        int l = low & LUNATE_SIGMA;
434        return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT;
435    }
436
437    private static final char MIN_HIGH_SURROGATE = '\uD800';
438    private static final char MAX_HIGH_SURROGATE = '\uDBFF';
439
440    private static boolean isHighSurrogate(char ch) {
441        return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch);
442    }
443
444    private static final int MIN_CODE_POINT = 0x000000;
445    private static final int MAX_CODE_POINT = 0x10FFFF;
446    private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
447
448    private static boolean isValidCodePoint(int codePoint) {
449        return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
450    }
451
452    private static boolean isSupplementaryCodePoint(int codePoint) {
453        return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
454    }
455
456    /**
457     * Converts the given code point to an equivalent character array.
458     *
459     * @param codePoint the code point to convert.
460     * @return If codePoint is a supplementary code point, returns a character array of length 2,
461     * otherwise a character array of length 1 containing only the original int as a char.
462     */
463    public static char[] toChars(int codePoint) {
464        if (!isValidCodePoint(codePoint)) {
465            throw new IllegalArgumentException("Code point " + codePoint + " is not valid");
466        }
467
468        if (isSupplementaryCodePoint(codePoint)) {
469            int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
470            int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA);
471            int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA);
472            return new char[] {(char) high, (char) low};
473        }
474        return new char[] {(char) codePoint};
475    }
476}