001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.maven.doxia.util; 020 021import javax.swing.text.html.HTML.Tag; 022 023import java.nio.charset.StandardCharsets; 024import java.util.ArrayList; 025import java.util.HashMap; 026import java.util.List; 027import java.util.Map; 028 029import org.apache.commons.lang3.StringUtils; 030import org.apache.commons.text.StringEscapeUtils; 031import org.apache.maven.doxia.markup.HtmlMarkup; 032 033/** 034 * The <code>HtmlTools</code> class defines methods to HTML handling. 035 * 036 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a> 037 * @since 1.0 038 */ 039public class HtmlTools { 040 private static final Tag[] ALL_TAGS = { 041 HtmlMarkup.A, 042 HtmlMarkup.ABBR, 043 HtmlMarkup.ADDRESS, 044 HtmlMarkup.AREA, 045 HtmlMarkup.ARTICLE, 046 HtmlMarkup.ASIDE, 047 HtmlMarkup.AUDIO, 048 HtmlMarkup.B, 049 HtmlMarkup.BASE, 050 HtmlMarkup.BDI, 051 HtmlMarkup.BDO, 052 HtmlMarkup.BLOCKQUOTE, 053 HtmlMarkup.BODY, 054 HtmlMarkup.BR, 055 HtmlMarkup.BUTTON, 056 HtmlMarkup.CANVAS, 057 HtmlMarkup.CAPTION, 058 HtmlMarkup.CITE, 059 HtmlMarkup.CODE, 060 HtmlMarkup.COL, 061 HtmlMarkup.COLGROUP, 062 HtmlMarkup.COMMAND, 063 HtmlMarkup.DATA, 064 HtmlMarkup.DATALIST, 065 HtmlMarkup.DD, 066 HtmlMarkup.DEL, 067 HtmlMarkup.DETAILS, 068 HtmlMarkup.DFN, 069 HtmlMarkup.DIALOG, 070 HtmlMarkup.DIV, 071 HtmlMarkup.DL, 072 HtmlMarkup.DT, 073 HtmlMarkup.EM, 074 HtmlMarkup.EMBED, 075 HtmlMarkup.FIELDSET, 076 HtmlMarkup.FIGCAPTION, 077 HtmlMarkup.FIGURE, 078 HtmlMarkup.FOOTER, 079 HtmlMarkup.FORM, 080 HtmlMarkup.H1, 081 HtmlMarkup.H2, 082 HtmlMarkup.H3, 083 HtmlMarkup.H4, 084 HtmlMarkup.H5, 085 HtmlMarkup.HEAD, 086 HtmlMarkup.HEADER, 087 HtmlMarkup.HGROUP, 088 HtmlMarkup.HR, 089 HtmlMarkup.HTML, 090 HtmlMarkup.I, 091 HtmlMarkup.IFRAME, 092 HtmlMarkup.IMG, 093 HtmlMarkup.INPUT, 094 HtmlMarkup.INS, 095 HtmlMarkup.KBD, 096 HtmlMarkup.KEYGEN, 097 HtmlMarkup.LABEL, 098 HtmlMarkup.LEGEND, 099 HtmlMarkup.LI, 100 HtmlMarkup.LINK, 101 HtmlMarkup.MAIN, 102 HtmlMarkup.MAP, 103 HtmlMarkup.MARK, 104 HtmlMarkup.MENU, 105 HtmlMarkup.MENUITEM, 106 HtmlMarkup.META, 107 HtmlMarkup.METER, 108 HtmlMarkup.NAV, 109 HtmlMarkup.NOSCRIPT, 110 HtmlMarkup.OBJECT, 111 HtmlMarkup.OL, 112 HtmlMarkup.OPTGROUP, 113 HtmlMarkup.OPTION, 114 HtmlMarkup.OUTPUT, 115 HtmlMarkup.P, 116 HtmlMarkup.PARAM, 117 HtmlMarkup.PICTURE, 118 HtmlMarkup.PRE, 119 HtmlMarkup.PROGRESS, 120 HtmlMarkup.Q, 121 HtmlMarkup.RP, 122 HtmlMarkup.RT, 123 HtmlMarkup.RUBY, 124 HtmlMarkup.S, 125 HtmlMarkup.SAMP, 126 HtmlMarkup.SECTION, 127 HtmlMarkup.SCRIPT, 128 HtmlMarkup.SELECT, 129 HtmlMarkup.SMALL, 130 HtmlMarkup.SOURCE, 131 HtmlMarkup.SPAN, 132 HtmlMarkup.STRONG, 133 HtmlMarkup.STYLE, 134 HtmlMarkup.SUB, 135 HtmlMarkup.SUMMARY, 136 HtmlMarkup.SUP, 137 HtmlMarkup.SVG, 138 HtmlMarkup.TABLE, 139 HtmlMarkup.TBODY, 140 HtmlMarkup.TD, 141 HtmlMarkup.TEMPLATE, 142 HtmlMarkup.TEXTAREA, 143 HtmlMarkup.TFOOT, 144 HtmlMarkup.TH, 145 HtmlMarkup.THEAD, 146 HtmlMarkup.TIME, 147 HtmlMarkup.TITLE, 148 HtmlMarkup.TR, 149 HtmlMarkup.TRACK, 150 HtmlMarkup.U, 151 HtmlMarkup.UL, 152 HtmlMarkup.VAR, 153 HtmlMarkup.VIDEO, 154 HtmlMarkup.WBR 155 }; 156 157 private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length); 158 159 private static final int ASCII = 0x7E; 160 161 static { 162 for (Tag tag : ALL_TAGS) { 163 TAG_MAP.put(tag.toString(), tag); 164 } 165 } 166 167 /** 168 * Returns a tag for a defined HTML tag name. This is one of 169 * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}. 170 * If the given name does not represent one of the defined tags, then 171 * <code>null</code> will be returned. 172 * 173 * @param tagName the <code>String</code> name requested. 174 * @return a tag constant corresponding to the <code>tagName</code>, 175 * or <code>null</code> if not found. 176 * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a> 177 * @since 1.1 178 */ 179 public static Tag getHtmlTag(String tagName) { 180 return TAG_MAP.get(tagName); 181 } 182 183 /** 184 * Escape special HTML characters in a String in <code>xml</code> mode. 185 * 186 * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references. 187 * 188 * @param text the String to escape, may be null. 189 * @return The escaped text or the empty string if text == null. 190 * @see #escapeHTML(String,boolean) 191 */ 192 public static String escapeHTML(String text) { 193 return escapeHTML(text, true); 194 } 195 196 /** 197 * Escape special HTML characters in a String. 198 * 199 * <pre> 200 * < becomes <code>&lt;</code> 201 * > becomes <code>&gt;</code> 202 * & becomes <code>&amp;</code> 203 * " becomes <code>&quot;</code> 204 * ' becomes <code>&apos;</code> if xmlMode = true 205 * </pre> 206 * 207 * If <code>xmlMode</code> is true, every other character than the above remains unchanged, 208 * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code. 209 * 210 * <b>Note</b>: all characters are encoded, i.e.: 211 * <pre> 212 * \u0159 = &#x159; 213 * \uD835\uDFED = &#x1d7ed; 214 * </pre> 215 * 216 * @param text The String to escape, may be null. 217 * @param xmlMode <code>true</code> to replace also ' to &apos, <code>false</code> to replace non-ascii 218 * characters by numeric characters references. 219 * @return The escaped text or the empty string if text == null. 220 * @since 1.1 221 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a> 222 * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a> 223 */ 224 public static String escapeHTML(final String text, final boolean xmlMode) { 225 if (text == null) { 226 return ""; 227 } 228 229 int length = text.length(); 230 StringBuilder buffer = new StringBuilder(length); 231 232 for (int i = 0; i < length; ++i) { 233 char c = text.charAt(i); 234 switch (c) { 235 case '<': 236 buffer.append("<"); 237 break; 238 case '>': 239 buffer.append(">"); 240 break; 241 case '&': 242 buffer.append("&"); 243 break; 244 case '\"': 245 buffer.append("""); 246 break; 247 default: 248 if (xmlMode) { 249 if (c == '\'') { 250 buffer.append("'"); 251 } else { 252 buffer.append(c); 253 } 254 } else { 255 if (c <= ASCII) { 256 // ASCII. 257 buffer.append(c); 258 } else { 259 buffer.append("&#x"); 260 if (isHighSurrogate(c)) { 261 buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i)))); 262 } else { 263 buffer.append(Integer.toHexString(c)); 264 } 265 buffer.append(';'); 266 } 267 } 268 } 269 } 270 271 return buffer.toString(); 272 } 273 274 /** 275 * Unescapes HTML entities in a string in non xml mode. 276 * 277 * @param text the <code>String</code> to unescape, may be null. 278 * @return a new unescaped <code>String</code>, <code>null</code> if null string input. 279 * @since 1.1.1. 280 * @see #unescapeHTML(String, boolean) 281 */ 282 public static String unescapeHTML(String text) { 283 return unescapeHTML(text, false); 284 } 285 286 /** 287 * Unescapes HTML entities in a string. 288 * 289 * <p> Unescapes a string containing entity escapes to a string 290 * containing the actual Unicode characters corresponding to the 291 * escapes. Supports HTML 4.0 entities.</p> 292 * 293 * <p>For example, the string "&lt;Fran&ccedil;ais&gt;" 294 * will become "<Français>".</p> 295 * 296 * <b>Note</b>: all unicode entities are decoded, i.e.: 297 * <pre> 298 * &#x159; = \u0159 299 * &#x1d7ed; = \uD835\uDFED 300 * </pre> 301 * 302 * @param text the <code>String</code> to unescape, may be null. 303 * @param xmlMode set to <code>true</code> to replace &apos by '. 304 * @return a new unescaped <code>String</code>, <code>null</code> if null string input. 305 * @since 1.1.1. 306 */ 307 public static String unescapeHTML(String text, boolean xmlMode) { 308 if (text == null) { 309 return null; 310 } 311 312 String unescaped; 313 if (xmlMode) { 314 unescaped = StringEscapeUtils.unescapeXml(text); 315 } else { 316 // StringEscapeUtils.unescapeHtml4 returns entities it doesn't recognize unchanged 317 unescaped = StringEscapeUtils.unescapeHtml4(text); 318 } 319 320 String tmp = unescaped; 321 List<String> entities = new ArrayList<>(); 322 while (true) { 323 int i = tmp.indexOf("&#x"); 324 if (i == -1) { 325 break; 326 } 327 328 tmp = tmp.substring(i + 3); 329 if (tmp.indexOf(';') != -1) { 330 String entity = tmp.substring(0, tmp.indexOf(';')); 331 try { 332 Integer.parseInt(entity, 16); 333 entities.add(entity); 334 } catch (NumberFormatException e) { 335 // nop 336 } 337 } 338 } 339 340 for (String entity : entities) { 341 int codePoint = Integer.parseInt(entity, 16); 342 unescaped = StringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint))); 343 } 344 345 return unescaped; 346 } 347 348 /** 349 * Encode an url 350 * 351 * @param url the String to encode, may be null 352 * @return the text encoded, null if null String input 353 */ 354 public static String encodeURL(String url) { 355 if (url == null) { 356 return null; 357 } 358 359 StringBuilder encoded = new StringBuilder(); 360 int length = url.length(); 361 362 char[] unicode = new char[1]; 363 364 for (int i = 0; i < length; ++i) { 365 char c = url.charAt(i); 366 367 switch (c) { 368 case ';': 369 case '/': 370 case '?': 371 case ':': 372 case '@': 373 case '&': 374 case '=': 375 case '+': 376 case '$': 377 case ',': 378 case '[': 379 case ']': // RFC 2732 (IPV6) 380 case '-': 381 case '_': 382 case '.': 383 case '!': 384 case '~': 385 case '*': 386 case '\'': 387 case '(': 388 case ')': 389 case '#': // XLink mark 390 encoded.append(c); 391 break; 392 default: 393 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { 394 encoded.append(c); 395 } else { 396 byte[] bytes; 397 398 if (isHighSurrogate(c)) { 399 int codePoint = toCodePoint(c, url.charAt(++i)); 400 unicode = toChars(codePoint); 401 bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8); 402 } else { 403 unicode[0] = c; 404 bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8); 405 } 406 407 for (byte aByte : bytes) { 408 encoded.append('%'); 409 encoded.append(String.format("%02X", aByte)); 410 } 411 } 412 } 413 } 414 415 return encoded.toString(); 416 } 417 418 private HtmlTools() { 419 // utility class 420 } 421 422 // 423 // Imported code from ASF Harmony project rev 770909 424 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java 425 // 426 427 private static final char LUNATE_SIGMA = 0x3FF; 428 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800; 429 private static final char LOW_SURROGATE = 0xDC00; 430 431 private static int toCodePoint(char high, char low) { 432 // See RFC 2781, Section 2.2 433 // http://www.faqs.org/rfcs/rfc2781.html 434 int h = (high & LUNATE_SIGMA) << 10; 435 int l = low & LUNATE_SIGMA; 436 return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT; 437 } 438 439 private static final char MIN_HIGH_SURROGATE = '\uD800'; 440 private static final char MAX_HIGH_SURROGATE = '\uDBFF'; 441 442 private static boolean isHighSurrogate(char ch) { 443 return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch); 444 } 445 446 private static final int MIN_CODE_POINT = 0x000000; 447 private static final int MAX_CODE_POINT = 0x10FFFF; 448 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 449 450 private static boolean isValidCodePoint(int codePoint) { 451 return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint); 452 } 453 454 private static boolean isSupplementaryCodePoint(int codePoint) { 455 return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint); 456 } 457 458 /** 459 * Converts the given code point to an equivalent character array. 460 * 461 * @param codePoint the code point to convert. 462 * @return If codePoint is a supplementary code point, returns a character array of length 2, 463 * otherwise a character array of length 1 containing only the original int as a char. 464 */ 465 public static char[] toChars(int codePoint) { 466 if (!isValidCodePoint(codePoint)) { 467 throw new IllegalArgumentException("Code point " + codePoint + " is not valid"); 468 } 469 470 if (isSupplementaryCodePoint(codePoint)) { 471 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT; 472 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA); 473 int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA); 474 return new char[] {(char) high, (char) low}; 475 } 476 return new char[] {(char) codePoint}; 477 } 478}