001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.maven.doxia.util; 020 021import javax.swing.text.html.HTML.Tag; 022 023import java.nio.charset.StandardCharsets; 024import java.util.ArrayList; 025import java.util.HashMap; 026import java.util.List; 027import java.util.Map; 028 029import org.apache.maven.doxia.markup.HtmlMarkup; 030 031/** 032 * The <code>HtmlTools</code> class defines methods to HTML handling. 033 * 034 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a> 035 * @since 1.0 036 */ 037public class HtmlTools { 038 private static final Tag[] ALL_TAGS = { 039 HtmlMarkup.A, 040 HtmlMarkup.ABBR, 041 HtmlMarkup.ADDRESS, 042 HtmlMarkup.AREA, 043 HtmlMarkup.ARTICLE, 044 HtmlMarkup.ASIDE, 045 HtmlMarkup.AUDIO, 046 HtmlMarkup.B, 047 HtmlMarkup.BASE, 048 HtmlMarkup.BDI, 049 HtmlMarkup.BDO, 050 HtmlMarkup.BLOCKQUOTE, 051 HtmlMarkup.BODY, 052 HtmlMarkup.BR, 053 HtmlMarkup.BUTTON, 054 HtmlMarkup.CANVAS, 055 HtmlMarkup.CAPTION, 056 HtmlMarkup.CITE, 057 HtmlMarkup.CODE, 058 HtmlMarkup.COL, 059 HtmlMarkup.COLGROUP, 060 HtmlMarkup.COMMAND, 061 HtmlMarkup.DATA, 062 HtmlMarkup.DATALIST, 063 HtmlMarkup.DD, 064 HtmlMarkup.DEL, 065 HtmlMarkup.DETAILS, 066 HtmlMarkup.DFN, 067 HtmlMarkup.DIALOG, 068 HtmlMarkup.DIV, 069 HtmlMarkup.DL, 070 HtmlMarkup.DT, 071 HtmlMarkup.EM, 072 HtmlMarkup.EMBED, 073 HtmlMarkup.FIELDSET, 074 HtmlMarkup.FIGCAPTION, 075 HtmlMarkup.FIGURE, 076 HtmlMarkup.FOOTER, 077 HtmlMarkup.FORM, 078 HtmlMarkup.H1, 079 HtmlMarkup.H2, 080 HtmlMarkup.H3, 081 HtmlMarkup.H4, 082 HtmlMarkup.H5, 083 HtmlMarkup.HEAD, 084 HtmlMarkup.HEADER, 085 HtmlMarkup.HGROUP, 086 HtmlMarkup.HR, 087 HtmlMarkup.HTML, 088 HtmlMarkup.I, 089 HtmlMarkup.IFRAME, 090 HtmlMarkup.IMG, 091 HtmlMarkup.INPUT, 092 HtmlMarkup.INS, 093 HtmlMarkup.KBD, 094 HtmlMarkup.KEYGEN, 095 HtmlMarkup.LABEL, 096 HtmlMarkup.LEGEND, 097 HtmlMarkup.LI, 098 HtmlMarkup.LINK, 099 HtmlMarkup.MAIN, 100 HtmlMarkup.MAP, 101 HtmlMarkup.MARK, 102 HtmlMarkup.MENU, 103 HtmlMarkup.MENUITEM, 104 HtmlMarkup.META, 105 HtmlMarkup.METER, 106 HtmlMarkup.NAV, 107 HtmlMarkup.NOSCRIPT, 108 HtmlMarkup.OBJECT, 109 HtmlMarkup.OL, 110 HtmlMarkup.OPTGROUP, 111 HtmlMarkup.OPTION, 112 HtmlMarkup.OUTPUT, 113 HtmlMarkup.P, 114 HtmlMarkup.PARAM, 115 HtmlMarkup.PICTURE, 116 HtmlMarkup.PRE, 117 HtmlMarkup.PROGRESS, 118 HtmlMarkup.Q, 119 HtmlMarkup.RP, 120 HtmlMarkup.RT, 121 HtmlMarkup.RUBY, 122 HtmlMarkup.S, 123 HtmlMarkup.SAMP, 124 HtmlMarkup.SECTION, 125 HtmlMarkup.SCRIPT, 126 HtmlMarkup.SELECT, 127 HtmlMarkup.SMALL, 128 HtmlMarkup.SOURCE, 129 HtmlMarkup.SPAN, 130 HtmlMarkup.STRONG, 131 HtmlMarkup.STYLE, 132 HtmlMarkup.SUB, 133 HtmlMarkup.SUMMARY, 134 HtmlMarkup.SUP, 135 HtmlMarkup.SVG, 136 HtmlMarkup.TABLE, 137 HtmlMarkup.TBODY, 138 HtmlMarkup.TD, 139 HtmlMarkup.TEMPLATE, 140 HtmlMarkup.TEXTAREA, 141 HtmlMarkup.TFOOT, 142 HtmlMarkup.TH, 143 HtmlMarkup.THEAD, 144 HtmlMarkup.TIME, 145 HtmlMarkup.TITLE, 146 HtmlMarkup.TR, 147 HtmlMarkup.TRACK, 148 HtmlMarkup.U, 149 HtmlMarkup.UL, 150 HtmlMarkup.VAR, 151 HtmlMarkup.VIDEO, 152 HtmlMarkup.WBR 153 }; 154 155 private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length); 156 157 private static final int ASCII = 0x7E; 158 159 static { 160 for (Tag tag : ALL_TAGS) { 161 TAG_MAP.put(tag.toString(), tag); 162 } 163 } 164 165 /** 166 * Returns a tag for a defined HTML tag name. This is one of 167 * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}. 168 * If the given name does not represent one of the defined tags, then 169 * <code>null</code> will be returned. 170 * 171 * @param tagName the <code>String</code> name requested. 172 * @return a tag constant corresponding to the <code>tagName</code>, 173 * or <code>null</code> if not found. 174 * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a> 175 * @since 1.1 176 */ 177 public static Tag getHtmlTag(String tagName) { 178 return TAG_MAP.get(tagName); 179 } 180 181 /** 182 * Escape special HTML characters in a String in <code>xml</code> mode. 183 * 184 * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references. 185 * 186 * @param text the String to escape, may be null. 187 * @return The escaped text or the empty string if text == null. 188 * @see #escapeHTML(String,boolean) 189 */ 190 public static String escapeHTML(String text) { 191 return escapeHTML(text, true); 192 } 193 194 /** 195 * Escape special HTML characters in a String. 196 * 197 * <pre> 198 * < becomes <code>&lt;</code> 199 * > becomes <code>&gt;</code> 200 * & becomes <code>&amp;</code> 201 * " becomes <code>&quot;</code> 202 * ' becomes <code>&apos;</code> if xmlMode = true 203 * </pre> 204 * 205 * If <code>xmlMode</code> is true, every other character than the above remains unchanged, 206 * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code. 207 * 208 * <b>Note</b>: all characters are encoded, i.e.: 209 * <pre> 210 * \u0159 = &#x159; 211 * \uD835\uDFED = &#x1d7ed; 212 * </pre> 213 * 214 * @param text The String to escape, may be null. 215 * @param xmlMode <code>true</code> to replace also ' to &apos, <code>false</code> to replace non-ascii 216 * characters by numeric characters references. 217 * @return The escaped text or the empty string if text == null. 218 * @since 1.1 219 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a> 220 * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a> 221 */ 222 public static String escapeHTML(final String text, final boolean xmlMode) { 223 if (text == null) { 224 return ""; 225 } 226 227 int length = text.length(); 228 StringBuilder buffer = new StringBuilder(length); 229 230 for (int i = 0; i < length; ++i) { 231 char c = text.charAt(i); 232 switch (c) { 233 case '<': 234 buffer.append("<"); 235 break; 236 case '>': 237 buffer.append(">"); 238 break; 239 case '&': 240 buffer.append("&"); 241 break; 242 case '\"': 243 buffer.append("""); 244 break; 245 default: 246 if (xmlMode) { 247 if (c == '\'') { 248 buffer.append("'"); 249 } else { 250 buffer.append(c); 251 } 252 } else { 253 if (c <= ASCII) { 254 // ASCII. 255 buffer.append(c); 256 } else { 257 buffer.append("&#x"); 258 if (isHighSurrogate(c)) { 259 buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i)))); 260 } else { 261 buffer.append(Integer.toHexString(c)); 262 } 263 buffer.append(';'); 264 } 265 } 266 } 267 } 268 269 return buffer.toString(); 270 } 271 272 /** 273 * Unescapes HTML entities in a string in non xml mode. 274 * 275 * @param text the <code>String</code> to unescape, may be null. 276 * @return a new unescaped <code>String</code>, <code>null</code> if null string input. 277 * @since 1.1.1. 278 * @see #unescapeHTML(String, boolean) 279 */ 280 public static String unescapeHTML(String text) { 281 return unescapeHTML(text, false); 282 } 283 284 /** 285 * Unescapes HTML entities in a string. 286 * 287 * <p> Unescapes a string containing entity escapes to a string 288 * containing the actual Unicode characters corresponding to the 289 * escapes. Supports HTML 4.0 entities.</p> 290 * 291 * <p>For example, the string "&lt;Fran&ccedil;ais&gt;" 292 * will become "<Français>".</p> 293 * 294 * <b>Note</b>: all unicode entities are decoded, i.e.: 295 * <pre> 296 * &#x159; = \u0159 297 * &#x1d7ed; = \uD835\uDFED 298 * </pre> 299 * 300 * @param text the <code>String</code> to unescape, may be null. 301 * @param xmlMode set to <code>true</code> to replace &apos by '. 302 * @return a new unescaped <code>String</code>, <code>null</code> if null string input. 303 * @since 1.1.1. 304 */ 305 public static String unescapeHTML(String text, boolean xmlMode) { 306 if (text == null) { 307 return null; 308 } 309 310 String unescaped; 311 if (xmlMode) { 312 unescaped = HtmlEntityUtils.unescapeXml(text); 313 } else { 314 // HtmlEntityUtils.unescapeHtml4 returns entities it doesn't recognize unchanged 315 unescaped = HtmlEntityUtils.unescapeHtml4(text); 316 } 317 318 String tmp = unescaped; 319 List<String> entities = new ArrayList<>(); 320 while (true) { 321 int i = tmp.indexOf("&#x"); 322 if (i == -1) { 323 break; 324 } 325 326 tmp = tmp.substring(i + 3); 327 if (tmp.indexOf(';') != -1) { 328 String entity = tmp.substring(0, tmp.indexOf(';')); 329 try { 330 Integer.parseInt(entity, 16); 331 entities.add(entity); 332 } catch (NumberFormatException e) { 333 // nop 334 } 335 } 336 } 337 338 for (String entity : entities) { 339 int codePoint = Integer.parseInt(entity, 16); 340 unescaped = DoxiaStringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint))); 341 } 342 343 return unescaped; 344 } 345 346 /** 347 * Encode an url 348 * 349 * @param url the String to encode, may be null 350 * @return the text encoded, null if null String input 351 */ 352 public static String encodeURL(String url) { 353 if (url == null) { 354 return null; 355 } 356 357 StringBuilder encoded = new StringBuilder(); 358 int length = url.length(); 359 360 char[] unicode = new char[1]; 361 362 for (int i = 0; i < length; ++i) { 363 char c = url.charAt(i); 364 365 switch (c) { 366 case ';': 367 case '/': 368 case '?': 369 case ':': 370 case '@': 371 case '&': 372 case '=': 373 case '+': 374 case '$': 375 case ',': 376 case '[': 377 case ']': // RFC 2732 (IPV6) 378 case '-': 379 case '_': 380 case '.': 381 case '!': 382 case '~': 383 case '*': 384 case '\'': 385 case '(': 386 case ')': 387 case '#': // XLink mark 388 encoded.append(c); 389 break; 390 default: 391 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { 392 encoded.append(c); 393 } else { 394 byte[] bytes; 395 396 if (isHighSurrogate(c)) { 397 int codePoint = toCodePoint(c, url.charAt(++i)); 398 unicode = toChars(codePoint); 399 bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8); 400 } else { 401 unicode[0] = c; 402 bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8); 403 } 404 405 for (byte aByte : bytes) { 406 encoded.append('%'); 407 encoded.append(String.format("%02X", aByte)); 408 } 409 } 410 } 411 } 412 413 return encoded.toString(); 414 } 415 416 private HtmlTools() { 417 // utility class 418 } 419 420 // 421 // Imported code from ASF Harmony project rev 770909 422 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java 423 // 424 425 private static final char LUNATE_SIGMA = 0x3FF; 426 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800; 427 private static final char LOW_SURROGATE = 0xDC00; 428 429 private static int toCodePoint(char high, char low) { 430 // See RFC 2781, Section 2.2 431 // http://www.faqs.org/rfcs/rfc2781.html 432 int h = (high & LUNATE_SIGMA) << 10; 433 int l = low & LUNATE_SIGMA; 434 return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT; 435 } 436 437 private static final char MIN_HIGH_SURROGATE = '\uD800'; 438 private static final char MAX_HIGH_SURROGATE = '\uDBFF'; 439 440 private static boolean isHighSurrogate(char ch) { 441 return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch); 442 } 443 444 private static final int MIN_CODE_POINT = 0x000000; 445 private static final int MAX_CODE_POINT = 0x10FFFF; 446 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 447 448 private static boolean isValidCodePoint(int codePoint) { 449 return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint); 450 } 451 452 private static boolean isSupplementaryCodePoint(int codePoint) { 453 return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint); 454 } 455 456 /** 457 * Converts the given code point to an equivalent character array. 458 * 459 * @param codePoint the code point to convert. 460 * @return If codePoint is a supplementary code point, returns a character array of length 2, 461 * otherwise a character array of length 1 containing only the original int as a char. 462 */ 463 public static char[] toChars(int codePoint) { 464 if (!isValidCodePoint(codePoint)) { 465 throw new IllegalArgumentException("Code point " + codePoint + " is not valid"); 466 } 467 468 if (isSupplementaryCodePoint(codePoint)) { 469 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT; 470 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA); 471 int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA); 472 return new char[] {(char) high, (char) low}; 473 } 474 return new char[] {(char) codePoint}; 475 } 476}