1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19 package org.apache.maven.doxia.util;
20
21 import javax.swing.text.html.HTML.Tag;
22
23 import java.nio.charset.StandardCharsets;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28
29 import org.apache.maven.doxia.markup.HtmlMarkup;
30
31 /**
32 * The <code>HtmlTools</code> class defines methods to HTML handling.
33 *
34 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
35 * @since 1.0
36 */
37 public class HtmlTools {
38 private static final Tag[] ALL_TAGS = {
39 HtmlMarkup.A,
40 HtmlMarkup.ABBR,
41 HtmlMarkup.ADDRESS,
42 HtmlMarkup.AREA,
43 HtmlMarkup.ARTICLE,
44 HtmlMarkup.ASIDE,
45 HtmlMarkup.AUDIO,
46 HtmlMarkup.B,
47 HtmlMarkup.BASE,
48 HtmlMarkup.BDI,
49 HtmlMarkup.BDO,
50 HtmlMarkup.BLOCKQUOTE,
51 HtmlMarkup.BODY,
52 HtmlMarkup.BR,
53 HtmlMarkup.BUTTON,
54 HtmlMarkup.CANVAS,
55 HtmlMarkup.CAPTION,
56 HtmlMarkup.CITE,
57 HtmlMarkup.CODE,
58 HtmlMarkup.COL,
59 HtmlMarkup.COLGROUP,
60 HtmlMarkup.COMMAND,
61 HtmlMarkup.DATA,
62 HtmlMarkup.DATALIST,
63 HtmlMarkup.DD,
64 HtmlMarkup.DEL,
65 HtmlMarkup.DETAILS,
66 HtmlMarkup.DFN,
67 HtmlMarkup.DIALOG,
68 HtmlMarkup.DIV,
69 HtmlMarkup.DL,
70 HtmlMarkup.DT,
71 HtmlMarkup.EM,
72 HtmlMarkup.EMBED,
73 HtmlMarkup.FIELDSET,
74 HtmlMarkup.FIGCAPTION,
75 HtmlMarkup.FIGURE,
76 HtmlMarkup.FOOTER,
77 HtmlMarkup.FORM,
78 HtmlMarkup.H1,
79 HtmlMarkup.H2,
80 HtmlMarkup.H3,
81 HtmlMarkup.H4,
82 HtmlMarkup.H5,
83 HtmlMarkup.HEAD,
84 HtmlMarkup.HEADER,
85 HtmlMarkup.HGROUP,
86 HtmlMarkup.HR,
87 HtmlMarkup.HTML,
88 HtmlMarkup.I,
89 HtmlMarkup.IFRAME,
90 HtmlMarkup.IMG,
91 HtmlMarkup.INPUT,
92 HtmlMarkup.INS,
93 HtmlMarkup.KBD,
94 HtmlMarkup.KEYGEN,
95 HtmlMarkup.LABEL,
96 HtmlMarkup.LEGEND,
97 HtmlMarkup.LI,
98 HtmlMarkup.LINK,
99 HtmlMarkup.MAIN,
100 HtmlMarkup.MAP,
101 HtmlMarkup.MARK,
102 HtmlMarkup.MENU,
103 HtmlMarkup.MENUITEM,
104 HtmlMarkup.META,
105 HtmlMarkup.METER,
106 HtmlMarkup.NAV,
107 HtmlMarkup.NOSCRIPT,
108 HtmlMarkup.OBJECT,
109 HtmlMarkup.OL,
110 HtmlMarkup.OPTGROUP,
111 HtmlMarkup.OPTION,
112 HtmlMarkup.OUTPUT,
113 HtmlMarkup.P,
114 HtmlMarkup.PARAM,
115 HtmlMarkup.PICTURE,
116 HtmlMarkup.PRE,
117 HtmlMarkup.PROGRESS,
118 HtmlMarkup.Q,
119 HtmlMarkup.RP,
120 HtmlMarkup.RT,
121 HtmlMarkup.RUBY,
122 HtmlMarkup.S,
123 HtmlMarkup.SAMP,
124 HtmlMarkup.SECTION,
125 HtmlMarkup.SCRIPT,
126 HtmlMarkup.SELECT,
127 HtmlMarkup.SMALL,
128 HtmlMarkup.SOURCE,
129 HtmlMarkup.SPAN,
130 HtmlMarkup.STRONG,
131 HtmlMarkup.STYLE,
132 HtmlMarkup.SUB,
133 HtmlMarkup.SUMMARY,
134 HtmlMarkup.SUP,
135 HtmlMarkup.SVG,
136 HtmlMarkup.TABLE,
137 HtmlMarkup.TBODY,
138 HtmlMarkup.TD,
139 HtmlMarkup.TEMPLATE,
140 HtmlMarkup.TEXTAREA,
141 HtmlMarkup.TFOOT,
142 HtmlMarkup.TH,
143 HtmlMarkup.THEAD,
144 HtmlMarkup.TIME,
145 HtmlMarkup.TITLE,
146 HtmlMarkup.TR,
147 HtmlMarkup.TRACK,
148 HtmlMarkup.U,
149 HtmlMarkup.UL,
150 HtmlMarkup.VAR,
151 HtmlMarkup.VIDEO,
152 HtmlMarkup.WBR
153 };
154
155 private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length);
156
157 private static final int ASCII = 0x7E;
158
159 static {
160 for (Tag tag : ALL_TAGS) {
161 TAG_MAP.put(tag.toString(), tag);
162 }
163 }
164
165 /**
166 * Returns a tag for a defined HTML tag name. This is one of
167 * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
168 * If the given name does not represent one of the defined tags, then
169 * <code>null</code> will be returned.
170 *
171 * @param tagName the <code>String</code> name requested.
172 * @return a tag constant corresponding to the <code>tagName</code>,
173 * or <code>null</code> if not found.
174 * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
175 * @since 1.1
176 */
177 public static Tag getHtmlTag(String tagName) {
178 return TAG_MAP.get(tagName);
179 }
180
181 /**
182 * Escape special HTML characters in a String in <code>xml</code> mode.
183 *
184 * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
185 *
186 * @param text the String to escape, may be null.
187 * @return The escaped text or the empty string if text == null.
188 * @see #escapeHTML(String,boolean)
189 */
190 public static String escapeHTML(String text) {
191 return escapeHTML(text, true);
192 }
193
194 /**
195 * Escape special HTML characters in a String.
196 *
197 * <pre>
198 * < becomes <code>&lt;</code>
199 * > becomes <code>&gt;</code>
200 * & becomes <code>&amp;</code>
201 * " becomes <code>&quot;</code>
202 * ' becomes <code>&apos;</code> if xmlMode = true
203 * </pre>
204 *
205 * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
206 * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
207 *
208 * <b>Note</b>: all characters are encoded, i.e.:
209 * <pre>
210 * \u0159 = &#x159;
211 * \uD835\uDFED = &#x1d7ed;
212 * </pre>
213 *
214 * @param text The String to escape, may be null.
215 * @param xmlMode <code>true</code> to replace also ' to &apos, <code>false</code> to replace non-ascii
216 * characters by numeric characters references.
217 * @return The escaped text or the empty string if text == null.
218 * @since 1.1
219 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
220 * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
221 */
222 public static String escapeHTML(final String text, final boolean xmlMode) {
223 if (text == null) {
224 return "";
225 }
226
227 int length = text.length();
228 StringBuilder buffer = new StringBuilder(length);
229
230 for (int i = 0; i < length; ++i) {
231 char c = text.charAt(i);
232 switch (c) {
233 case '<':
234 buffer.append("<");
235 break;
236 case '>':
237 buffer.append(">");
238 break;
239 case '&':
240 buffer.append("&");
241 break;
242 case '\"':
243 buffer.append(""");
244 break;
245 default:
246 if (xmlMode) {
247 if (c == '\'') {
248 buffer.append("'");
249 } else {
250 buffer.append(c);
251 }
252 } else {
253 if (c <= ASCII) {
254 // ASCII.
255 buffer.append(c);
256 } else {
257 buffer.append("&#x");
258 if (isHighSurrogate(c)) {
259 buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i))));
260 } else {
261 buffer.append(Integer.toHexString(c));
262 }
263 buffer.append(';');
264 }
265 }
266 }
267 }
268
269 return buffer.toString();
270 }
271
272 /**
273 * Unescapes HTML entities in a string in non xml mode.
274 *
275 * @param text the <code>String</code> to unescape, may be null.
276 * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
277 * @since 1.1.1.
278 * @see #unescapeHTML(String, boolean)
279 */
280 public static String unescapeHTML(String text) {
281 return unescapeHTML(text, false);
282 }
283
284 /**
285 * Unescapes HTML entities in a string.
286 *
287 * <p> Unescapes a string containing entity escapes to a string
288 * containing the actual Unicode characters corresponding to the
289 * escapes. Supports HTML 4.0 entities.</p>
290 *
291 * <p>For example, the string "&lt;Fran&ccedil;ais&gt;"
292 * will become "<Français>".</p>
293 *
294 * <b>Note</b>: all unicode entities are decoded, i.e.:
295 * <pre>
296 * &#x159; = \u0159
297 * &#x1d7ed; = \uD835\uDFED
298 * </pre>
299 *
300 * @param text the <code>String</code> to unescape, may be null.
301 * @param xmlMode set to <code>true</code> to replace &apos by '.
302 * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
303 * @since 1.1.1.
304 */
305 public static String unescapeHTML(String text, boolean xmlMode) {
306 if (text == null) {
307 return null;
308 }
309
310 String unescaped;
311 if (xmlMode) {
312 unescaped = HtmlEntityUtils.unescapeXml(text);
313 } else {
314 // HtmlEntityUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
315 unescaped = HtmlEntityUtils.unescapeHtml4(text);
316 }
317
318 String tmp = unescaped;
319 List<String> entities = new ArrayList<>();
320 while (true) {
321 int i = tmp.indexOf("&#x");
322 if (i == -1) {
323 break;
324 }
325
326 tmp = tmp.substring(i + 3);
327 if (tmp.indexOf(';') != -1) {
328 String entity = tmp.substring(0, tmp.indexOf(';'));
329 try {
330 Integer.parseInt(entity, 16);
331 entities.add(entity);
332 } catch (NumberFormatException e) {
333 // nop
334 }
335 }
336 }
337
338 for (String entity : entities) {
339 int codePoint = Integer.parseInt(entity, 16);
340 unescaped = DoxiaStringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint)));
341 }
342
343 return unescaped;
344 }
345
346 /**
347 * Encode an url
348 *
349 * @param url the String to encode, may be null
350 * @return the text encoded, null if null String input
351 */
352 public static String encodeURL(String url) {
353 if (url == null) {
354 return null;
355 }
356
357 StringBuilder encoded = new StringBuilder();
358 int length = url.length();
359
360 char[] unicode = new char[1];
361
362 for (int i = 0; i < length; ++i) {
363 char c = url.charAt(i);
364
365 switch (c) {
366 case ';':
367 case '/':
368 case '?':
369 case ':':
370 case '@':
371 case '&':
372 case '=':
373 case '+':
374 case '$':
375 case ',':
376 case '[':
377 case ']': // RFC 2732 (IPV6)
378 case '-':
379 case '_':
380 case '.':
381 case '!':
382 case '~':
383 case '*':
384 case '\'':
385 case '(':
386 case ')':
387 case '#': // XLink mark
388 encoded.append(c);
389 break;
390 default:
391 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
392 encoded.append(c);
393 } else {
394 byte[] bytes;
395
396 if (isHighSurrogate(c)) {
397 int codePoint = toCodePoint(c, url.charAt(++i));
398 unicode = toChars(codePoint);
399 bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8);
400 } else {
401 unicode[0] = c;
402 bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8);
403 }
404
405 for (byte aByte : bytes) {
406 encoded.append('%');
407 encoded.append(String.format("%02X", aByte));
408 }
409 }
410 }
411 }
412
413 return encoded.toString();
414 }
415
416 private HtmlTools() {
417 // utility class
418 }
419
420 //
421 // Imported code from ASF Harmony project rev 770909
422 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
423 //
424
425 private static final char LUNATE_SIGMA = 0x3FF;
426 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
427 private static final char LOW_SURROGATE = 0xDC00;
428
429 private static int toCodePoint(char high, char low) {
430 // See RFC 2781, Section 2.2
431 // http://www.faqs.org/rfcs/rfc2781.html
432 int h = (high & LUNATE_SIGMA) << 10;
433 int l = low & LUNATE_SIGMA;
434 return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT;
435 }
436
437 private static final char MIN_HIGH_SURROGATE = '\uD800';
438 private static final char MAX_HIGH_SURROGATE = '\uDBFF';
439
440 private static boolean isHighSurrogate(char ch) {
441 return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch);
442 }
443
444 private static final int MIN_CODE_POINT = 0x000000;
445 private static final int MAX_CODE_POINT = 0x10FFFF;
446 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
447
448 private static boolean isValidCodePoint(int codePoint) {
449 return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
450 }
451
452 private static boolean isSupplementaryCodePoint(int codePoint) {
453 return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
454 }
455
456 /**
457 * Converts the given code point to an equivalent character array.
458 *
459 * @param codePoint the code point to convert.
460 * @return If codePoint is a supplementary code point, returns a character array of length 2,
461 * otherwise a character array of length 1 containing only the original int as a char.
462 */
463 public static char[] toChars(int codePoint) {
464 if (!isValidCodePoint(codePoint)) {
465 throw new IllegalArgumentException("Code point " + codePoint + " is not valid");
466 }
467
468 if (isSupplementaryCodePoint(codePoint)) {
469 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
470 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA);
471 int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA);
472 return new char[] {(char) high, (char) low};
473 }
474 return new char[] {(char) codePoint};
475 }
476 }