1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package org.apache.maven.doxia.util;
20
21 import javax.swing.text.html.HTML.Tag;
22
23 import java.nio.charset.StandardCharsets;
24 import java.util.ArrayList;
25 import java.util.HashMap;
26 import java.util.List;
27 import java.util.Map;
28
29 import org.apache.commons.lang3.StringUtils;
30 import org.apache.commons.text.StringEscapeUtils;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32
33
34
35
36
37
38
39 public class HtmlTools {
40 private static final Tag[] ALL_TAGS = {
41 HtmlMarkup.A,
42 HtmlMarkup.ABBR,
43 HtmlMarkup.ADDRESS,
44 HtmlMarkup.AREA,
45 HtmlMarkup.ARTICLE,
46 HtmlMarkup.ASIDE,
47 HtmlMarkup.AUDIO,
48 HtmlMarkup.B,
49 HtmlMarkup.BASE,
50 HtmlMarkup.BDI,
51 HtmlMarkup.BDO,
52 HtmlMarkup.BLOCKQUOTE,
53 HtmlMarkup.BODY,
54 HtmlMarkup.BR,
55 HtmlMarkup.BUTTON,
56 HtmlMarkup.CANVAS,
57 HtmlMarkup.CAPTION,
58 HtmlMarkup.CITE,
59 HtmlMarkup.CODE,
60 HtmlMarkup.COL,
61 HtmlMarkup.COLGROUP,
62 HtmlMarkup.COMMAND,
63 HtmlMarkup.DATA,
64 HtmlMarkup.DATALIST,
65 HtmlMarkup.DD,
66 HtmlMarkup.DEL,
67 HtmlMarkup.DETAILS,
68 HtmlMarkup.DFN,
69 HtmlMarkup.DIALOG,
70 HtmlMarkup.DIV,
71 HtmlMarkup.DL,
72 HtmlMarkup.DT,
73 HtmlMarkup.EM,
74 HtmlMarkup.EMBED,
75 HtmlMarkup.FIELDSET,
76 HtmlMarkup.FIGCAPTION,
77 HtmlMarkup.FIGURE,
78 HtmlMarkup.FOOTER,
79 HtmlMarkup.FORM,
80 HtmlMarkup.H1,
81 HtmlMarkup.H2,
82 HtmlMarkup.H3,
83 HtmlMarkup.H4,
84 HtmlMarkup.H5,
85 HtmlMarkup.HEAD,
86 HtmlMarkup.HEADER,
87 HtmlMarkup.HGROUP,
88 HtmlMarkup.HR,
89 HtmlMarkup.HTML,
90 HtmlMarkup.I,
91 HtmlMarkup.IFRAME,
92 HtmlMarkup.IMG,
93 HtmlMarkup.INPUT,
94 HtmlMarkup.INS,
95 HtmlMarkup.KBD,
96 HtmlMarkup.KEYGEN,
97 HtmlMarkup.LABEL,
98 HtmlMarkup.LEGEND,
99 HtmlMarkup.LI,
100 HtmlMarkup.LINK,
101 HtmlMarkup.MAIN,
102 HtmlMarkup.MAP,
103 HtmlMarkup.MARK,
104 HtmlMarkup.MENU,
105 HtmlMarkup.MENUITEM,
106 HtmlMarkup.META,
107 HtmlMarkup.METER,
108 HtmlMarkup.NAV,
109 HtmlMarkup.NOSCRIPT,
110 HtmlMarkup.OBJECT,
111 HtmlMarkup.OL,
112 HtmlMarkup.OPTGROUP,
113 HtmlMarkup.OPTION,
114 HtmlMarkup.OUTPUT,
115 HtmlMarkup.P,
116 HtmlMarkup.PARAM,
117 HtmlMarkup.PICTURE,
118 HtmlMarkup.PRE,
119 HtmlMarkup.PROGRESS,
120 HtmlMarkup.Q,
121 HtmlMarkup.RP,
122 HtmlMarkup.RT,
123 HtmlMarkup.RUBY,
124 HtmlMarkup.S,
125 HtmlMarkup.SAMP,
126 HtmlMarkup.SECTION,
127 HtmlMarkup.SCRIPT,
128 HtmlMarkup.SELECT,
129 HtmlMarkup.SMALL,
130 HtmlMarkup.SOURCE,
131 HtmlMarkup.SPAN,
132 HtmlMarkup.STRONG,
133 HtmlMarkup.STYLE,
134 HtmlMarkup.SUB,
135 HtmlMarkup.SUMMARY,
136 HtmlMarkup.SUP,
137 HtmlMarkup.SVG,
138 HtmlMarkup.TABLE,
139 HtmlMarkup.TBODY,
140 HtmlMarkup.TD,
141 HtmlMarkup.TEMPLATE,
142 HtmlMarkup.TEXTAREA,
143 HtmlMarkup.TFOOT,
144 HtmlMarkup.TH,
145 HtmlMarkup.THEAD,
146 HtmlMarkup.TIME,
147 HtmlMarkup.TITLE,
148 HtmlMarkup.TR,
149 HtmlMarkup.TRACK,
150 HtmlMarkup.U,
151 HtmlMarkup.UL,
152 HtmlMarkup.VAR,
153 HtmlMarkup.VIDEO,
154 HtmlMarkup.WBR
155 };
156
157 private static final Map<String, Tag> TAG_MAP = new HashMap<>(ALL_TAGS.length);
158
159 private static final int ASCII = 0x7E;
160
161 static {
162 for (Tag tag : ALL_TAGS) {
163 TAG_MAP.put(tag.toString(), tag);
164 }
165 }
166
167
168
169
170
171
172
173
174
175
176
177
178
179 public static Tag getHtmlTag(String tagName) {
180 return TAG_MAP.get(tagName);
181 }
182
183
184
185
186
187
188
189
190
191
192 public static String escapeHTML(String text) {
193 return escapeHTML(text, true);
194 }
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224 public static String escapeHTML(final String text, final boolean xmlMode) {
225 if (text == null) {
226 return "";
227 }
228
229 int length = text.length();
230 StringBuilder buffer = new StringBuilder(length);
231
232 for (int i = 0; i < length; ++i) {
233 char c = text.charAt(i);
234 switch (c) {
235 case '<':
236 buffer.append("<");
237 break;
238 case '>':
239 buffer.append(">");
240 break;
241 case '&':
242 buffer.append("&");
243 break;
244 case '\"':
245 buffer.append(""");
246 break;
247 default:
248 if (xmlMode) {
249 if (c == '\'') {
250 buffer.append("'");
251 } else {
252 buffer.append(c);
253 }
254 } else {
255 if (c <= ASCII) {
256
257 buffer.append(c);
258 } else {
259 buffer.append("&#x");
260 if (isHighSurrogate(c)) {
261 buffer.append(Integer.toHexString(toCodePoint(c, text.charAt(++i))));
262 } else {
263 buffer.append(Integer.toHexString(c));
264 }
265 buffer.append(';');
266 }
267 }
268 }
269 }
270
271 return buffer.toString();
272 }
273
274
275
276
277
278
279
280
281
282 public static String unescapeHTML(String text) {
283 return unescapeHTML(text, false);
284 }
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307 public static String unescapeHTML(String text, boolean xmlMode) {
308 if (text == null) {
309 return null;
310 }
311
312 String unescaped;
313 if (xmlMode) {
314 unescaped = StringEscapeUtils.unescapeXml(text);
315 } else {
316
317 unescaped = StringEscapeUtils.unescapeHtml4(text);
318 }
319
320 String tmp = unescaped;
321 List<String> entities = new ArrayList<>();
322 while (true) {
323 int i = tmp.indexOf("&#x");
324 if (i == -1) {
325 break;
326 }
327
328 tmp = tmp.substring(i + 3);
329 if (tmp.indexOf(';') != -1) {
330 String entity = tmp.substring(0, tmp.indexOf(';'));
331 try {
332 Integer.parseInt(entity, 16);
333 entities.add(entity);
334 } catch (NumberFormatException e) {
335
336 }
337 }
338 }
339
340 for (String entity : entities) {
341 int codePoint = Integer.parseInt(entity, 16);
342 unescaped = StringUtils.replace(unescaped, "&#x" + entity + ";", new String(toChars(codePoint)));
343 }
344
345 return unescaped;
346 }
347
348
349
350
351
352
353
354 public static String encodeURL(String url) {
355 if (url == null) {
356 return null;
357 }
358
359 StringBuilder encoded = new StringBuilder();
360 int length = url.length();
361
362 char[] unicode = new char[1];
363
364 for (int i = 0; i < length; ++i) {
365 char c = url.charAt(i);
366
367 switch (c) {
368 case ';':
369 case '/':
370 case '?':
371 case ':':
372 case '@':
373 case '&':
374 case '=':
375 case '+':
376 case '$':
377 case ',':
378 case '[':
379 case ']':
380 case '-':
381 case '_':
382 case '.':
383 case '!':
384 case '~':
385 case '*':
386 case '\'':
387 case '(':
388 case ')':
389 case '#':
390 encoded.append(c);
391 break;
392 default:
393 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) {
394 encoded.append(c);
395 } else {
396 byte[] bytes;
397
398 if (isHighSurrogate(c)) {
399 int codePoint = toCodePoint(c, url.charAt(++i));
400 unicode = toChars(codePoint);
401 bytes = (new String(unicode, 0, unicode.length)).getBytes(StandardCharsets.UTF_8);
402 } else {
403 unicode[0] = c;
404 bytes = (new String(unicode, 0, 1)).getBytes(StandardCharsets.UTF_8);
405 }
406
407 for (byte aByte : bytes) {
408 encoded.append('%');
409 encoded.append(String.format("%02X", aByte));
410 }
411 }
412 }
413 }
414
415 return encoded.toString();
416 }
417
418 private HtmlTools() {
419
420 }
421
422
423
424
425
426
427 private static final char LUNATE_SIGMA = 0x3FF;
428 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
429 private static final char LOW_SURROGATE = 0xDC00;
430
431 private static int toCodePoint(char high, char low) {
432
433
434 int h = (high & LUNATE_SIGMA) << 10;
435 int l = low & LUNATE_SIGMA;
436 return (h | l) + MIN_SUPPLEMENTARY_CODE_POINT;
437 }
438
439 private static final char MIN_HIGH_SURROGATE = '\uD800';
440 private static final char MAX_HIGH_SURROGATE = '\uDBFF';
441
442 private static boolean isHighSurrogate(char ch) {
443 return (MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch);
444 }
445
446 private static final int MIN_CODE_POINT = 0x000000;
447 private static final int MAX_CODE_POINT = 0x10FFFF;
448 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
449
450 private static boolean isValidCodePoint(int codePoint) {
451 return (MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
452 }
453
454 private static boolean isSupplementaryCodePoint(int codePoint) {
455 return (MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint);
456 }
457
458
459
460
461
462
463
464
465 public static char[] toChars(int codePoint) {
466 if (!isValidCodePoint(codePoint)) {
467 throw new IllegalArgumentException("Code point " + codePoint + " is not valid");
468 }
469
470 if (isSupplementaryCodePoint(codePoint)) {
471 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
472 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ((cpPrime >> 10) & LUNATE_SIGMA);
473 int low = LOW_SURROGATE | (cpPrime & LUNATE_SIGMA);
474 return new char[] {(char) high, (char) low};
475 }
476 return new char[] {(char) codePoint};
477 }
478 }