1 package org.apache.maven.doxia.util;
2
3 /*
4 * Licensed to the Apache Software Foundation (ASF) under one
5 * or more contributor license agreements. See the NOTICE file
6 * distributed with this work for additional information
7 * regarding copyright ownership. The ASF licenses this file
8 * to you under the Apache License, Version 2.0 (the
9 * "License"); you may not use this file except in compliance
10 * with the License. You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing,
15 * software distributed under the License is distributed on an
16 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 * KIND, either express or implied. See the License for the
18 * specific language governing permissions and limitations
19 * under the License.
20 */
21
22 import java.io.UnsupportedEncodingException;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27
28 import javax.swing.text.html.HTML.Tag;
29
30 import org.apache.commons.lang.StringEscapeUtils;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.codehaus.plexus.util.StringUtils;
33
34 /**
35 * The <code>HtmlTools</code> class defines methods to HTML handling.
36 *
37 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
38 * @version $Id: HtmlTools.java 1185112 2011-10-17 11:33:00Z ltheussl $
39 * @since 1.0
40 */
41 public class HtmlTools
42 {
43 private static final Tag[] ALL_TAGS =
44 {
45 HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
46 HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
47 HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
48 HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
49 HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
50 HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
51 HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
52 HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
53 HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
54 HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
55 HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
56 HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
57 HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
58 HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
59 HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
60 HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
61 HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
62 HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
63 };
64
65 private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );
66
67 private static final int ASCII = 0x7E;
68
69 static
70 {
71 for ( Tag tag : ALL_TAGS )
72 {
73 TAG_MAP.put( tag.toString(), tag );
74 }
75 }
76
77 /**
78 * Returns a tag for a defined HTML tag name. This is one of
79 * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
80 * If the given name does not represent one of the defined tags, then
81 * <code>null</code> will be returned.
82 *
83 * @param tagName the <code>String</code> name requested.
84 * @return a tag constant corresponding to the <code>tagName</code>,
85 * or <code>null</code> if not found.
86 * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
87 * @since 1.1
88 */
89 public static Tag getHtmlTag( String tagName )
90 {
91 Object t = TAG_MAP.get( tagName );
92
93 return (Tag) t;
94 }
95
96 /**
97 * Escape special HTML characters in a String in <code>xml</code> mode.
98 *
99 * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
100 *
101 * @param text the String to escape, may be null.
102 * @return The escaped text or the empty string if text == null.
103 * @see #escapeHTML(String,boolean)
104 */
105 public static String escapeHTML( String text )
106 {
107 return escapeHTML( text, true );
108 }
109
110 /**
111 * Escape special HTML characters in a String.
112 *
113 * <pre>
114 * < becomes <code>&lt;</code>
115 * > becomes <code>&gt;</code>
116 * & becomes <code>&amp;</code>
117 * " becomes <code>&quot;</code>
118 * ' becomes <code>&apos;</code> if xmlMode = true
119 * </pre>
120 *
121 * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
122 * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
123 *
124 * <b>Note</b>: all characters are encoded, i.e.:
125 * <pre>
126 * \u0159 = &#x159;
127 * \uD835\uDFED = &#x1d7ed;
128 * </pre>
129 *
130 * @param text The String to escape, may be null.
131 * @param xmlMode <code>true</code> to replace also ' to &apos, <code>false</code> to replace non-ascii
132 * characters by numeric characters references.
133 * @return The escaped text or the empty string if text == null.
134 * @since 1.1
135 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
136 * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
137 */
138 public static String escapeHTML( final String text, final boolean xmlMode )
139 {
140 if ( text == null )
141 {
142 return "";
143 }
144
145 int length = text.length();
146 StringBuilder buffer = new StringBuilder( length );
147
148 for ( int i = 0; i < length; ++i )
149 {
150 char c = text.charAt( i );
151 switch ( c )
152 {
153 case '<':
154 buffer.append( "<" );
155 break;
156 case '>':
157 buffer.append( ">" );
158 break;
159 case '&':
160 buffer.append( "&" );
161 break;
162 case '\"':
163 buffer.append( """ );
164 break;
165 default:
166 if ( xmlMode )
167 {
168 if ( c == '\'' )
169 {
170 buffer.append( "'" );
171 }
172 else
173 {
174 buffer.append( c );
175 }
176 }
177 else
178 {
179 if ( c <= ASCII )
180 {
181 // ASCII.
182 buffer.append( c );
183 }
184 else
185 {
186 buffer.append( "&#x" );
187 if ( isHighSurrogate( c ) )
188 {
189 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
190 }
191 else
192 {
193 buffer.append( Integer.toHexString( c ) );
194 }
195 buffer.append( ';' );
196 }
197 }
198 }
199 }
200
201 return buffer.toString();
202 }
203
204 /**
205 * Unescapes HTML entities in a string in non xml mode.
206 *
207 * @param text the <code>String</code> to unescape, may be null.
208 * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
209 * @since 1.1.1.
210 * @see #unescapeHTML(String, boolean)
211 */
212 public static String unescapeHTML( String text )
213 {
214 return unescapeHTML( text, false );
215 }
216
217 /**
218 * Unescapes HTML entities in a string.
219 *
220 * <p> Unescapes a string containing entity escapes to a string
221 * containing the actual Unicode characters corresponding to the
222 * escapes. Supports HTML 4.0 entities.</p>
223 *
224 * <p>For example, the string "&lt;Fran&ccedil;ais&gt;"
225 * will become "<Français>".</p>
226 *
227 * <b>Note</b>: all unicode entities are decoded, i.e.:
228 * <pre>
229 * &#x159; = \u0159
230 * &#x1d7ed; = \uD835\uDFED
231 * </pre>
232 *
233 * @param text the <code>String</code> to unescape, may be null.
234 * @param xmlMode set to <code>true</code> to replace &apos by '.
235 * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
236 * @since 1.1.1.
237 */
238 public static String unescapeHTML( String text, boolean xmlMode )
239 {
240 if ( text == null )
241 {
242 return null;
243 }
244
245 String unescaped;
246 if ( xmlMode )
247 {
248 unescaped = StringEscapeUtils.unescapeXml( text );
249 }
250 else
251 {
252 // StringEscapeUtils.unescapeHtml returns entities it doesn't recognize unchanged
253 unescaped = StringEscapeUtils.unescapeHtml( text );
254 }
255
256 String tmp = unescaped;
257 List<String> entities = new ArrayList<String>();
258 while ( true )
259 {
260 int i = tmp.indexOf( "&#x" );
261 if ( i == -1 )
262 {
263 break;
264 }
265
266 tmp = tmp.substring( i + 3 );
267 if ( tmp.indexOf( ';' ) != -1 )
268 {
269 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
270 try
271 {
272 Integer.parseInt( entity, 16 );
273 entities.add( entity );
274 }
275 catch ( NumberFormatException e )
276 {
277 // nop
278 }
279 }
280 }
281
282 for ( String entity : entities )
283 {
284 int codePoint = Integer.parseInt( entity, 16 );
285 unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
286 }
287
288 return unescaped;
289 }
290
291 /**
292 * Encode an url
293 *
294 * @param url the String to encode, may be null
295 * @return the text encoded, null if null String input
296 */
297 public static String encodeURL( String url )
298 {
299 if ( url == null )
300 {
301 return null;
302 }
303
304 StringBuilder encoded = new StringBuilder();
305 int length = url.length();
306
307 char[] unicode = new char[1];
308
309 for ( int i = 0; i < length; ++i )
310 {
311 char c = url.charAt( i );
312
313 switch ( c )
314 {
315 case ';':
316 case '/':
317 case '?':
318 case ':':
319 case '@':
320 case '&':
321 case '=':
322 case '+':
323 case '$':
324 case ',':
325 case '[':
326 case ']': // RFC 2732 (IPV6)
327 case '-':
328 case '_':
329 case '.':
330 case '!':
331 case '~':
332 case '*':
333 case '\'':
334 case '(':
335 case ')':
336 case '#': // XLink mark
337 encoded.append( c );
338 break;
339 default:
340 if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
341 {
342 encoded.append( c );
343 }
344 else
345 {
346 byte[] bytes;
347
348 try
349 {
350 if ( isHighSurrogate( c ) )
351 {
352 int codePoint = toCodePoint( c, url.charAt( ++i ) );
353 unicode = toChars( codePoint );
354 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
355 }
356 else
357 {
358 unicode[0] = c;
359 bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
360 }
361 }
362 catch ( UnsupportedEncodingException cannotHappen )
363 {
364 bytes = new byte[0];
365 }
366
367 for ( int j = 0; j < bytes.length; ++j )
368 {
369 String hex = DoxiaUtils.byteToHex( bytes[j] );
370
371 encoded.append( '%' );
372 if ( hex.length() == 1 )
373 {
374 encoded.append( '0' );
375 }
376 encoded.append( hex );
377 }
378 }
379 }
380 }
381
382 return encoded.toString();
383 }
384
385 /**
386 * Construct a valid id.
387 *
388 * <p>
389 * <b>Note</b>: this method is identical to
390 * {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, true)},
391 * the rules to encode an id are laid out there.
392 * </p>
393 *
394 * @param id The id to be encoded.
395 * @return The trimmed and encoded id, or null if id is null.
396 * @see DoxiaUtils#encodeId(java.lang.String,boolean)
397 */
398 public static String encodeId( String id )
399 {
400 return DoxiaUtils.encodeId( id, true );
401 }
402
403 /**
404 * Determines if the specified text is a valid id according to the rules
405 * laid out in {@link #encodeId(String)}.
406 *
407 * @param text The text to be tested.
408 * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
409 * @see #encodeId(String).
410 */
411 public static boolean isId( String text )
412 {
413 return DoxiaUtils.isValidId( text );
414 }
415
416 private HtmlTools()
417 {
418 // utility class
419 }
420
421 //
422 // Imported code from ASF Harmony project rev 770909
423 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
424 //
425
426 private static final char LUNATE_SIGMA = 0x3FF;
427 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
428 private static final char LOW_SURROGATE = 0xDC00;
429
430 private static int toCodePoint( char high, char low )
431 {
432 // See RFC 2781, Section 2.2
433 // http://www.faqs.org/rfcs/rfc2781.html
434 int h = ( high & LUNATE_SIGMA ) << 10;
435 int l = low & LUNATE_SIGMA;
436 return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
437 }
438
439 private static final char MIN_HIGH_SURROGATE = '\uD800';
440 private static final char MAX_HIGH_SURROGATE = '\uDBFF';
441
442 private static boolean isHighSurrogate( char ch )
443 {
444 return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
445 }
446
447 private static final int MIN_CODE_POINT = 0x000000;
448 private static final int MAX_CODE_POINT = 0x10FFFF;
449 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
450
451 private static boolean isValidCodePoint( int codePoint )
452 {
453 return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
454 }
455
456 private static boolean isSupplementaryCodePoint( int codePoint )
457 {
458 return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
459 }
460
461 /**
462 * Converts the given code point to an equivalent character array.
463 *
464 * @param codePoint the code point to convert.
465 * @return If codePoint is a supplementary code point, returns a character array of length 2,
466 * otherwise a character array of length 1 containing only the original int as a char.
467 */
468 public static char[] toChars( int codePoint )
469 {
470 if ( !isValidCodePoint( codePoint ) )
471 {
472 throw new IllegalArgumentException();
473 }
474
475 if ( isSupplementaryCodePoint( codePoint ) )
476 {
477 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
478 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
479 int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
480 return new char[] { (char) high, (char) low };
481 }
482 return new char[] { (char) codePoint };
483 }
484 }