1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.codehaus.plexus.util.xml;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.URL;
28 import java.net.URLConnection;
29 import java.nio.file.Files;
30 import java.net.HttpURLConnection;
31 import java.util.Locale;
32 import java.util.regex.Pattern;
33 import java.util.regex.Matcher;
34 import java.text.MessageFormat;
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60 @Deprecated
61 public class XmlReader
62 extends Reader
63 {
64 private static final int BUFFER_SIZE = 4096;
65
66 private static final String UTF_8 = "UTF-8";
67
68 private static final String US_ASCII = "US-ASCII";
69
70 private static final String UTF_16BE = "UTF-16BE";
71
72 private static final String UTF_16LE = "UTF-16LE";
73
74 private static final String UTF_16 = "UTF-16";
75
76 private static final String EBCDIC = "CP1047";
77
78 private static String _staticDefaultEncoding = null;
79
80 private Reader _reader;
81
82 private String _encoding;
83
84 private String _defaultEncoding;
85
86
87
88
89
90
91
92
93
94
95
96 public static void setDefaultEncoding( String encoding )
97 {
98 _staticDefaultEncoding = encoding;
99 }
100
101
102
103
104
105
106
107
108
109 public static String getDefaultEncoding()
110 {
111 return _staticDefaultEncoding;
112 }
113
114
115
116
117
118
119
120
121
122
123
124
125
126 public XmlReader( File file )
127 throws IOException
128 {
129 this( Files.newInputStream( file.toPath() ) );
130 }
131
132
133
134
135
136
137
138
139
140
141
142
/**
 * Creates a reader over a raw input stream using lenient encoding detection.
 * <p>
 * Equivalent to {@code new XmlReader( is, true )}.
 *
 * @param is the stream containing the XML document
 * @throws IOException if the stream cannot be read or its encoding cannot be determined
 */
public XmlReader( InputStream is )
    throws IOException
{
    // "true" selects lenient detection.
    this( is, true );
}
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/**
 * Creates a reader over a raw input stream, detecting the encoding from the BOM, the leading
 * byte pattern and the XML prolog.
 *
 * @param is the stream containing the XML document
 * @param lenient when {@code true}, a detection failure is recovered from by using the encoding
 *            information carried by the failure; when {@code false}, the failure is rethrown
 * @throws IOException if the stream cannot be read
 * @throws XmlStreamReaderException if detection fails and {@code lenient} is {@code false}
 */
public XmlReader( InputStream is, boolean lenient )
    throws IOException, XmlStreamReaderException
{
    _defaultEncoding = _staticDefaultEncoding;
    try
    {
        doRawStream( is, lenient );
    }
    catch ( XmlStreamReaderException detectionFailure )
    {
        if ( lenient )
        {
            doLenientDetection( null, detectionFailure );
            return;
        }
        throw detectionFailure;
    }
}
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
/**
 * Creates a reader for the XML document at the given URL.
 * <p>
 * Opens a connection and delegates to {@link #XmlReader(URLConnection)}.
 *
 * @param url location of the XML document
 * @throws IOException if the connection cannot be opened or read, or the encoding cannot be determined
 */
public XmlReader( URL url )
    throws IOException
{
    // Detection rules depend on the connection type and are chosen by the delegate constructor.
    this( url.openConnection() );
}
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230 public XmlReader( URLConnection conn )
231 throws IOException
232 {
233 _defaultEncoding = _staticDefaultEncoding;
234 boolean lenient = true;
235 if ( conn instanceof HttpURLConnection )
236 {
237 try
238 {
239 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
240 }
241 catch ( XmlStreamReaderException ex )
242 {
243 doLenientDetection( conn.getContentType(), ex );
244 }
245 }
246 else if ( conn.getContentType() != null )
247 {
248 try
249 {
250 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
251 }
252 catch ( XmlStreamReaderException ex )
253 {
254 doLenientDetection( conn.getContentType(), ex );
255 }
256 }
257 else
258 {
259 try
260 {
261 doRawStream( conn.getInputStream(), lenient );
262 }
263 catch ( XmlStreamReaderException ex )
264 {
265 doLenientDetection( null, ex );
266 }
267 }
268 }
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/**
 * Creates a reader over a stream that was delivered with an HTTP Content-Type header, using
 * lenient encoding detection.
 * <p>
 * Equivalent to {@code new XmlReader( is, httpContentType, true )}.
 *
 * @param is the stream containing the XML document
 * @param httpContentType the Content-Type header value, may be {@code null}
 * @throws IOException if the stream cannot be read or the encoding cannot be determined
 */
public XmlReader( InputStream is, String httpContentType )
    throws IOException
{
    // "true" selects lenient detection.
    this( is, httpContentType, true );
}
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
/**
 * Creates a reader over a stream delivered with an HTTP Content-Type header, combining the
 * header with the BOM, leading byte pattern and XML prolog to determine the encoding.
 *
 * @param is the stream containing the XML document
 * @param httpContentType the Content-Type header value, may be {@code null}
 * @param lenient when {@code true}, a detection failure is recovered from by the lenient
 *            fallback; when {@code false}, the failure is rethrown
 * @param defaultEncoding fallback encoding for this instance; {@code null} selects the
 *            class-wide default
 * @throws IOException if the stream cannot be read
 * @throws XmlStreamReaderException if detection fails and {@code lenient} is {@code false}
 */
public XmlReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding )
    throws IOException, XmlStreamReaderException
{
    if ( defaultEncoding != null )
    {
        _defaultEncoding = defaultEncoding;
    }
    else
    {
        _defaultEncoding = _staticDefaultEncoding;
    }

    try
    {
        doHttpStream( is, httpContentType, lenient );
    }
    catch ( XmlStreamReaderException detectionFailure )
    {
        if ( lenient )
        {
            doLenientDetection( httpContentType, detectionFailure );
            return;
        }
        throw detectionFailure;
    }
}
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
/**
 * Creates a reader over a stream delivered with an HTTP Content-Type header, using the
 * class-wide default encoding as fallback.
 * <p>
 * Equivalent to {@code new XmlReader( is, httpContentType, lenient, null )}.
 *
 * @param is the stream containing the XML document
 * @param httpContentType the Content-Type header value, may be {@code null}
 * @param lenient whether detection failures are recovered from instead of being thrown
 * @throws IOException if the stream cannot be read
 * @throws XmlStreamReaderException if detection fails and {@code lenient} is {@code false}
 */
public XmlReader( InputStream is, String httpContentType, boolean lenient )
    throws IOException, XmlStreamReaderException
{
    // A null default encoding makes the delegate use the class-wide default.
    this( is, httpContentType, lenient, null );
}
373
374 private void doLenientDetection( String httpContentType, XmlStreamReaderException ex )
375 throws IOException
376 {
377 if ( httpContentType != null )
378 {
379 if ( httpContentType.startsWith( "text/html" ) )
380 {
381 httpContentType = httpContentType.substring( "text/html".length() );
382 httpContentType = "text/xml" + httpContentType;
383 try
384 {
385 doHttpStream( ex.getInputStream(), httpContentType, true );
386 ex = null;
387 }
388 catch ( XmlStreamReaderException ex2 )
389 {
390 ex = ex2;
391 }
392 }
393 }
394 if ( ex != null )
395 {
396 String encoding = ex.getXmlEncoding();
397 if ( encoding == null )
398 {
399 encoding = ex.getContentTypeEncoding();
400 }
401 if ( encoding == null )
402 {
403 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
404 }
405 prepareReader( ex.getInputStream(), encoding );
406 }
407 }
408
409
410
411
412
413
414
415 public String getEncoding()
416 {
417 return _encoding;
418 }
419
420 @Override
421 public int read( char[] buf, int offset, int len )
422 throws IOException
423 {
424 return _reader.read( buf, offset, len );
425 }
426
427
428
429
430
431
432
433 @Override
434 public void close()
435 throws IOException
436 {
437 _reader.close();
438 }
439
440 private void doRawStream( InputStream is, boolean lenient )
441 throws IOException
442 {
443 BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
444 String bomEnc = getBOMEncoding( pis );
445 String xmlGuessEnc = getXMLGuessEncoding( pis );
446 String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
447 String encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, pis );
448 prepareReader( pis, encoding );
449 }
450
451 private void doHttpStream( InputStream is, String httpContentType, boolean lenient )
452 throws IOException
453 {
454 BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
455 String cTMime = getContentTypeMime( httpContentType );
456 String cTEnc = getContentTypeEncoding( httpContentType );
457 String bomEnc = getBOMEncoding( pis );
458 String xmlGuessEnc = getXMLGuessEncoding( pis );
459 String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
460 String encoding = calculateHttpEncoding( cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient );
461 prepareReader( pis, encoding );
462 }
463
464 private void prepareReader( InputStream is, String encoding )
465 throws IOException
466 {
467 _reader = new InputStreamReader( is, encoding );
468 _encoding = encoding;
469 }
470
471
472 private String calculateRawEncoding( String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is )
473 throws IOException
474 {
475 String encoding;
476 if ( bomEnc == null )
477 {
478 if ( xmlGuessEnc == null || xmlEnc == null )
479 {
480 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
481 }
482 else if ( xmlEnc.equals( UTF_16 ) && ( xmlGuessEnc.equals( UTF_16BE ) || xmlGuessEnc.equals( UTF_16LE ) ) )
483 {
484 encoding = xmlGuessEnc;
485 }
486 else
487 {
488 encoding = xmlEnc;
489 }
490 }
491 else if ( bomEnc.equals( UTF_8 ) )
492 {
493 if ( xmlGuessEnc != null && !xmlGuessEnc.equals( UTF_8 ) )
494 {
495 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
496 bomEnc, xmlGuessEnc, xmlEnc, is );
497 }
498 if ( xmlEnc != null && !xmlEnc.equals( UTF_8 ) )
499 {
500 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
501 bomEnc, xmlGuessEnc, xmlEnc, is );
502 }
503 encoding = UTF_8;
504 }
505 else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
506 {
507 if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
508 {
509 throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
510 }
511 if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
512 {
513 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ),
514 bomEnc, xmlGuessEnc, xmlEnc, is );
515 }
516 encoding = bomEnc;
517 }
518 else
519 {
520 throw new XmlStreamReaderException( RAW_EX_2.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
521 xmlGuessEnc, xmlEnc, is );
522 }
523 return encoding;
524 }
525
526
527 private String calculateHttpEncoding( String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc, String xmlEnc,
528 InputStream is, boolean lenient )
529 throws IOException
530 {
531 String encoding;
532 if ( lenient & xmlEnc != null )
533 {
534 encoding = xmlEnc;
535 }
536 else
537 {
538 boolean appXml = isAppXml( cTMime );
539 boolean textXml = isTextXml( cTMime );
540 if ( appXml || textXml )
541 {
542 if ( cTEnc == null )
543 {
544 if ( appXml )
545 {
546 encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, is );
547 }
548 else
549 {
550 encoding = ( _defaultEncoding == null ) ? US_ASCII : _defaultEncoding;
551 }
552 }
553 else if ( bomEnc != null && ( cTEnc.equals( UTF_16BE ) || cTEnc.equals( UTF_16LE ) ) )
554 {
555 throw new XmlStreamReaderException( HTTP_EX_1.format( new Object[] { cTMime, cTEnc, bomEnc,
556 xmlGuessEnc, xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
557 }
558 else if ( cTEnc.equals( UTF_16 ) )
559 {
560 if ( bomEnc != null && bomEnc.startsWith( UTF_16 ) )
561 {
562 encoding = bomEnc;
563 }
564 else
565 {
566 throw new XmlStreamReaderException( HTTP_EX_2.format( new Object[] { cTMime, cTEnc, bomEnc,
567 xmlGuessEnc, xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
568 }
569 }
570 else
571 {
572 encoding = cTEnc;
573 }
574 }
575 else
576 {
577 throw new XmlStreamReaderException( HTTP_EX_3.format( new Object[] { cTMime, cTEnc, bomEnc, xmlGuessEnc,
578 xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
579 }
580 }
581 return encoding;
582 }
583
584
585 private static String getContentTypeMime( String httpContentType )
586 {
587 String mime = null;
588 if ( httpContentType != null )
589 {
590 int i = httpContentType.indexOf( ";" );
591 mime = ( ( i == -1 ) ? httpContentType : httpContentType.substring( 0, i ) ).trim();
592 }
593 return mime;
594 }
595
596 private static final Pattern CHARSET_PATTERN = Pattern.compile( "charset=([.[^; ]]*)" );
597
598
599 private static String getContentTypeEncoding( String httpContentType )
600 {
601 String encoding = null;
602 if ( httpContentType != null )
603 {
604 int i = httpContentType.indexOf( ";" );
605 if ( i > -1 )
606 {
607 String postMime = httpContentType.substring( i + 1 );
608 Matcher m = CHARSET_PATTERN.matcher( postMime );
609 encoding = ( m.find() ) ? m.group( 1 ) : null;
610 encoding = ( encoding != null ) ? encoding.toUpperCase( Locale.ENGLISH ) : null;
611 }
612 }
613 return encoding;
614 }
615
616
617
618 private static String getBOMEncoding( BufferedInputStream is )
619 throws IOException
620 {
621 String encoding = null;
622 int[] bytes = new int[3];
623 is.mark( 3 );
624 bytes[0] = is.read();
625 bytes[1] = is.read();
626 bytes[2] = is.read();
627
628 if ( bytes[0] == 0xFE && bytes[1] == 0xFF )
629 {
630 encoding = UTF_16BE;
631 is.reset();
632 is.read();
633 is.read();
634 }
635 else if ( bytes[0] == 0xFF && bytes[1] == 0xFE )
636 {
637 encoding = UTF_16LE;
638 is.reset();
639 is.read();
640 is.read();
641 }
642 else if ( bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF )
643 {
644 encoding = UTF_8;
645 }
646 else
647 {
648 is.reset();
649 }
650 return encoding;
651 }
652
653
654 private static String getXMLGuessEncoding( BufferedInputStream is )
655 throws IOException
656 {
657 String encoding = null;
658 int[] bytes = new int[4];
659 is.mark( 4 );
660 bytes[0] = is.read();
661 bytes[1] = is.read();
662 bytes[2] = is.read();
663 bytes[3] = is.read();
664 is.reset();
665
666 if ( bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F )
667 {
668 encoding = UTF_16BE;
669 }
670 else if ( bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00 )
671 {
672 encoding = UTF_16LE;
673 }
674 else if ( bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D )
675 {
676 encoding = UTF_8;
677 }
678 else if ( bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94 )
679 {
680 encoding = EBCDIC;
681 }
682 return encoding;
683 }
684
685 static final Pattern ENCODING_PATTERN =
686 Pattern.compile( "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE );
687
688
689 private static String getXmlProlog( BufferedInputStream is, String guessedEnc )
690 throws IOException
691 {
692 String encoding = null;
693 if ( guessedEnc != null )
694 {
695 byte[] bytes = new byte[BUFFER_SIZE];
696 is.mark( BUFFER_SIZE );
697 int offset = 0;
698 int max = BUFFER_SIZE;
699 int c = is.read( bytes, offset, max );
700 int firstGT = -1;
701 String xmlProlog = null;
702 while ( c != -1 && firstGT == -1 && offset < BUFFER_SIZE )
703 {
704 offset += c;
705 max -= c;
706 c = is.read( bytes, offset, max );
707 xmlProlog = new String( bytes, 0, offset, guessedEnc );
708 firstGT = xmlProlog.indexOf( '>' );
709 }
710 if ( firstGT == -1 )
711 {
712 if ( c == -1 )
713 {
714 throw new IOException( "Unexpected end of XML stream" );
715 }
716 else
717 {
718 throw new IOException( "XML prolog or ROOT element not found on first " + offset + " bytes" );
719 }
720 }
721 int bytesRead = offset;
722 if ( bytesRead > 0 )
723 {
724 is.reset();
725 BufferedReader bReader =
726 new BufferedReader( new StringReader( xmlProlog.substring( 0, firstGT + 1 ) ) );
727 StringBuilder prolog = new StringBuilder();
728 String line = bReader.readLine();
729 while ( line != null )
730 {
731 prolog.append( line );
732 line = bReader.readLine();
733 }
734 Matcher m = ENCODING_PATTERN.matcher( prolog );
735 if ( m.find() )
736 {
737 encoding = m.group( 1 ).toUpperCase( Locale.ENGLISH );
738 encoding = encoding.substring( 1, encoding.length() - 1 );
739 }
740 }
741 }
742 return encoding;
743 }
744
745
746 private static boolean isAppXml( String mime )
747 {
748 return mime != null && ( mime.equals( "application/xml" ) || mime.equals( "application/xml-dtd" )
749 || mime.equals( "application/xml-external-parsed-entity" )
750 || ( mime.startsWith( "application/" ) && mime.endsWith( "+xml" ) ) );
751 }
752
753
754 private static boolean isTextXml( String mime )
755 {
756 return mime != null && ( mime.equals( "text/xml" ) || mime.equals( "text/xml-external-parsed-entity" )
757 || ( mime.startsWith( "text/" ) && mime.endsWith( "+xml" ) ) );
758 }
759
760 private static final MessageFormat RAW_EX_1 =
761 new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch" );
762
763 private static final MessageFormat RAW_EX_2 =
764 new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM" );
765
766 private static final MessageFormat HTTP_EX_1 =
767 new MessageFormat( "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL" );
768
769 private static final MessageFormat HTTP_EX_2 =
770 new MessageFormat( "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch" );
771
772 private static final MessageFormat HTTP_EX_3 =
773 new MessageFormat( "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME" );
774
775 }