1 /*
2 * Copyright 2004 Sun Microsystems, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17 package org.codehaus.plexus.util.xml;
18
19 import java.io.File;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.net.URL;
23 import java.net.URLConnection;
24
25 /**
26 * Character stream that handles (or at least attempts to) all the necessary Voodo to figure out the charset encoding of
27 * the XML document within the stream.
28 * <p>
29 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
30 * <p>
31 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the
32 * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
33 * now, XmlReader handles it and things work in all parsers).
34 * <p>
35 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
36 * a wide set of constructors.
37 * <P>
38 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script
39 * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog,
40 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
41 * feed</a>.
42 * <p>
43 *
44 * @author Alejandro Abdelnur
45 * @version revision 1.17 taken on 26/06/2007 from Rome (see
46 * https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
47 * @since 1.4.4
48 */
49 public class XmlStreamReader
50 extends XmlReader
51 {
52 /**
53 * Creates a Reader for a File.
54 * <p>
55 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
56 * UTF-8.
57 * <p>
58 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
59 * <p>
60 *
61 * @param file File to create a Reader from.
62 * @throws IOException thrown if there is a problem reading the file.
63 */
64 public XmlStreamReader( File file )
65 throws IOException
66 {
67 super( file );
68 }
69
70 /**
71 * Creates a Reader for a raw InputStream.
72 * <p>
73 * It follows the same logic used for files.
74 * <p>
75 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
76 * <p>
77 *
78 * @param is InputStream to create a Reader from.
79 * @throws IOException thrown if there is a problem reading the stream.
80 */
81 public XmlStreamReader( InputStream is )
82 throws IOException
83 {
84 super( is );
85 }
86
87 /**
88 * Creates a Reader for a raw InputStream.
89 * <p>
90 * It follows the same logic used for files.
91 * <p>
92 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
93 * following:
94 * <p>
95 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
96 * <p>
97 * Else if the XML prolog had a charset encoding that encoding is used.
98 * <p>
99 * Else if the content type had a charset encoding that encoding is used.
100 * <p>
101 * Else 'UTF-8' is used.
102 * <p>
103 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
104 * <p>
105 *
106 * @param is InputStream to create a Reader from.
107 * @param lenient indicates if the charset encoding detection should be relaxed.
108 * @throws IOException thrown if there is a problem reading the stream.
109 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
110 */
111 public XmlStreamReader( InputStream is, boolean lenient )
112 throws IOException, XmlStreamReaderException
113 {
114 super( is, lenient );
115 }
116
117 /**
118 * Creates a Reader using the InputStream of a URL.
119 * <p>
120 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
121 * used for Files.
122 * <p>
123 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
124 * an InputStream with content-type.
125 * <p>
126 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
127 * <p>
128 *
129 * @param url URL to create a Reader from.
130 * @throws IOException thrown if there is a problem reading the stream of the URL.
131 */
132 public XmlStreamReader( URL url )
133 throws IOException
134 {
135 super( url );
136 }
137
138 /**
139 * Creates a Reader using the InputStream of a URLConnection.
140 * <p>
141 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
142 * it uses the same logic used for files.
143 * <p>
144 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
145 * used for an InputStream with content-type.
146 * <p>
147 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
148 * <p>
149 *
150 * @param conn URLConnection to create a Reader from.
151 * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
152 */
153 public XmlStreamReader( URLConnection conn )
154 throws IOException
155 {
156 super( conn );
157 }
158
159 /**
160 * Creates a Reader using an InputStream an the associated content-type header.
161 * <p>
162 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
163 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
164 * encoding mandated by the content-type MIME type.
165 * <p>
166 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
167 * <p>
168 *
169 * @param is InputStream to create the reader from.
170 * @param httpContentType content-type header to use for the resolution of the charset encoding.
171 * @throws IOException thrown if there is a problem reading the file.
172 */
173 public XmlStreamReader( InputStream is, String httpContentType )
174 throws IOException
175 {
176 super( is, httpContentType );
177 }
178
179 /**
180 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
181 * regarding the encoding detection.
182 * <p>
183 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
184 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
185 * encoding mandated by the content-type MIME type.
186 * <p>
187 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
188 * following:
189 * <p>
190 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
191 * <p>
192 * Else if the XML prolog had a charset encoding that encoding is used.
193 * <p>
194 * Else if the content type had a charset encoding that encoding is used.
195 * <p>
196 * Else 'UTF-8' is used.
197 * <p>
198 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
199 * <p>
200 *
201 * @param is InputStream to create the reader from.
202 * @param httpContentType content-type header to use for the resolution of the charset encoding.
203 * @param lenient indicates if the charset encoding detection should be relaxed.
204 * @param defaultEncoding encoding to use
205 * @throws IOException thrown if there is a problem reading the file.
206 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
207 */
208 public XmlStreamReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding )
209 throws IOException, XmlStreamReaderException
210 {
211 super( is, httpContentType, lenient, defaultEncoding );
212 }
213
214 /**
215 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
216 * regarding the encoding detection.
217 * <p>
218 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
219 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
220 * encoding mandated by the content-type MIME type.
221 * <p>
222 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
223 * following:
224 * <p>
225 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
226 * <p>
227 * Else if the XML prolog had a charset encoding that encoding is used.
228 * <p>
229 * Else if the content type had a charset encoding that encoding is used.
230 * <p>
231 * Else 'UTF-8' is used.
232 * <p>
233 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
234 * <p>
235 *
236 * @param is InputStream to create the reader from.
237 * @param httpContentType content-type header to use for the resolution of the charset encoding.
238 * @param lenient indicates if the charset encoding detection should be relaxed.
239 * @throws IOException thrown if there is a problem reading the file.
240 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
241 */
242 public XmlStreamReader( InputStream is, String httpContentType, boolean lenient )
243 throws IOException, XmlStreamReaderException
244 {
245 super( is, httpContentType, lenient );
246 }
247 }