1 /* 2 * Copyright 2004 Sun Microsystems, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 * 16 */ 17 package org.codehaus.plexus.util.xml; 18 19 import java.io.File; 20 import java.io.IOException; 21 import java.io.InputStream; 22 import java.net.URL; 23 import java.net.URLConnection; 24 25 /** 26 * Character stream that handles (or at least attempts to) all the necessary Voodo to figure out the charset encoding of 27 * the XML document within the stream. 28 * <p> 29 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream. 30 * <p> 31 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the 32 * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right 33 * now, XmlReader handles it and things work in all parsers). 34 * <p> 35 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering 36 * a wide set of constructors. 37 * <P> 38 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script 39 * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog, 40 * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a 41 * feed</a>. 42 * <p> 43 * 44 * @author Alejandro Abdelnur 45 * @version revision 1.17 taken on 26/06/2007 from Rome (see 46 * https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java) 47 * @since 1.4.4 48 */ 49 public class XmlStreamReader 50 extends XmlReader 51 { 52 /** 53 * Creates a Reader for a File. 54 * <p> 55 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to 56 * UTF-8. 57 * <p> 58 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 59 * <p> 60 * 61 * @param file File to create a Reader from. 62 * @throws IOException thrown if there is a problem reading the file. 63 */ 64 public XmlStreamReader( File file ) 65 throws IOException 66 { 67 super( file ); 68 } 69 70 /** 71 * Creates a Reader for a raw InputStream. 72 * <p> 73 * It follows the same logic used for files. 74 * <p> 75 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 76 * <p> 77 * 78 * @param is InputStream to create a Reader from. 79 * @throws IOException thrown if there is a problem reading the stream. 80 */ 81 public XmlStreamReader( InputStream is ) 82 throws IOException 83 { 84 super( is ); 85 } 86 87 /** 88 * Creates a Reader for a raw InputStream. 89 * <p> 90 * It follows the same logic used for files. 91 * <p> 92 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the 93 * following: 94 * <p> 95 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 96 * <p> 97 * Else if the XML prolog had a charset encoding that encoding is used. 98 * <p> 99 * Else if the content type had a charset encoding that encoding is used. 100 * <p> 101 * Else 'UTF-8' is used. 102 * <p> 103 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 104 * <p> 105 * 106 * @param is InputStream to create a Reader from. 107 * @param lenient indicates if the charset encoding detection should be relaxed. 108 * @throws IOException thrown if there is a problem reading the stream. 109 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs. 110 */ 111 public XmlStreamReader( InputStream is, boolean lenient ) 112 throws IOException, XmlStreamReaderException 113 { 114 super( is, lenient ); 115 } 116 117 /** 118 * Creates a Reader using the InputStream of a URL. 119 * <p> 120 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic 121 * used for Files. 122 * <p> 123 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for 124 * an InputStream with content-type. 125 * <p> 126 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 127 * <p> 128 * 129 * @param url URL to create a Reader from. 130 * @throws IOException thrown if there is a problem reading the stream of the URL. 131 */ 132 public XmlStreamReader( URL url ) 133 throws IOException 134 { 135 super( url ); 136 } 137 138 /** 139 * Creates a Reader using the InputStream of a URLConnection. 140 * <p> 141 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data 142 * it uses the same logic used for files. 143 * <p> 144 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic 145 * used for an InputStream with content-type. 146 * <p> 147 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 148 * <p> 149 * 150 * @param conn URLConnection to create a Reader from. 151 * @throws IOException thrown if there is a problem reading the stream of the URLConnection. 152 */ 153 public XmlStreamReader( URLConnection conn ) 154 throws IOException 155 { 156 super( conn ); 157 } 158 159 /** 160 * Creates a Reader using an InputStream an the associated content-type header. 161 * <p> 162 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not 163 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default 164 * encoding mandated by the content-type MIME type. 165 * <p> 166 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 167 * <p> 168 * 169 * @param is InputStream to create the reader from. 170 * @param httpContentType content-type header to use for the resolution of the charset encoding. 171 * @throws IOException thrown if there is a problem reading the file. 172 */ 173 public XmlStreamReader( InputStream is, String httpContentType ) 174 throws IOException 175 { 176 super( is, httpContentType ); 177 } 178 179 /** 180 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient 181 * regarding the encoding detection. 182 * <p> 183 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not 184 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default 185 * encoding mandated by the content-type MIME type. 186 * <p> 187 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the 188 * following: 189 * <p> 190 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 191 * <p> 192 * Else if the XML prolog had a charset encoding that encoding is used. 193 * <p> 194 * Else if the content type had a charset encoding that encoding is used. 195 * <p> 196 * Else 'UTF-8' is used. 197 * <p> 198 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 199 * <p> 200 * 201 * @param is InputStream to create the reader from. 202 * @param httpContentType content-type header to use for the resolution of the charset encoding. 203 * @param lenient indicates if the charset encoding detection should be relaxed. 204 * @param defaultEncoding encoding to use 205 * @throws IOException thrown if there is a problem reading the file. 206 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs. 207 */ 208 public XmlStreamReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding ) 209 throws IOException, XmlStreamReaderException 210 { 211 super( is, httpContentType, lenient, defaultEncoding ); 212 } 213 214 /** 215 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient 216 * regarding the encoding detection. 217 * <p> 218 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not 219 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default 220 * encoding mandated by the content-type MIME type. 221 * <p> 222 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the 223 * following: 224 * <p> 225 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 226 * <p> 227 * Else if the XML prolog had a charset encoding that encoding is used. 228 * <p> 229 * Else if the content type had a charset encoding that encoding is used. 230 * <p> 231 * Else 'UTF-8' is used. 232 * <p> 233 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 234 * <p> 235 * 236 * @param is InputStream to create the reader from. 237 * @param httpContentType content-type header to use for the resolution of the charset encoding. 238 * @param lenient indicates if the charset encoding detection should be relaxed. 239 * @throws IOException thrown if there is a problem reading the file. 240 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs. 241 */ 242 public XmlStreamReader( InputStream is, String httpContentType, boolean lenient ) 243 throws IOException, XmlStreamReaderException 244 { 245 super( is, httpContentType, lenient ); 246 } 247 }