1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil; -*- //------100-columns-wide------>|*/ 2 // for license please see accompanying LICENSE.txt file (available also at http://www.xmlpull.org/) 3 4 package org.codehaus.plexus.util.xml.pull; 5 6 import java.io.InputStream; 7 import java.io.IOException; 8 import java.io.Reader; 9 10 /** 11 * XML Pull Parser is an interface that defines parsing functionality provided in 12 * <a href="http://www.xmlpull.org/">XMLPULL V1 API</a> (visit this website to learn more about API and its 13 * implementations). 14 * <p> 15 * There are following different kinds of parser depending on which features are set: 16 * <ul> 17 * <li><b>non-validating</b> parser as defined in XML 1.0 spec when FEATURE_PROCESS_DOCDECL is set to true 18 * <li><b>validating parser</b> as defined in XML 1.0 spec when FEATURE_VALIDATION is true (and that implies that 19 * FEATURE_PROCESS_DOCDECL is true) 20 * <li>when FEATURE_PROCESS_DOCDECL is false (this is default and if different value is required necessary must be 21 * changed before parsing is started) then parser behaves like XML 1.0 compliant non-validating parser under condition 22 * that <em>no DOCDECL is present</em> in XML documents (internal entities can still be defined with 23 * defineEntityReplacementText()). This mode of operation is intended <b>for operation in constrained environments</b> 24 * such as J2ME. 25 * </ul> 26 * <p> 27 * There are two key methods: next() and nextToken(). While next() provides access to high level parsing events, 28 * nextToken() allows access to lower level tokens. 29 * <p> 30 * The current event state of the parser can be determined by calling the <a href="#getEventType()">getEventType()</a> 31 * method. Initially, the parser is in the <a href="#START_DOCUMENT">START_DOCUMENT</a> state. 32 * <p> 33 * The method <a href="#next()">next()</a> advances the parser to the next event. 
The int value returned from next 34 * determines the current parser state and is identical to the value returned from following calls to getEventType (). 35 * <p> 36 * The following event types are seen by next() 37 * <dl> 38 * <dt><a href="#START_TAG">START_TAG</a> 39 * <dd>An XML start tag was read. 40 * <dt><a href="#TEXT">TEXT</a> 41 * <dd>Text content was read; the text content can be retrieved using the getText() method. (when in validating mode 42 * next() will not report ignorable whitespaces, use nextToken() instead) 43 * <dt><a href="#END_TAG">END_TAG</a> 44 * <dd>An end tag was read 45 * <dt><a href="#END_DOCUMENT">END_DOCUMENT</a> 46 * <dd>No more events are available 47 * </dl> 48 * <p> 49 * after first next() or nextToken() (or any other next*() method) is called user application can obtain XML version, 50 * standalone and encoding from XML declaration in following ways: 51 * <ul> 52 * <li><b>version</b>: getProperty("<a href= 53 * "http://xmlpull.org/v1/doc/properties.html#xmldecl-version">http://xmlpull.org/v1/doc/properties.html#xmldecl-version</a>") 54 * returns String ("1.0") or null if XMLDecl was not read or if property is not supported 55 * <li><b>standalone</b>: getProperty("<a href= 56 * "http://xmlpull.org/v1/doc/features.html#xmldecl-standalone">http://xmlpull.org/v1/doc/features.html#xmldecl-standalone</a>") 57 * returns Boolean: null if there was no standalone declaration or if property is not supported otherwise returns 58 * Boolean(true) if standalone="yes" and Boolean(false) when standalone="no" 59 * <li><b>encoding</b>: obtained from getInputEncoding() null if stream had unknown encoding (not set in setInputStream) 60 * and it was not declared in XMLDecl 61 * </ul> 62 * A minimal example for using this API may look as follows: 63 * 64 * <pre> 65 * import java.io.IOException; 66 * import java.io.StringReader; 67 * 68 * import org.xmlpull.v1.XmlPullParser; 69 * import org.xmlpull.v1.XmlPullParserException; 70 * import 
org.xmlpull.v1.XmlPullParserFactory; 71 * 72 * public class SimpleXmlPullApp 73 * { 74 * 75 * public static void main (String args[]) 76 * throws XmlPullParserException, IOException 77 * { 78 * XmlPullParserFactory factory = XmlPullParserFactory.newInstance(); 79 * factory.setNamespaceAware(true); 80 * XmlPullParser xpp = factory.newPullParser(); 81 * 82 * xpp.setInput( new StringReader ( "<foo>Hello World!</foo>" ) ); 83 * int eventType = xpp.getEventType(); 84 * while (eventType != xpp.END_DOCUMENT) { 85 * if(eventType == xpp.START_DOCUMENT) { 86 * System.out.println("Start document"); 87 * } else if(eventType == xpp.END_DOCUMENT) { 88 * System.out.println("End document"); 89 * } else if(eventType == xpp.START_TAG) { 90 * System.out.println("Start tag "+xpp.getName()); 91 * } else if(eventType == xpp.END_TAG) { 92 * System.out.println("End tag "+xpp.getName()); 93 * } else if(eventType == xpp.TEXT) { 94 * System.out.println("Text "+xpp.getText()); 95 * } 96 * eventType = xpp.next(); 97 * } 98 * } 99 * } 100 * </pre> 101 * <p> 102 * The above example will generate the following output: 103 * 104 * <pre> 105 * Start document 106 * Start tag foo 107 * Text Hello World! 
108 * End tag foo 109 * </pre> 110 * 111 * For more details on API usage, please refer to the quick Introduction available at 112 * <a href="http://www.xmlpull.org">http://www.xmlpull.org</a> 113 * 114 * @see #defineEntityReplacementText 115 * @see #getName 116 * @see #getNamespace 117 * @see #getText 118 * @see #next 119 * @see #nextToken 120 * @see #setInput 121 * @see #FEATURE_PROCESS_DOCDECL 122 * @see #FEATURE_VALIDATION 123 * @see #START_DOCUMENT 124 * @see #START_TAG 125 * @see #TEXT 126 * @see #END_TAG 127 * @see #END_DOCUMENT 128 * @author <a href="http://www-ai.cs.uni-dortmund.de/PERSONAL/haustein.html">Stefan Haustein</a> 129 * @author <a href="http://www.extreme.indiana.edu/~aslom/">Aleksander Slominski</a> 130 */ 131 132 public interface XmlPullParser 133 { 134 135 /** This constant represents the default namespace (empty string "") */ 136 String NO_NAMESPACE = ""; 137 138 // ---------------------------------------------------------------------------- 139 // EVENT TYPES as reported by next() 140 141 /** 142 * Signals that the parser is at the very beginning of the document and nothing was read yet. This event type can only 143 * be observed by calling getEventType() before the first call to next(), nextToken(), or nextTag(). 144 * 145 * @see #next 146 * @see #nextToken 147 */ 148 int START_DOCUMENT = 0; 149 150 /** 151 * Logical end of the xml document. Returned from getEventType(), next() and nextToken() when the end of the input 152 * document has been reached. 153 * <p> 154 * <strong>NOTE:</strong> calling <a href="#next()">next()</a> or <a href="#nextToken()">nextToken()</a> again will 155 * result in an exception being thrown. 156 * 157 * @see #next 158 * @see #nextToken 159 */ 160 int END_DOCUMENT = 1; 161 162 /** 163 * Returned from getEventType(), <a href="#next()">next()</a>, <a href="#nextToken()">nextToken()</a> when a start 164 * tag was read. 
The name of the start tag is available from getName(); its namespace and prefix are available from 165 * getNamespace() and getPrefix() if <a href='#FEATURE_PROCESS_NAMESPACES'>namespaces are enabled</a>. See 166 * getAttribute* methods to retrieve element attributes. See getNamespace* methods to retrieve newly declared 167 * namespaces. 168 * 169 * @see #next 170 * @see #nextToken 171 * @see #getName 172 * @see #getPrefix 173 * @see #getNamespace 174 * @see #getAttributeCount 175 * @see #getDepth 176 * @see #getNamespaceCount 177 * @see #getNamespace 178 * @see #FEATURE_PROCESS_NAMESPACES 179 */ 180 int START_TAG = 2; 181 182 /** 183 * Returned from getEventType(), <a href="#next()">next()</a>, or <a href="#nextToken()">nextToken()</a> when an end 184 * tag was read. The name of the end tag is available from getName(); its namespace and prefix are available from 185 * getNamespace() and getPrefix(). 186 * 187 * @see #next 188 * @see #nextToken 189 * @see #getName 190 * @see #getPrefix 191 * @see #getNamespace 192 * @see #FEATURE_PROCESS_NAMESPACES 193 */ 194 int END_TAG = 3; 195 196 /** 197 * Character data was read and is available by calling getText(). 198 * <p> 199 * <strong>Please note:</strong> <a href="#next()">next()</a> will accumulate multiple events into one TEXT event, 200 * skipping IGNORABLE_WHITESPACE, PROCESSING_INSTRUCTION and COMMENT events. In contrast, 201 * <a href="#nextToken()">nextToken()</a> will stop reading text when any other event is observed. Also, when the 202 * state was reached by calling next(), the text value will be normalized, whereas getText() will return 203 * unnormalized content in the case of nextToken(). This allows an exact roundtrip without changing line ends when 204 * examining low level events, whereas for high level applications the text is normalized appropriately. 
205 * 206 * @see #next 207 * @see #nextToken 208 * @see #getText 209 */ 210 int TEXT = 4; 211 212 // ---------------------------------------------------------------------------- 213 // additional events exposed by lower level nextToken() 214 215 /** 216 * A CDATA section was just read; this token is available only from calls to 217 * <a href="#nextToken()">nextToken()</a>. A call to next() will accumulate various text events into a single event 218 * of type TEXT. The text contained in the CDATA section is available by calling getText(). 219 * 220 * @see #nextToken 221 * @see #getText 222 */ 223 int CDSECT = 5; 224 225 /** 226 * An entity reference was just read; this token is available from <a href="#nextToken()">nextToken()</a> only. The 227 * entity name is available by calling getName(). If available, the replacement text can be obtained by calling 228 * getText(); otherwise, the user is responsible for resolving the entity reference. This event type is never 229 * returned from next(); next() will accumulate the replacement text and other text events into a single TEXT event. 230 * 231 * @see #nextToken 232 * @see #getText 233 */ 234 int ENTITY_REF = 6; 235 236 /** 237 * Ignorable whitespace was just read. This token is available only from <a href="#nextToken()">nextToken()</a>. 238 * For non-validating parsers, this event is only reported by nextToken() when outside the root element. Validating 239 * parsers may be able to detect ignorable whitespace at other locations. The ignorable whitespace string is 240 * available by calling getText(). 241 * <p> 242 * <strong>NOTE:</strong> this is different from calling the isWhitespace() method, since text content may be 243 * whitespace but not ignorable. Ignorable whitespace is skipped by next() automatically; this event type is never 244 * returned from next(). 
245 * 246 * @see #nextToken 247 * @see #getText 248 */ 249 int IGNORABLE_WHITESPACE = 7; 250 251 /** 252 * An XML processing instruction declaration was just read. This event type is available only via 253 * <a href="#nextToken()">nextToken()</a>. getText() will return text that is inside the processing instruction. 254 * Calls to next() will skip processing instructions automatically. 255 * 256 * @see #nextToken 257 * @see #getText 258 */ 259 int PROCESSING_INSTRUCTION = 8; 260 261 /** 262 * An XML comment was just read. This token is available via 263 * <a href="#nextToken()">nextToken()</a> only; calls to next() will skip comments automatically. The content of the 264 * comment can be accessed using the getText() method. 265 * 266 * @see #nextToken 267 * @see #getText 268 */ 269 int COMMENT = 9; 270 271 /** 272 * An XML document type declaration was just read. This token is available from 273 * <a href="#nextToken()">nextToken()</a> only. The unparsed text inside the doctype is available via the getText() 274 * method. 275 * 276 * @see #nextToken 277 * @see #getText 278 */ 279 int DOCDECL = 10; 280 281 /** 282 * This array can be used to convert the event type integer constants such as START_TAG or TEXT to a string. For 283 * example, the value of TYPES[START_TAG] is the string "START_TAG". This array is intended for diagnostic output 284 * only. Relying on the contents of the array may be dangerous since malicious applications may alter the array, 285 * although it is final, due to limitations of the Java language. 286 */ 287 String[] TYPES = { "START_DOCUMENT", "END_DOCUMENT", "START_TAG", "END_TAG", "TEXT", "CDSECT", "ENTITY_REF", 288 "IGNORABLE_WHITESPACE", "PROCESSING_INSTRUCTION", "COMMENT", "DOCDECL" }; 289 290 // ---------------------------------------------------------------------------- 291 // namespace related features 292 293 /** 294 * This feature determines whether the parser processes namespaces. 
As for all features, the default value is false. 295 * <p> 296 * <strong>NOTE:</strong> The value can not be changed during parsing and must be set before parsing. 297 * 298 * @see #getFeature 299 * @see #setFeature 300 */ 301 String FEATURE_PROCESS_NAMESPACES = "http://xmlpull.org/v1/doc/features.html#process-namespaces"; 302 303 /** 304 * This feature determines whether namespace attributes are exposed via the attribute access methods. Like all 305 * features, the default value is false. This feature cannot be changed during parsing. 306 * 307 * @see #getFeature 308 * @see #setFeature 309 */ 310 String FEATURE_REPORT_NAMESPACE_ATTRIBUTES = "http://xmlpull.org/v1/doc/features.html#report-namespace-prefixes"; 311 312 /** 313 * This feature determines whether the document declaration is processed. If set to false, the DOCDECL event type is 314 * reported by nextToken() and ignored by next(). If this feature is activated, then the document declaration must 315 * be processed by the parser. 316 * <p> 317 * <strong>Please note:</strong> If the document type declaration was ignored, entity references may cause 318 * exceptions later in the parsing process. The default value of this feature is false. It cannot be changed during 319 * parsing. 320 * 321 * @see #getFeature 322 * @see #setFeature 323 */ 324 String FEATURE_PROCESS_DOCDECL = "http://xmlpull.org/v1/doc/features.html#process-docdecl"; 325 326 /** 327 * If this feature is activated, all validation errors as defined in the XML 1.0 specification are reported. This 328 * implies that FEATURE_PROCESS_DOCDECL is true and both the internal and external document type declarations will 329 * be processed. 330 * <p> 331 * <strong>Please note:</strong> This feature can not be changed during parsing. The default value is false. 
332 * 333 * @see #getFeature 334 * @see #setFeature 335 */ 336 String FEATURE_VALIDATION = "http://xmlpull.org/v1/doc/features.html#validation"; 337 338 /** 339 * Use this call to change the general behaviour of the parser, such as namespace processing or doctype declaration 340 * handling. This method must be called before the first call to next or nextToken. Otherwise, an exception is 341 * thrown. 342 * <p> 343 * Example: call setFeature(FEATURE_PROCESS_NAMESPACES, true) in order to switch on namespace processing. The 344 * initial settings correspond to the properties requested from the XML Pull Parser factory. If none were requested, 345 * all features are deactivated by default. 346 * @param name feature name 347 * @param state feature state 348 * @exception XmlPullParserException If the feature is not supported or can not be set 349 * @exception IllegalArgumentException If string with the feature name is null 350 */ 351 void setFeature( String name, boolean state ) 352 throws XmlPullParserException; 353 354 /** 355 * Returns the current value of the given feature. 356 * <p> 357 * <strong>Please note:</strong> unknown features are <strong>always</strong> returned as false. 358 * 359 * @param name The name of feature to be retrieved. 360 * @return The value of the feature. 361 * @exception IllegalArgumentException if string the feature name is null 362 */ 363 boolean getFeature( String name ); 364 365 /** 366 * Set the value of a property. The property name is any fully-qualified URI. 367 * @param name property name 368 * @param value property value 369 * @exception XmlPullParserException If the property is not supported or can not be set 370 * @exception IllegalArgumentException If string with the property name is null 371 * @throws XmlPullParserException parsing issue 372 */ 373 void setProperty( String name, Object value ) 374 throws XmlPullParserException; 375 376 /** 377 * Look up the value of a property. The property name is any fully-qualified URI. 
378 * <p> 379 * <strong>NOTE:</strong> unknown properties are <strong>always</strong> returned as null. 380 * 381 * @param name The name of property to be retrieved. 382 * @return The value of named property. 383 */ 384 Object getProperty( String name ); 385 386 /** 387 * Set the input source for parser to the given reader and resets the parser. The event type is set to the initial 388 * value START_DOCUMENT. Setting the reader to null will just stop parsing and reset parser state, allowing the 389 * parser to free internal resources such as parsing buffers. 390 * @param in the Reader 391 * @throws XmlPullParserException parsing issue 392 */ 393 void setInput( Reader in ) 394 throws XmlPullParserException; 395 396 /** 397 * Sets the input stream the parser is going to process. This call resets the parser state and sets the event type 398 * to the initial value START_DOCUMENT. 399 * <p> 400 * <strong>NOTE:</strong> If an input encoding string is passed, it MUST be used. Otherwise, if inputEncoding is 401 * null, the parser SHOULD try to determine input encoding following XML 1.0 specification (see below). If encoding 402 * detection is supported then following feature <a href= 403 * "http://xmlpull.org/v1/doc/features.html#detect-encoding">http://xmlpull.org/v1/doc/features.html#detect-encoding</a> 404 * MUST be true and otherwise it must be false 405 * 406 * @param inputStream contains a raw byte input stream of possibly unknown encoding (when inputEncoding is null). 407 * @param inputEncoding if not null it MUST be used as encoding for inputStream 408 * @throws XmlPullParserException parsing issue 409 */ 410 void setInput( InputStream inputStream, String inputEncoding ) 411 throws XmlPullParserException; 412 413 /** 414 * @return the input encoding if known, null otherwise. If setInput(InputStream, inputEncoding) was called with an 415 * inputEncoding value other than null, this value must be returned from this method. 
Otherwise, if inputEncoding is 416 * null and the parser supports the encoding detection feature 417 * (http://xmlpull.org/v1/doc/features.html#detect-encoding), it must return the detected encoding. If 418 * setInput(Reader) was called, null is returned. After the first call to next(), if an XML declaration was present, this 419 * method will return the declared encoding. 420 */ 421 String getInputEncoding(); 422 423 /** 424 * Set new value for entity replacement text as defined in 425 * <a href="http://www.w3.org/TR/REC-xml#intern-replacement">XML 1.0 Section 4.5 Construction of Internal Entity 426 * Replacement Text</a>. If FEATURE_PROCESS_DOCDECL or FEATURE_VALIDATION are set, calling this function will result 427 * in an exception -- when processing of DOCDECL is enabled, there is no need to set the entity replacement text 428 * manually. 429 * <p> 430 * The motivation for this function is to allow very small implementations of XMLPULL that will work in J2ME 431 * environments. Though these implementations may not be able to process the document type declaration, they still 432 * can work with known DTDs by using this function. 433 * <p> 434 * <b>Please note:</b> The given value is used literally as replacement text and it corresponds to declaring an entity 435 * in the DTD that has all special characters escaped: left angle bracket is replaced with &lt;, ampersand with 436 * &amp; and so on. 437 * <p> 438 * <b>Note:</b> The given value is the literal replacement text and must not contain any other entity reference (if 439 * it contains any entity reference there will be no further replacement). 440 * <p> 441 * <b>Note:</b> The list of pre-defined entity names will always contain standard XML entities such as amp 442 * (&amp;), lt (&lt;), gt (&gt;), quot (&quot;), and apos (&apos;). Those cannot be redefined by 443 * this method! 
444 * @param entityName entity name 445 * @param replacementText replacement text 446 * @see #setInput 447 * @see #FEATURE_PROCESS_DOCDECL 448 * @see #FEATURE_VALIDATION 449 * @throws XmlPullParserException parsing issue 450 */ 451 void defineEntityReplacementText( String entityName, String replacementText ) 452 throws XmlPullParserException; 453 454 /** 455 * @return the number of elements in the namespace stack for the given depth. If namespaces are not enabled, 0 is 456 * returned. 457 * <p> 458 * <b>NOTE:</b> when parser is on END_TAG then it is allowed to call this function with getDepth()+1 argument to 459 * retrieve position of namespace prefixes and URIs that were declared on corresponding START_TAG. 460 * <p> 461 * <b>NOTE:</b> to retrieve the list of namespaces declared in the current element: 462 * 463 * <pre> 464 * XmlPullParser pp = ... 465 * int nsStart = pp.getNamespaceCount(pp.getDepth()-1); 466 * int nsEnd = pp.getNamespaceCount(pp.getDepth()); 467 * for (int i = nsStart; i < nsEnd; i++) { 468 * String prefix = pp.getNamespacePrefix(i); 469 * String ns = pp.getNamespaceUri(i); 470 * // ... 471 * } 472 * </pre> 473 * 474 * @see #getNamespacePrefix 475 * @see #getNamespaceUri 476 * @see #getNamespace() 477 * @see #getNamespace(String) 478 * @param depth depth 479 * @throws XmlPullParserException parsing issue 480 */ 481 int getNamespaceCount( int depth ) 482 throws XmlPullParserException; 483 484 /** 485 * @return Returns the namespace prefix for the given position in the namespace stack. Default namespace declaration 486 * (xmlns='...') will have null as prefix. If the given index is out of range, an exception is thrown. 487 * 488 * <b>Please note:</b> when the parser is on an END_TAG, namespace prefixes that were declared in the corresponding 489 * START_TAG are still accessible although they are no longer in scope. 
490 * namespace prefix 491 * @param pos namespace stack position 492 * @throws XmlPullParserException parsing issue 493 */ 494 String getNamespacePrefix( int pos ) 495 throws XmlPullParserException; 496 497 /** 498 * @return Returns the namespace URI for the given position in the namespace stack If the position is out of range, an 499 * exception is thrown. 500 * 501 * <b>NOTE:</b> when parser is on END_TAG then namespace prefixes that were declared in corresponding START_TAG are 502 * still accessible even though they are not in scope 503 * @throws XmlPullParserException parsing issue 504 * @param pos namespace stack position 505 */ 506 String getNamespaceUri( int pos ) 507 throws XmlPullParserException; 508 509 /** 510 * @return the URI corresponding to the given prefix, depending on current state of the parser. 511 * <p> 512 * If the prefix was not declared in the current scope, null is returned. The default namespace is included in the 513 * namespace table and is available via getNamespace (null). 514 * <p> 515 * This method is a convenience method for 516 * 517 * <pre> 518 * for ( int i = getNamespaceCount( getDepth() ) - 1; i >= 0; i-- ) 519 * { 520 * if ( getNamespacePrefix( i ).equals( prefix ) ) 521 * { 522 * return getNamespaceUri( i ); 523 * } 524 * } 525 * return null; 526 * </pre> 527 * <p> 528 * <strong>Please note:</strong> parser implementations may provide more efficient lookup, e.g. using a Hashtable. 529 * The 'xml' prefix is bound to "http://www.w3.org/XML/1998/namespace", as defined in the 530 * <a href="http://www.w3.org/TR/REC-xml-names/#ns-using">Namespaces in XML</a> specification. 
Analogous, the 531 * 'xmlns' prefix is resolved to <a href="http://www.w3.org/2000/xmlns/">http://www.w3.org/2000/xmlns/</a> 532 * @param prefix given prefix 533 * @see #getNamespaceCount 534 * @see #getNamespacePrefix 535 * @see #getNamespaceUri 536 */ 537 String getNamespace( String prefix ); 538 539 // -------------------------------------------------------------------------- 540 // miscellaneous reporting methods 541 542 /** 543 * @return the current depth of the element. Outside the root element, the depth is 0. The depth is incremented by 1 544 * when a start tag is reached. The depth is decremented AFTER the end tag event was observed. 545 * 546 * <pre> 547 * <!-- outside --> 0 548 * <root> 1 549 * sometext 1 550 * <foobar> 2 551 * </foobar> 2 552 * </root> 1 553 * <!-- outside --> 0 554 * </pre> 555 */ 556 int getDepth(); 557 558 /** 559 * @return a short text describing the current parser state, including the position, a description of the current 560 * event and the data source if known. This method is especially useful to provide meaningful error messages and for 561 * debugging purposes. 562 */ 563 String getPositionDescription(); 564 565 /** 566 * Returns the current line number, starting from 1. When the parser does not know the current line number or can 567 * not determine it, -1 is returned (e.g. for WBXML). 568 * 569 * @return current line number or -1 if unknown. 570 */ 571 int getLineNumber(); 572 573 /** 574 * Returns the current column number, starting from 0. When the parser does not know the current column number or 575 * can not determine it, -1 is returned (e.g. for WBXML). 576 * 577 * @return current column number or -1 if unknown. 578 */ 579 int getColumnNumber(); 580 581 // -------------------------------------------------------------------------- 582 // TEXT related methods 583 584 /** 585 * @return Checks whether the current TEXT event contains only whitespace characters. For IGNORABLE_WHITESPACE, this is 586 * always true. 
For TEXT and CDSECT, false is returned when the current event text contains at least one non-white 587 * space character. For any other event type an exception is thrown. 588 * <p> 589 * <b>Please note:</b> non-validating parsers are not able to distinguish whitespace and ignorable whitespace, 590 * except from whitespace outside the root element. Ignorable whitespace is reported as separate event, which is 591 * exposed via nextToken only. 592 * @throws XmlPullParserException parsing issue 593 */ 594 boolean isWhitespace() 595 throws XmlPullParserException; 596 597 /** 598 * @return the text content of the current event as String. The value returned depends on current event type, for 599 * example for TEXT event it is element content (this is typical case when next() is used). See description of 600 * nextToken() for detailed description of possible returned values for different types of events. 601 * <p> 602 * <strong>NOTE:</strong> in case of ENTITY_REF, this method returns the entity replacement text (or null if not 603 * available). This is the only case where getText() and getTextCharacters() return different values. 604 * 605 * @see #getEventType 606 * @see #next 607 * @see #nextToken 608 */ 609 String getText(); 610 611 /** 612 * Returns the buffer that contains the text of the current event, as well as the start offset and length relevant 613 * for the current event. See getText(), next() and nextToken() for description of possible returned values. 614 * <p> 615 * <strong>Please note:</strong> this buffer must not be modified and its content MAY change after a call to next() 616 * or nextToken(). This method will always return the same value as getText(), except for ENTITY_REF. In the case of 617 * ENTITY ref, getText() returns the replacement text and this method returns the actual input buffer containing the 618 * entity name. 
If getText() returns null, this method returns null as well and the values returned in the holder 619 * array MUST be -1 (both start and length). 620 * 621 * @see #getText 622 * @see #next 623 * @see #nextToken 624 * @param holderForStartAndLength Must hold an 2-element int array into which the start offset and length values 625 * will be written. 626 * @return char buffer that contains the text of the current event (null if the current event has no text 627 * associated). 628 */ 629 char[] getTextCharacters( int[] holderForStartAndLength ); 630 631 // -------------------------------------------------------------------------- 632 // START_TAG / END_TAG shared methods 633 634 /** 635 * @return the namespace URI of the current element. The default namespace is represented as empty string. If 636 * namespaces are not enabled, an empty String ("") is always returned. The current event must be START_TAG or 637 * END_TAG; otherwise, null is returned. 638 */ 639 String getNamespace(); 640 641 /** 642 * @return For START_TAG or END_TAG events, the (local) name of the current element is returned when namespaces are enabled. 643 * When namespace processing is disabled, the raw name is returned. For ENTITY_REF events, the entity name is 644 * returned. If the current event is not START_TAG, END_TAG, or ENTITY_REF, null is returned. 645 * <p> 646 * <b>Please note:</b> To reconstruct the raw element name when namespaces are enabled and the prefix is not null, 647 * you will need to add the prefix and a colon to localName.. 648 */ 649 String getName(); 650 651 /** 652 * @return the prefix of the current element. If the element is in the default namespace (has no prefix), null is 653 * returned. If namespaces are not enabled, or the current event is not START_TAG or END_TAG, null is returned. 654 */ 655 String getPrefix(); 656 657 /** 658 * @return true if the current event is START_TAG and the tag is degenerated (e.g. <foobar/>). 
659 * <p> 660 * <b>NOTE:</b> if the parser is not on START_TAG, an exception will be thrown. 661 * @throws XmlPullParserException parsing issue 662 */ 663 boolean isEmptyElementTag() 664 throws XmlPullParserException; 665 666 // -------------------------------------------------------------------------- 667 // START_TAG Attributes retrieval methods 668 669 /** 670 * @return the number of attributes of the current start tag, or -1 if the current event type is not START_TAG 671 * 672 * @see #getAttributeNamespace 673 * @see #getAttributeName 674 * @see #getAttributePrefix 675 * @see #getAttributeValue 676 */ 677 int getAttributeCount(); 678 679 /** 680 * Returns the namespace URI of the attribute with the given index (starts from 0). Returns an empty string ("") if 681 * namespaces are not enabled or the attribute has no namespace. Throws an IndexOutOfBoundsException if the index is 682 * out of range or the current event type is not START_TAG. 683 * <p> 684 * <strong>NOTE:</strong> if FEATURE_REPORT_NAMESPACE_ATTRIBUTES is set then namespace attributes (xmlns:ns='...') 685 * must be reported with namespace <a href="http://www.w3.org/2000/xmlns/">http://www.w3.org/2000/xmlns/</a> (visit 686 * this URL for description!). The default namespace attribute (xmlns="...") will be reported with empty namespace. 687 * <p> 688 * <strong>NOTE:</strong>The xml prefix is bound as defined in 689 * <a href="http://www.w3.org/TR/REC-xml-names/#ns-using">Namespaces in XML</a> specification to 690 * "http://www.w3.org/XML/1998/namespace". 691 * 692 * @param index zero based index of attribute 693 * @return attribute namespace, empty string ("") is returned if namespaces processing is not enabled or namespaces 694 * processing is enabled but attribute has no namespace (it has no prefix). 
695 */ 696 String getAttributeNamespace( int index ); 697 698 /** 699 * Returns the local name of the specified attribute if namespaces are enabled or just attribute name if namespaces 700 * are disabled. Throws an IndexOutOfBoundsException if the index is out of range or current event type is not 701 * START_TAG. 702 * 703 * @param index zero based index of attribute 704 * @return attribute name (null is never returned) 705 */ 706 String getAttributeName( int index ); 707 708 /** 709 * Returns the prefix of the specified attribute. Returns null if the attribute has no prefix. If namespaces are 710 * disabled it will always return null. Throws an IndexOutOfBoundsException if the index is out of range or current 711 * event type is not START_TAG. 712 * 713 * @param index zero based index of attribute 714 * @return attribute prefix or null if namespaces processing is not enabled. 715 */ 716 String getAttributePrefix( int index ); 717 718 /** 719 * Returns the type of the specified attribute. If the parser is non-validating it MUST return CDATA. 720 * 721 * @param index zero based index of attribute 722 * @return attribute type (null is never returned) 723 */ 724 String getAttributeType( int index ); 725 726 /** 727 * Returns whether the specified attribute was not in the input but was declared in XML. If parser is non-validating it MUST 728 * always return false. This information is part of the XML infoset. 729 * 730 * @param index zero based index of attribute 731 * @return false if attribute was in input 732 */ 733 boolean isAttributeDefault( int index ); 734 735 /** 736 * Returns the given attribute's value. Throws an IndexOutOfBoundsException if the index is out of range or current 737 * event type is not START_TAG. 
738 * <p> 739 * <strong>NOTE:</strong> attribute value must be normalized (including entity replacement text if PROCESS_DOCDECL 740 * is false) as described in <a href="http://www.w3.org/TR/REC-xml#AVNormalize">XML 1.0 section 3.3.3 741 * Attribute-Value Normalization</a> 742 * 743 * @see #defineEntityReplacementText 744 * @param index zero based index of attribute 745 * @return value of attribute (null is never returned) 746 */ 747 String getAttributeValue( int index ); 748 749 /** 750 * Returns the attributes value identified by namespace URI and namespace localName. If namespaces are disabled 751 * namespace must be null. If current event type is not START_TAG then IndexOutOfBoundsException will be thrown. 752 * <p> 753 * <strong>NOTE:</strong> attribute value must be normalized (including entity replacement text if PROCESS_DOCDECL 754 * is false) as described in <a href="http://www.w3.org/TR/REC-xml#AVNormalize">XML 1.0 section 3.3.3 755 * Attribute-Value Normalization</a> 756 * 757 * @see #defineEntityReplacementText 758 * @param namespace Namespace of the attribute if namespaces are enabled otherwise must be null 759 * @param name If namespaces enabled local name of attribute otherwise just attribute name 760 * @return value of attribute or null if attribute with given name does not exist 761 */ 762 String getAttributeValue( String namespace, String name ); 763 764 // -------------------------------------------------------------------------- 765 // actual parsing methods 766 767 /** 768 * @return the type of the current event (START_TAG, END_TAG, TEXT, etc.) 
769 * 770 * @see #next() 771 * @see #nextToken() 772 * @throws XmlPullParserException parsing issue 773 */ 774 int getEventType() 775 throws XmlPullParserException; 776 777 /** 778 * @return Get next parsing event - element content wil be coalesced and only one TEXT event must be returned for whole 779 * element content (comments and processing instructions will be ignored and entity references must be expanded or 780 * exception mus be thrown if entity reference can not be expanded). If element content is empty (content is "") 781 * then no TEXT event will be reported. 782 * <p> 783 * <b>NOTE:</b> empty element (such as <tag/>) will be reported with two separate events: START_TAG, END_TAG - it 784 * must be so to preserve parsing equivalency of empty element to <tag></tag>. (see isEmptyElementTag ()) 785 * 786 * @see #isEmptyElementTag 787 * @see #START_TAG 788 * @see #TEXT 789 * @see #END_TAG 790 * @see #END_DOCUMENT 791 * @throws XmlPullParserException parsing issue 792 * @throws IOException io issue 793 */ 794 int next() 795 throws XmlPullParserException, IOException; 796 797 /** 798 * This method works similarly to next() but will expose additional event types (COMMENT, CDSECT, DOCDECL, 799 * ENTITY_REF, PROCESSING_INSTRUCTION, or IGNORABLE_WHITESPACE) if they are available in input. 800 * <p> 801 * If special feature <a href="http://xmlpull.org/v1/doc/features.html#xml-roundtrip">FEATURE_XML_ROUNDTRIP</a> 802 * (identified by URI: http://xmlpull.org/v1/doc/features.html#xml-roundtrip) is enabled it is possible to do XML 803 * document round trip ie. reproduce exactly on output the XML input using getText(): returned content is always 804 * unnormalized (exactly as in input). Otherwise returned content is end-of-line normalized as described 805 * <a href="http://www.w3.org/TR/REC-xml#sec-line-ends">XML 1.0 End-of-Line Handling</a> and. Also when this feature 806 * is enabled exact content of START_TAG, END_TAG, DOCDECL and PROCESSING_INSTRUCTION is available. 
807 * <p> 808 * Here is the list of tokens that can be returned from nextToken() and what getText() and getTextCharacters() 809 * @return 810 * <dl> 811 * <dt>START_DOCUMENT 812 * <dd>null 813 * <dt>END_DOCUMENT 814 * <dd>null 815 * <dt>START_TAG 816 * <dd>null unless FEATURE_XML_ROUNDTRIP enabled and then returns XML tag, ex: <tag attr='val'> 817 * <dt>END_TAG 818 * <dd>null unless FEATURE_XML_ROUNDTRIP id enabled and then returns XML tag, ex: </tag> 819 * <dt>TEXT 820 * <dd>return element content. <br> 821 * Note: that element content may be delivered in multiple consecutive TEXT events. 822 * <dt>IGNORABLE_WHITESPACE 823 * <dd>return characters that are determined to be ignorable white space. If the FEATURE_XML_ROUNDTRIP is enabled 824 * all whitespace content outside root element will always reported as IGNORABLE_WHITESPACE otherwise reporting is 825 * optional. <br> 826 * Note: that element content may be delivered in multiple consecutive IGNORABLE_WHITESPACE events. 827 * <dt>CDSECT 828 * <dd>return text <em>inside</em> CDATA (ex. 'fo<o' from <!CDATA[fo<o]]>) 829 * <dt>PROCESSING_INSTRUCTION 830 * <dd>if FEATURE_XML_ROUNDTRIP is true return exact PI content ex: 'pi foo' from <?pi foo?> otherwise it may be 831 * exact PI content or concatenation of PI target, space and data so for example for <?target data?> string 832 * "target data" may be returned if FEATURE_XML_ROUNDTRIP is false. 833 * <dt>COMMENT 834 * <dd>return comment content ex. 'foo bar' from <!--foo bar--> 835 * <dt>ENTITY_REF 836 * <dd>getText() MUST return entity replacement text if PROCESS_DOCDECL is false otherwise getText() MAY return 837 * null, additionally getTextCharacters() MUST return entity name (for example 'entity_name' for &entity_name;). 
838 * <br> 839 * <b>NOTE:</b> this is the only place where value returned from getText() and getTextCharacters() <b>are 840 * different</b> <br> 841 * <b>NOTE:</b> it is user responsibility to resolve entity reference if PROCESS_DOCDECL is false and there is no 842 * entity replacement text set in defineEntityReplacementText() method (getText() will be null) <br> 843 * <b>NOTE:</b> character entities (ex. &#32;) and standard entities such as &amp; &lt; &gt; 844 * &quot; &apos; are reported as well and are <b>not</b> reported as TEXT tokens but as ENTITY_REF tokens! 845 * This requirement is added to allow to do roundtrip of XML documents! 846 * <dt>DOCDECL 847 * <dd>if FEATURE_XML_ROUNDTRIP is true or PROCESS_DOCDECL is false then return what is inside of DOCDECL for 848 * example it returns: 849 * 850 * <pre> 851 * " titlepage SYSTEM "http://www.foo.bar/dtds/typo.dtd" 852 * [<!ENTITY % active.links "INCLUDE">]" 853 * </pre> 854 * <p> 855 * for input document that contained: 856 * 857 * <pre> 858 * <!DOCTYPE titlepage SYSTEM "http://www.foo.bar/dtds/typo.dtd" 859 * [<!ENTITY % active.links "INCLUDE">]> 860 * </pre> 861 * 862 * otherwise if FEATURE_XML_ROUNDTRIP is false and PROCESS_DOCDECL is true then what is returned is undefined (it 863 * may be even null)</dd> 864 * </dl> 865 * <p> 866 * <strong>NOTE:</strong> there is no guarantee that there will only one TEXT or IGNORABLE_WHITESPACE event from 867 * nextToken() as parser may chose to deliver element content in multiple tokens (dividing element content into 868 * chunks) 869 * <p> 870 * <strong>NOTE:</strong> whether returned text of token is end-of-line normalized is depending on 871 * FEATURE_XML_ROUNDTRIP. 872 * <p> 873 * <strong>NOTE:</strong> XMLDecl (<?xml ...?>) is not reported but its content is available through optional 874 * properties (see class description above). 
875 * @throws XmlPullParserException parsing issue 876 * @throws IOException io issue 877 * @see #next 878 * @see #START_TAG 879 * @see #TEXT 880 * @see #END_TAG 881 * @see #END_DOCUMENT 882 * @see #COMMENT 883 * @see #DOCDECL 884 * @see #PROCESSING_INSTRUCTION 885 * @see #ENTITY_REF 886 * @see #IGNORABLE_WHITESPACE 887 */ 888 int nextToken() 889 throws XmlPullParserException, IOException; 890 891 // ----------------------------------------------------------------------------- 892 // utility methods to mak XML parsing easier ... 893 894 /** 895 * Test if the current event is of the given type and if the namespace and name do match. null will match any 896 * namespace and any name. If the test is not passed, an exception is thrown. The exception text indicates the 897 * parser position, the expected event and the current event that is not meeting the requirement. 898 * <p> 899 * Essentially it does this 900 * 901 * <pre> 902 * if ( type != getEventType() || ( namespace != null && !namespace.equals( getNamespace() ) ) 903 * || ( name != null && !name.equals( getName() ) ) ) 904 * throw new XmlPullParserException( "expected " + TYPES[type] + getPositionDescription() ); 905 * </pre> 906 * @param type type 907 * @param name name 908 * @param namespace namespace 909 * @throws XmlPullParserException parsing issue 910 * @throws IOException io issue 911 */ 912 void require( int type, String namespace, String name ) 913 throws XmlPullParserException, IOException; 914 915 /** 916 * If current event is START_TAG then if next element is TEXT then element content is returned or if next event is 917 * END_TAG then empty string is returned, otherwise exception is thrown. After calling this function successfully 918 * parser will be positioned on END_TAG. 
919 * <p> 920 * The motivation for this function is to allow to parse consistently both empty elements and elements that has non 921 * empty content, for example for input: 922 * <ol> 923 * <li><tag>foo</tag> 924 * <li><tag></tag> (which is equivalent to <tag/> both input can be parsed with the same code: 925 * 926 * <pre> 927 * p.nextTag() 928 * p.requireEvent(p.START_TAG, "", "tag"); 929 * String content = p.nextText(); 930 * p.requireEvent(p.END_TAG, "", "tag"); 931 * </pre></li></ol> 932 * 933 * This function together with nextTag make it very easy to parse XML that has no mixed content. 934 * <p> 935 * Essentially it does this 936 * 937 * <pre> 938 * if ( getEventType() != START_TAG ) 939 * { 940 * throw new XmlPullParserException( "parser must be on START_TAG to read next text", this, null ); 941 * } 942 * int eventType = next(); 943 * if ( eventType == TEXT ) 944 * { 945 * String result = getText(); 946 * eventType = next(); 947 * if ( eventType != END_TAG ) 948 * { 949 * throw new XmlPullParserException( "event TEXT it must be immediately followed by END_TAG", this, null ); 950 * } 951 * return result; 952 * } 953 * else if ( eventType == END_TAG ) 954 * { 955 * return ""; 956 * } 957 * else 958 * { 959 * throw new XmlPullParserException( "parser must be on START_TAG or TEXT to read text", this, null ); 960 * } 961 * </pre> 962 * @return see description 963 * @throws XmlPullParserException parsing issue 964 * @throws IOException io issue 965 */ 966 String nextText() 967 throws XmlPullParserException, IOException; 968 969 /** 970 * Call next() and return event if it is START_TAG or END_TAG otherwise throw an exception. It will skip whitespace 971 * TEXT before actual tag if any. 
972 * <p> 973 * essentially it does this 974 * 975 * <pre> 976 * int eventType = next(); 977 * if ( eventType == TEXT && isWhitespace() ) 978 * { // skip whitespace 979 * eventType = next(); 980 * } 981 * if ( eventType != START_TAG && eventType != END_TAG ) 982 * { 983 * throw new XmlPullParserException( "expected start or end tag", this, null ); 984 * } 985 * return eventType; 986 * </pre> 987 * @return see description 988 * @throws XmlPullParserException parsing issue 989 * @throws 990 * IOException io issue 991 */ 992 int nextTag() 993 throws XmlPullParserException, IOException; 994 995 }