| 1 | //! Contains an implementation of pull-based XML parser. |
| 2 | |
| 3 | use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char}; |
| 4 | use crate::common::{Position, TextPosition, XmlVersion}; |
| 5 | use crate::name::OwnedName; |
| 6 | use crate::namespace::NamespaceStack; |
| 7 | use crate::reader::config::ParserConfig2; |
| 8 | use crate::reader::error::SyntaxError; |
| 9 | use crate::reader::events::XmlEvent; |
| 10 | use crate::reader::indexset::AttributesSet; |
| 11 | use crate::reader::lexer::{Lexer, Token}; |
| 12 | use super::{Error, ErrorKind}; |
| 13 | |
| 14 | use std::collections::HashMap; |
| 15 | use std::io::Read; |
| 16 | |
/// Generates "take" accessors on `MarkupData`.
///
/// Every `field -> method, Type, default` entry (entries are separated by `;`)
/// becomes a private method `method(&mut self) -> Type` that moves the current
/// value of `field` out of the struct and stores `default` in its place.
macro_rules! gen_takes {
    ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => {
        impl MarkupData {
            $(
                #[inline]
                #[allow(clippy::mem_replace_option_with_none)]
                #[allow(clippy::mem_replace_with_default)]
                fn $method(&mut self) -> $t {
                    // The replacement value is built first, then swapped in.
                    let fresh: $t = $def;
                    std::mem::replace(&mut self.$field, fresh)
                }
            )+
        }
    };
}
| 31 | |
// Accessors generated for `MarkupData`: each `take_*` method listed here moves
// the named field out of the struct and leaves the given default value behind.
gen_takes!(
    name -> take_name, String, String::new();
    ref_data -> take_ref_data, String, String::new();

    encoding -> take_encoding, Option<String>, None;

    element_name -> take_element_name, Option<OwnedName>, None;

    attr_name -> take_attr_name, Option<OwnedName>, None;
    attributes -> take_attributes, AttributesSet, AttributesSet::new()
);
| 43 | |
// Sub-modules named after the parser states.
// NOTE(review): presumably each one provides the `PullParser` handler of the
// same name that `dispatch_token` calls -- confirm in the module files.
mod inside_cdata;
mod inside_closing_tag_name;
mod inside_comment;
mod inside_declaration;
mod inside_doctype;
mod inside_opening_tag;
mod inside_processing_instruction;
mod inside_reference;
mod outside_tag;
| 53 | |
/// XML version reported in the `StartDocument` event that `set_encountered`
/// synthesizes when nothing had been encountered before.
static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10;
/// `standalone` value reported in that synthesized `StartDocument` event.
static DEFAULT_STANDALONE: Option<bool> = None;

/// Names of the elements that are currently open, innermost last.
type ElementStack = Vec<OwnedName>;
/// Outcome of one parsing step: an `XmlEvent` on success, otherwise the error
/// type of the parent module's `Result`.
pub type Result = super::Result<XmlEvent>;
| 59 | |
/// Pull-based XML parser.
///
/// Tokens are pulled from the lexer and routed to a handler according to the
/// current [`State`]; `next` returns as soon as a handler yields an event.
pub(crate) struct PullParser {
    /// Parser configuration (limits, end-of-stream handling, encoding override).
    config: ParserConfig2,
    /// Token source; also supplies text positions and the detected encoding.
    lexer: Lexer,
    /// Current state, used by `dispatch_token` to pick the handler.
    st: State,
    /// State saved when a reference starts inside an attribute value.
    /// NOTE(review): presumably restored by the reference handler -- confirm.
    state_after_reference: State,
    /// Accumulates the characters of the qualified name or attribute value being read.
    buf: String,

    /// From DTD internal subset
    entities: HashMap<String, String>,

    /// Namespace scopes used to resolve element and attribute prefixes.
    nst: NamespaceStack,

    /// Scratch data for the markup construct currently being parsed.
    data: MarkupData,
    /// Terminal result; once set, every call to `next` returns a clone of it.
    final_result: Option<Result>,
    /// Event queued for delivery by the following call to `next`.
    next_event: Option<Result>,
    /// Stack of currently open elements.
    est: ElementStack,
    /// Queue of text positions; its first entry is what `position()` reports.
    pos: Vec<TextPosition>,

    /// Furthest part of the document encountered so far.
    encountered: Encountered,
    // NOTE(review): not read in this part of the file; presumably tracks whether
    // the text seen so far is whitespace-only -- confirm in the state handlers.
    inside_whitespace: bool,
    /// Whether the `:` prefix separator of the qualified name being read was already consumed.
    read_prefix_separator: bool,
    /// When set, the next call to `next` pops one namespace scope before reading tokens.
    pop_namespace: bool,
}
| 84 | |
// Keeps track when XML declaration can happen
//
// The variants are ordered (the enum derives `Ord`) and `set_encountered`
// only ever moves forward through them, so the declaration order below is
// significant.
#[derive (Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Encountered {
    // Initial value: nothing has been seen yet.
    None = 0,
    AnyChars, // whitespace before <?xml is not allowed
    Declaration,
    Comment,
    Doctype,
    // Greatest value; `handle_eof` requires it for a successful `EndDocument`.
    Element,
}
| 95 | |
| 96 | impl PullParser { |
| 97 | /// Returns a new parser using the given config. |
| 98 | #[inline ] |
| 99 | pub fn new(config: impl Into<ParserConfig2>) -> Self { |
| 100 | let config = config.into(); |
| 101 | Self::new_with_config2(config) |
| 102 | } |
| 103 | |
| 104 | #[inline ] |
| 105 | fn new_with_config2(config: ParserConfig2) -> Self { |
| 106 | let mut lexer = Lexer::new(&config); |
| 107 | if let Some(enc) = config.override_encoding { |
| 108 | lexer.set_encoding(enc); |
| 109 | } |
| 110 | |
| 111 | let mut pos = Vec::with_capacity(16); |
| 112 | pos.push(TextPosition::new()); |
| 113 | |
| 114 | Self { |
| 115 | config, |
| 116 | lexer, |
| 117 | st: State::DocumentStart, |
| 118 | state_after_reference: State::OutsideTag, |
| 119 | buf: String::new(), |
| 120 | entities: HashMap::new(), |
| 121 | nst: NamespaceStack::default(), |
| 122 | |
| 123 | data: MarkupData { |
| 124 | name: String::new(), |
| 125 | doctype: None, |
| 126 | version: None, |
| 127 | encoding: None, |
| 128 | standalone: None, |
| 129 | ref_data: String::new(), |
| 130 | element_name: None, |
| 131 | quote: None, |
| 132 | attr_name: None, |
| 133 | attributes: AttributesSet::new(), |
| 134 | }, |
| 135 | final_result: None, |
| 136 | next_event: None, |
| 137 | est: Vec::new(), |
| 138 | pos, |
| 139 | |
| 140 | encountered: Encountered::None, |
| 141 | inside_whitespace: true, |
| 142 | read_prefix_separator: false, |
| 143 | pop_namespace: false, |
| 144 | } |
| 145 | } |
| 146 | |
| 147 | /// Checks if this parser ignores the end of stream errors. |
| 148 | pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream } |
| 149 | |
| 150 | /// Retrieves the Doctype from the document if any |
| 151 | #[inline ] |
| 152 | pub fn doctype(&self) -> Option<&str> { |
| 153 | self.data.doctype.as_deref() |
| 154 | } |
| 155 | |
| 156 | #[inline (never)] |
| 157 | fn set_encountered(&mut self, new_encounter: Encountered) -> Option<Result> { |
| 158 | if new_encounter <= self.encountered { |
| 159 | return None; |
| 160 | } |
| 161 | let prev_enc = self.encountered; |
| 162 | self.encountered = new_encounter; |
| 163 | |
| 164 | // If declaration was not parsed and we have encountered an element, |
| 165 | // emit this declaration as the next event. |
| 166 | if prev_enc == Encountered::None { |
| 167 | self.push_pos(); |
| 168 | Some(Ok(XmlEvent::StartDocument { |
| 169 | version: DEFAULT_VERSION, |
| 170 | encoding: self.lexer.encoding().to_string(), |
| 171 | standalone: DEFAULT_STANDALONE, |
| 172 | })) |
| 173 | } else { |
| 174 | None |
| 175 | } |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | impl Position for PullParser { |
| 180 | /// Returns the position of the last event produced by the parser |
| 181 | #[inline ] |
| 182 | fn position(&self) -> TextPosition { |
| 183 | self.pos.first().copied().unwrap_or_else(TextPosition::new) |
| 184 | } |
| 185 | } |
| 186 | |
/// Top-level parser state.
///
/// `dispatch_token` routes every token to the handler that corresponds to the
/// current variant; variants with a payload carry the sub-state of that handler.
/// The `state_size` test pins the size of this type to 2 bytes.
#[derive (Copy, Clone, PartialEq)]
pub enum State {
    OutsideTag,
    InsideOpeningTag(OpeningTagSubstate),
    InsideClosingTag(ClosingTagSubstate),
    InsideProcessingInstruction(ProcessingInstructionSubstate),
    InsideComment,
    InsideCData,
    InsideDeclaration(DeclarationSubstate),
    InsideDoctype(DoctypeSubstate),
    InsideReference,
    /// Initial state of a freshly constructed parser.
    DocumentStart,
}
| 200 | |
/// Sub-state carried by `State::InsideDoctype`.
///
/// The `state_size` test pins the size of this type to 1 byte.
// NOTE(review): the per-variant handling is not visible in this file section;
// see the `inside_doctype` module for what each variant means.
#[derive (Copy, Clone, PartialEq)]
pub enum DoctypeSubstate {
    Outside,
    String,
    InsideName,
    BeforeEntityName,
    EntityName,
    BeforeEntityValue,
    EntityValue,
    NumericReferenceStart,
    NumericReference,
    /// expansion
    PEReferenceInValue,
    PEReferenceInDtd,
    /// name definition
    PEReferenceDefinitionStart,
    PEReferenceDefinition,
    SkipDeclaration,
    Comment,
}
| 221 | |
/// Sub-state carried by `State::InsideOpeningTag`.
// NOTE(review): variant names suggest the element name, then attribute
// name/value pairs -- confirm against the `inside_opening_tag` module.
#[derive (Copy, Clone, PartialEq)]
pub enum OpeningTagSubstate {
    InsideName,

    InsideTag,

    InsideAttributeName,
    AfterAttributeName,

    InsideAttributeValue,
    AfterAttributeValue,
}
| 234 | |
/// Sub-state carried by `State::InsideClosingTag`.
// NOTE(review): presumably "reading the name" vs. "name finished, waiting for
// the tag end" -- confirm against the `inside_closing_tag_name` module.
#[derive (Copy, Clone, PartialEq)]
pub enum ClosingTagSubstate {
    CTInsideName,
    CTAfterName,
}
| 240 | |
/// Sub-state carried by `State::InsideProcessingInstruction`.
// NOTE(review): presumably the instruction name followed by its data --
// confirm against the `inside_processing_instruction` module.
#[derive (Copy, Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
    PIInsideName,
    PIInsideData,
}
| 246 | |
/// Sub-state carried by `State::InsideDeclaration`.
///
/// The variants are grouped by the declaration's pseudo-attributes, in the
/// order version, encoding, standalone.
// NOTE(review): the transitions between these variants live in the
// `inside_declaration` module and are not visible here.
#[derive (Copy, Clone, PartialEq)]
pub enum DeclarationSubstate {
    BeforeVersion,
    InsideVersion,
    AfterVersion,

    InsideVersionValue,
    AfterVersionValue,

    BeforeEncoding,
    InsideEncoding,
    AfterEncoding,

    InsideEncodingValue,
    AfterEncodingValue,

    BeforeStandaloneDecl,
    InsideStandaloneDecl,
    AfterStandaloneDecl,

    InsideStandaloneDeclValue,
    AfterStandaloneDeclValue,
}
| 270 | |
/// Kind of qualified name being read by `read_qualified_name`.
///
/// The target decides which tokens, besides whitespace, may terminate the name.
#[derive (Copy, Clone, PartialEq)]
enum QualifiedNameTarget {
    /// Attribute name; may be terminated by `Token::EqualsSign`.
    AttributeNameTarget,
    /// Opening tag name; may be terminated by `Token::EmptyTagEnd` or `Token::TagEnd`.
    OpeningTagNameTarget,
    /// Closing tag name; may be terminated by `Token::TagEnd`.
    ClosingTagNameTarget,
}
| 277 | |
/// Kind of quote that opened the attribute value currently being read.
#[derive (Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
    /// Corresponds to `Token::SingleQuote`.
    SingleQuoteToken,
    /// Corresponds to `Token::DoubleQuote`.
    DoubleQuoteToken,
}
| 283 | |
| 284 | impl QuoteToken { |
| 285 | #[inline ] |
| 286 | fn from_token(t: Token) -> Option<Self> { |
| 287 | match t { |
| 288 | Token::SingleQuote => Some(Self::SingleQuoteToken), |
| 289 | Token::DoubleQuote => Some(Self::DoubleQuoteToken), |
| 290 | _ => { |
| 291 | debug_assert!(false); |
| 292 | None |
| 293 | }, |
| 294 | } |
| 295 | } |
| 296 | |
| 297 | const fn as_token(self) -> Token { |
| 298 | match self { |
| 299 | Self::SingleQuoteToken => Token::SingleQuote, |
| 300 | Self::DoubleQuoteToken => Token::DoubleQuote, |
| 301 | } |
| 302 | } |
| 303 | } |
| 304 | |
/// Scratch storage for the pieces of the markup construct being parsed.
///
/// Several fields have generated `take_*` accessors that move the value out
/// and leave a default behind.
struct MarkupData {
    /// Used for processing instruction name.
    name: String,
    /// Used for reference content.
    ref_data: String,

    /// Keeps a copy of the original doctype.
    doctype: Option<String>,
    /// Used for XML declaration version.
    version: Option<XmlVersion>,
    /// Used for XML declaration encoding.
    encoding: Option<String>,
    /// Used for XML declaration standalone parameter.
    standalone: Option<bool>,

    /// Used for element name.
    element_name: Option<OwnedName>,

    /// Used to hold opening quote for attribute value.
    quote: Option<QuoteToken>,
    /// Used to hold attribute name.
    attr_name: Option<OwnedName>,
    /// Used to hold all accumulated attributes.
    attributes: AttributesSet,
}
| 320 | |
| 321 | impl PullParser { |
| 322 | /// Returns next event read from the given buffer. |
| 323 | /// |
| 324 | /// This method should be always called with the same buffer. If you call it |
| 325 | /// providing different buffers each time, the result will be undefined. |
| 326 | pub fn next<R: Read>(&mut self, r: &mut R) -> Result { |
| 327 | if let Some(ref ev) = self.final_result { |
| 328 | return ev.clone(); |
| 329 | } |
| 330 | |
| 331 | if let Some(ev) = self.next_event.take() { |
| 332 | return ev; |
| 333 | } |
| 334 | |
| 335 | if self.pop_namespace { |
| 336 | self.pop_namespace = false; |
| 337 | self.nst.pop(); |
| 338 | } |
| 339 | |
| 340 | loop { |
| 341 | debug_assert!(self.next_event.is_none()); |
| 342 | debug_assert!(!self.pop_namespace); |
| 343 | |
| 344 | // While lexer gives us Ok(maybe_token) -- we loop. |
| 345 | // Upon having a complete XML-event -- we return from the whole function. |
| 346 | match self.lexer.next_token(r) { |
| 347 | Ok(Token::Eof) => { |
| 348 | // Forward pos to the lexer head |
| 349 | self.next_pos(); |
| 350 | return self.handle_eof(); |
| 351 | }, |
| 352 | Ok(token) => match self.dispatch_token(token) { |
| 353 | None => continue, |
| 354 | Some(Ok(xml_event)) => { |
| 355 | self.next_pos(); |
| 356 | return Ok(xml_event); |
| 357 | }, |
| 358 | Some(Err(xml_error)) => { |
| 359 | self.next_pos(); |
| 360 | return self.set_final_result(Err(xml_error)); |
| 361 | }, |
| 362 | }, |
| 363 | Err(lexer_error) => { |
| 364 | self.next_pos(); |
| 365 | return self.set_final_result(Err(lexer_error)); |
| 366 | }, |
| 367 | } |
| 368 | } |
| 369 | } |
| 370 | |
| 371 | /// Handle end of stream |
| 372 | #[cold ] |
| 373 | fn handle_eof(&mut self) -> std::result::Result<XmlEvent, super::Error> { |
| 374 | let ev = if self.depth() == 0 { |
| 375 | if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok |
| 376 | Ok(XmlEvent::EndDocument) |
| 377 | } else if self.encountered < Encountered::Element { |
| 378 | self.error(SyntaxError::NoRootElement) |
| 379 | } else { // self.st != State::OutsideTag |
| 380 | self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint? |
| 381 | } |
| 382 | } else if self.config.c.ignore_end_of_stream { |
| 383 | self.final_result = None; |
| 384 | self.lexer.reset_eof_handled(); |
| 385 | return self.error(SyntaxError::UnbalancedRootElement); |
| 386 | } else { |
| 387 | self.error(SyntaxError::UnbalancedRootElement) |
| 388 | }; |
| 389 | self.set_final_result(ev) |
| 390 | } |
| 391 | |
| 392 | // This function is to be called when a terminal event is reached. |
| 393 | // The function sets up the `self.final_result` into `Some(result)` and return `result`. |
| 394 | #[inline ] |
| 395 | fn set_final_result(&mut self, result: Result) -> Result { |
| 396 | self.final_result = Some(result.clone()); |
| 397 | result |
| 398 | } |
| 399 | |
| 400 | #[cold ] |
| 401 | fn error(&self, e: SyntaxError) -> Result { |
| 402 | Err(Error { |
| 403 | pos: self.lexer.position(), |
| 404 | kind: ErrorKind::Syntax(e.to_cow()), |
| 405 | }) |
| 406 | } |
| 407 | |
| 408 | #[inline ] |
| 409 | fn next_pos(&mut self) { |
| 410 | // unfortunately calls to next_pos will never be perfectly balanced with push_pos, |
| 411 | // at very least because parse errors and EOF can happen unexpectedly without a prior push. |
| 412 | if !self.pos.is_empty() { |
| 413 | if self.pos.len() > 1 { |
| 414 | self.pos.remove(0); |
| 415 | } else { |
| 416 | self.pos[0] = self.lexer.position(); |
| 417 | } |
| 418 | } |
| 419 | } |
| 420 | |
| 421 | #[inline ] |
| 422 | #[track_caller ] |
| 423 | fn push_pos(&mut self) { |
| 424 | debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events. |
| 425 | This case is ignored in release mode, and merely causes document positions to be out of sync. |
| 426 | Please file a bug and include the XML document that triggers this assert." ); |
| 427 | |
| 428 | // it has capacity preallocated for more than it ever needs, so this reduces code size |
| 429 | if self.pos.len() != self.pos.capacity() { |
| 430 | self.pos.push(self.lexer.position()); |
| 431 | } else if self.pos.len() > 1 { |
| 432 | self.pos.remove(0); // this mitigates the excessive push_pos() call |
| 433 | } |
| 434 | } |
| 435 | |
| 436 | #[inline (never)] |
| 437 | fn dispatch_token(&mut self, t: Token) -> Option<Result> { |
| 438 | match self.st { |
| 439 | State::OutsideTag => self.outside_tag(t), |
| 440 | State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), |
| 441 | State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), |
| 442 | State::InsideReference => self.inside_reference(t), |
| 443 | State::InsideComment => self.inside_comment(t), |
| 444 | State::InsideCData => self.inside_cdata(t), |
| 445 | State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), |
| 446 | State::InsideDoctype(s) => self.inside_doctype(t, s), |
| 447 | State::InsideDeclaration(s) => self.inside_declaration(t, s), |
| 448 | State::DocumentStart => self.document_start(t), |
| 449 | } |
| 450 | } |
| 451 | |
| 452 | #[inline ] |
| 453 | fn depth(&self) -> usize { |
| 454 | self.est.len() |
| 455 | } |
| 456 | |
| 457 | #[inline ] |
| 458 | fn buf_has_data(&self) -> bool { |
| 459 | !self.buf.is_empty() |
| 460 | } |
| 461 | |
| 462 | #[inline ] |
| 463 | fn take_buf(&mut self) -> String { |
| 464 | std::mem::take(&mut self.buf) |
| 465 | } |
| 466 | |
| 467 | #[inline ] |
| 468 | fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> { |
| 469 | self.st = st; |
| 470 | ev |
| 471 | } |
| 472 | |
| 473 | #[inline ] |
| 474 | fn into_state_continue(&mut self, st: State) -> Option<Result> { |
| 475 | self.into_state(st, None) |
| 476 | } |
| 477 | |
| 478 | #[inline ] |
| 479 | fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> { |
| 480 | self.into_state(st, Some(ev)) |
| 481 | } |
| 482 | |
| 483 | /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, |
| 484 | /// an error is returned. |
| 485 | /// |
| 486 | /// # Parameters |
| 487 | /// * `t` --- next token; |
| 488 | /// * `on_name` --- a callback which is executed when whitespace is encountered. |
| 489 | fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result> |
| 490 | where F: Fn(&mut Self, Token, OwnedName) -> Option<Result> { |
| 491 | // We can get here for the first time only when self.data.name contains zero or one character, |
| 492 | // but first character cannot be a colon anyway |
| 493 | if self.buf.len() <= 1 { |
| 494 | self.read_prefix_separator = false; |
| 495 | } |
| 496 | |
| 497 | let invoke_callback = move |this: &mut Self, t| { |
| 498 | let name = this.take_buf(); |
| 499 | match name.parse() { |
| 500 | Ok(name) => on_name(this, t, name), |
| 501 | Err(()) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))), |
| 502 | } |
| 503 | }; |
| 504 | |
| 505 | match t { |
| 506 | // There can be only one colon, and not as the first character |
| 507 | Token::Character(':' ) if self.buf_has_data() && !self.read_prefix_separator => { |
| 508 | self.buf.push(':' ); |
| 509 | self.read_prefix_separator = true; |
| 510 | None |
| 511 | }, |
| 512 | |
| 513 | Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) || |
| 514 | self.buf_has_data() && is_name_char(c)) => { |
| 515 | if self.buf.len() > self.config.max_name_length { |
| 516 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
| 517 | } |
| 518 | self.buf.push(c); |
| 519 | None |
| 520 | }, |
| 521 | |
| 522 | Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), |
| 523 | |
| 524 | Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), |
| 525 | |
| 526 | Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || |
| 527 | target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), |
| 528 | |
| 529 | Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t), |
| 530 | |
| 531 | _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))), |
| 532 | } |
| 533 | } |
| 534 | |
| 535 | /// Dispatches tokens in order to process attribute value. |
| 536 | /// |
| 537 | /// # Parameters |
| 538 | /// * `t` --- next token; |
| 539 | /// * `on_value` --- a callback which is called when terminating quote is encountered. |
| 540 | fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> |
| 541 | where F: Fn(&mut Self, String) -> Option<Result> { |
| 542 | match t { |
| 543 | Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace |
| 544 | |
| 545 | Token::DoubleQuote | Token::SingleQuote => match self.data.quote { |
| 546 | None => { // Entered attribute value |
| 547 | self.data.quote = QuoteToken::from_token(t); |
| 548 | None |
| 549 | }, |
| 550 | Some(q) if q.as_token() == t => { |
| 551 | self.data.quote = None; |
| 552 | let value = self.take_buf(); |
| 553 | on_value(self, value) |
| 554 | }, |
| 555 | _ => { |
| 556 | if let Token::Character(c) = t { |
| 557 | if !self.is_valid_xml_char_not_restricted(c) { |
| 558 | return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); |
| 559 | } |
| 560 | } |
| 561 | if self.buf.len() > self.config.max_attribute_length { |
| 562 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
| 563 | } |
| 564 | t.push_to_string(&mut self.buf); |
| 565 | None |
| 566 | }, |
| 567 | }, |
| 568 | |
| 569 | Token::ReferenceStart if self.data.quote.is_some() => { |
| 570 | self.state_after_reference = self.st; |
| 571 | self.into_state_continue(State::InsideReference) |
| 572 | }, |
| 573 | |
| 574 | Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)), |
| 575 | |
| 576 | Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { |
| 577 | Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) |
| 578 | }, |
| 579 | |
| 580 | // Every character except " and ' and < is okay |
| 581 | _ if self.data.quote.is_some() => { |
| 582 | if self.buf.len() > self.config.max_attribute_length { |
| 583 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
| 584 | } |
| 585 | t.push_to_string(&mut self.buf); |
| 586 | None |
| 587 | }, |
| 588 | |
| 589 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), |
| 590 | } |
| 591 | } |
| 592 | |
| 593 | fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> { |
| 594 | let mut name = self.data.take_element_name()?; |
| 595 | let mut attributes = self.data.take_attributes().into_vec(); |
| 596 | |
| 597 | // check whether the name prefix is bound and fix its namespace |
| 598 | match self.nst.get(name.borrow().prefix_repr()) { |
| 599 | Some("" ) => name.namespace = None, // default namespace |
| 600 | Some(ns) => name.namespace = Some(ns.into()), |
| 601 | None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))), |
| 602 | } |
| 603 | |
| 604 | // check and fix accumulated attributes prefixes |
| 605 | for attr in &mut attributes { |
| 606 | if let Some(ref pfx) = attr.name.prefix { |
| 607 | let new_ns = match self.nst.get(pfx) { |
| 608 | Some("" ) => None, // default namespace |
| 609 | Some(ns) => Some(ns.into()), |
| 610 | None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))), |
| 611 | }; |
| 612 | attr.name.namespace = new_ns; |
| 613 | } |
| 614 | } |
| 615 | |
| 616 | if emit_end_element { |
| 617 | self.pop_namespace = true; |
| 618 | self.next_event = Some(Ok(XmlEvent::EndElement { |
| 619 | name: name.clone() |
| 620 | })); |
| 621 | } else { |
| 622 | self.est.push(name.clone()); |
| 623 | } |
| 624 | let namespace = self.nst.squash(); |
| 625 | self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { |
| 626 | name, |
| 627 | attributes, |
| 628 | namespace |
| 629 | })) |
| 630 | } |
| 631 | |
| 632 | fn emit_end_element(&mut self) -> Option<Result> { |
| 633 | let mut name = self.data.take_element_name()?; |
| 634 | |
| 635 | // check whether the name prefix is bound and fix its namespace |
| 636 | match self.nst.get(name.borrow().prefix_repr()) { |
| 637 | Some("" ) => name.namespace = None, // default namespace |
| 638 | Some(ns) => name.namespace = Some(ns.into()), |
| 639 | None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))), |
| 640 | } |
| 641 | |
| 642 | let op_name = self.est.pop()?; |
| 643 | |
| 644 | if name == op_name { |
| 645 | self.pop_namespace = true; |
| 646 | self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name })) |
| 647 | } else { |
| 648 | Some(self.error(SyntaxError::UnexpectedClosingTag(format!(" {name} != {op_name}" ).into()))) |
| 649 | } |
| 650 | } |
| 651 | |
| 652 | #[inline ] |
| 653 | fn is_valid_xml_char(&self, c: char) -> bool { |
| 654 | if Some(XmlVersion::Version11) == self.data.version { |
| 655 | is_xml11_char(c) |
| 656 | } else { |
| 657 | is_xml10_char(c) |
| 658 | } |
| 659 | } |
| 660 | |
| 661 | #[inline ] |
| 662 | fn is_valid_xml_char_not_restricted(&self, c: char) -> bool { |
| 663 | if Some(XmlVersion::Version11) == self.data.version { |
| 664 | is_xml11_char_not_restricted(c) |
| 665 | } else { |
| 666 | is_xml10_char(c) |
| 667 | } |
| 668 | } |
| 669 | } |
| 670 | |
#[cfg(test)]
mod tests {
    use crate::attribute::OwnedAttribute;
    use crate::common::TextPosition;
    use crate::name::OwnedName;
    use crate::reader::events::XmlEvent;
    use crate::reader::parser::PullParser;
    use crate::reader::ParserConfig;
    use std::io::BufReader;

    /// Creates a parser with the default configuration.
    fn new_parser() -> PullParser {
        PullParser::new(ParserConfig::new())
    }

    // Pulls the next event from parser `$p` reading from `$r` and panics unless
    // it matches pattern `$t` (and, in the second form, satisfies guard `$c`).
    macro_rules! expect_event(
        ($r:expr, $p:expr, $t:pat) => (
            match $p.next(&mut $r) {
                $t => {}
                e => panic!("Unexpected event: {e:?} \nExpected: {}", stringify!($t))
            }
        );
        ($r:expr, $p:expr, $t:pat => $c:expr ) => (
            match $p.next(&mut $r) {
                $t if $c => {}
                e => panic!("Unexpected event: {e:?} \nExpected: {} if {}", stringify!($t), stringify!($c))
            }
        )
    );

    // Builds a `(reader, parser)` pair over the given document text.
    macro_rules! test_data(
        ($d:expr) => ({
            static DATA: &'static str = $d;
            let r = BufReader::new(DATA.as_bytes());
            let p = new_parser();
            (r, p)
        })
    );

    #[test]
    fn issue_3_semicolon_in_attribute_value() {
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz;zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
            *name == OwnedName::local("a") &&
            attributes.len() == 1 &&
            attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") &&
            namespace.is_essentially_empty()
        );
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn issue_140_entity_reference_inside_tag() {
        // The document must contain the character *reference* (not the literal
        // character) for this test to exercise reference handling; the element
        // content has no surrounding whitespace, so the expected text is
        // exactly U+266B.
        let (mut r, mut p) = test_data!(r"
            <bla>&#9835;</bla>
        ");

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn issue_220_comment() {
        let (mut r, mut p) = test_data!(r"<x><!-- <!--></x>");
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));

        let (mut r, mut p) = test_data!(r"<x><!-- <!---></x>");
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Err(_)); // ---> is forbidden in comments

        let (mut r, mut p) = test_data!(r"<x><!--<text&x;> <!--></x>");
        p.config.c.ignore_comments = false;
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == "<text&x;> <!");
        expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn malformed_declaration_attrs() {
        let (mut r, mut p) = test_data!(r#"<?xml version x="1.0"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0" version="1.0"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0"encoding="utf-8"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0"standalone="yes"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0" encoding="utf-8"standalone="yes"?>"#);
        expect_event!(r, p, Err(_));
    }

    #[test]
    fn opening_tag_in_attribute_value() {
        use crate::reader::error::{SyntaxError, Error, ErrorKind};

        // The 12 spaces of indentation inside the raw string are significant:
        // together with `<a attr="zzz` they put the offending `<` at the
        // column asserted below.
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz<zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Err(ref e) =>
            *e == Error {
                kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()),
                pos: TextPosition { row: 1, column: 24 }
            }
        );
    }

    #[test]
    fn reference_err() {
        let (mut r, mut p) = test_data!(r"
            <a>&&amp;</a>
        ");

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Err(_));
    }

    #[test]
    fn state_size() {
        // Guards against the state enums growing.
        assert_eq!(2, std::mem::size_of::<super::State>());
        assert_eq!(1, std::mem::size_of::<super::DoctypeSubstate>());
    }
}
| 813 | |