1 | //! Contains an implementation of pull-based XML parser. |
2 | |
3 | use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char}; |
4 | use crate::common::{Position, TextPosition, XmlVersion}; |
5 | use crate::name::OwnedName; |
6 | use crate::namespace::NamespaceStack; |
7 | use crate::reader::config::ParserConfig2; |
8 | use crate::reader::error::SyntaxError; |
9 | use crate::reader::events::XmlEvent; |
10 | use crate::reader::indexset::AttributesSet; |
11 | use crate::reader::lexer::{Lexer, Token}; |
12 | use super::{Error, ErrorKind}; |
13 | |
14 | use std::collections::HashMap; |
15 | use std::io::Read; |
16 | |
/// Generates "take" accessors on `MarkupData`.
///
/// Every `field -> method, Type, default` entry (entries are separated by `;`)
/// expands to `fn method(&mut self) -> Type`, which moves the current value out
/// of `self.field` and stores `default` in its place.
macro_rules! gen_takes {
    ($($field:ident -> $method:ident, $t:ty, $def:expr);+) => {
        impl MarkupData {
            $(
                #[inline]
                // The replacement value comes from the macro invocation; when it is
                // `None` or a default-constructed value these lints would fire.
                #[allow(clippy::mem_replace_option_with_none, clippy::mem_replace_with_default)]
                fn $method(&mut self) -> $t {
                    let slot = &mut self.$field;
                    std::mem::replace(slot, $def)
                }
            )+
        }
    };
}
31 | |
32 | gen_takes!( |
33 | name -> take_name, String, String::new(); |
34 | ref_data -> take_ref_data, String, String::new(); |
35 | |
36 | encoding -> take_encoding, Option<String>, None; |
37 | |
38 | element_name -> take_element_name, Option<OwnedName>, None; |
39 | |
40 | attr_name -> take_attr_name, Option<OwnedName>, None; |
41 | attributes -> take_attributes, AttributesSet, AttributesSet::new() |
42 | ); |
43 | |
44 | mod inside_cdata; |
45 | mod inside_closing_tag_name; |
46 | mod inside_comment; |
47 | mod inside_declaration; |
48 | mod inside_doctype; |
49 | mod inside_opening_tag; |
50 | mod inside_processing_instruction; |
51 | mod inside_reference; |
52 | mod outside_tag; |
53 | |
54 | static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10; |
55 | static DEFAULT_STANDALONE: Option<bool> = None; |
56 | |
57 | type ElementStack = Vec<OwnedName>; |
58 | pub type Result = super::Result<XmlEvent>; |
59 | |
60 | /// Pull-based XML parser. |
61 | pub(crate) struct PullParser { |
62 | config: ParserConfig2, |
63 | lexer: Lexer, |
64 | st: State, |
65 | state_after_reference: State, |
66 | buf: String, |
67 | |
68 | /// From DTD internal subset |
69 | entities: HashMap<String, String>, |
70 | |
71 | nst: NamespaceStack, |
72 | |
73 | data: MarkupData, |
74 | final_result: Option<Result>, |
75 | next_event: Option<Result>, |
76 | est: ElementStack, |
77 | pos: Vec<TextPosition>, |
78 | |
79 | encountered: Encountered, |
80 | inside_whitespace: bool, |
81 | read_prefix_separator: bool, |
82 | pop_namespace: bool, |
83 | } |
84 | |
/// Keeps track of when an XML declaration can happen.
///
/// Variants are declared in the order in which the corresponding parts may
/// appear in a document; `set_encountered` relies on the derived ordering and
/// only ever moves forward.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Encountered {
    /// Nothing has been read yet.
    None = 0,
    /// Some characters were seen; whitespace before `<?xml` is not allowed.
    AnyChars,
    /// The XML declaration.
    Declaration,
    /// A comment.
    Comment,
    /// The doctype.
    Doctype,
    /// An element.
    Element,
}
95 | |
96 | impl PullParser { |
97 | /// Returns a new parser using the given config. |
98 | #[inline ] |
99 | pub fn new(config: impl Into<ParserConfig2>) -> Self { |
100 | let config = config.into(); |
101 | Self::new_with_config2(config) |
102 | } |
103 | |
104 | #[inline ] |
105 | fn new_with_config2(config: ParserConfig2) -> Self { |
106 | let mut lexer = Lexer::new(&config); |
107 | if let Some(enc) = config.override_encoding { |
108 | lexer.set_encoding(enc); |
109 | } |
110 | |
111 | let mut pos = Vec::with_capacity(16); |
112 | pos.push(TextPosition::new()); |
113 | |
114 | Self { |
115 | config, |
116 | lexer, |
117 | st: State::DocumentStart, |
118 | state_after_reference: State::OutsideTag, |
119 | buf: String::new(), |
120 | entities: HashMap::new(), |
121 | nst: NamespaceStack::default(), |
122 | |
123 | data: MarkupData { |
124 | name: String::new(), |
125 | doctype: None, |
126 | version: None, |
127 | encoding: None, |
128 | standalone: None, |
129 | ref_data: String::new(), |
130 | element_name: None, |
131 | quote: None, |
132 | attr_name: None, |
133 | attributes: AttributesSet::new(), |
134 | }, |
135 | final_result: None, |
136 | next_event: None, |
137 | est: Vec::new(), |
138 | pos, |
139 | |
140 | encountered: Encountered::None, |
141 | inside_whitespace: true, |
142 | read_prefix_separator: false, |
143 | pop_namespace: false, |
144 | } |
145 | } |
146 | |
147 | /// Checks if this parser ignores the end of stream errors. |
148 | pub fn is_ignoring_end_of_stream(&self) -> bool { self.config.c.ignore_end_of_stream } |
149 | |
150 | /// Retrieves the Doctype from the document if any |
151 | #[inline ] |
152 | pub fn doctype(&self) -> Option<&str> { |
153 | self.data.doctype.as_deref() |
154 | } |
155 | |
156 | #[inline (never)] |
157 | fn set_encountered(&mut self, new_encounter: Encountered) -> Option<Result> { |
158 | if new_encounter <= self.encountered { |
159 | return None; |
160 | } |
161 | let prev_enc = self.encountered; |
162 | self.encountered = new_encounter; |
163 | |
164 | // If declaration was not parsed and we have encountered an element, |
165 | // emit this declaration as the next event. |
166 | if prev_enc == Encountered::None { |
167 | self.push_pos(); |
168 | Some(Ok(XmlEvent::StartDocument { |
169 | version: DEFAULT_VERSION, |
170 | encoding: self.lexer.encoding().to_string(), |
171 | standalone: DEFAULT_STANDALONE, |
172 | })) |
173 | } else { |
174 | None |
175 | } |
176 | } |
177 | } |
178 | |
179 | impl Position for PullParser { |
180 | /// Returns the position of the last event produced by the parser |
181 | #[inline ] |
182 | fn position(&self) -> TextPosition { |
183 | self.pos.first().copied().unwrap_or_else(TextPosition::new) |
184 | } |
185 | } |
186 | |
187 | #[derive (Copy, Clone, PartialEq)] |
188 | pub enum State { |
189 | OutsideTag, |
190 | InsideOpeningTag(OpeningTagSubstate), |
191 | InsideClosingTag(ClosingTagSubstate), |
192 | InsideProcessingInstruction(ProcessingInstructionSubstate), |
193 | InsideComment, |
194 | InsideCData, |
195 | InsideDeclaration(DeclarationSubstate), |
196 | InsideDoctype(DoctypeSubstate), |
197 | InsideReference, |
198 | DocumentStart, |
199 | } |
200 | |
/// Substates used while the parser is in `State::InsideDoctype`.
///
/// Equality on this fieldless enum is total, so `Eq` is derived next to
/// `PartialEq`; `Debug` is derived so values can appear in assertions and logs.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum DoctypeSubstate {
    Outside,
    String,
    InsideName,
    BeforeEntityName,
    EntityName,
    BeforeEntityValue,
    EntityValue,
    NumericReferenceStart,
    NumericReference,
    /// expansion
    PEReferenceInValue,
    PEReferenceInDtd,
    /// name definition
    PEReferenceDefinitionStart,
    PEReferenceDefinition,
    SkipDeclaration,
    Comment,
}
221 | |
/// Substates used while the parser is in `State::InsideOpeningTag`.
///
/// `Eq` accompanies `PartialEq` because equality on this fieldless enum is
/// total; `Debug` makes values usable in assertions and diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum OpeningTagSubstate {
    InsideName,

    InsideTag,

    InsideAttributeName,
    AfterAttributeName,

    InsideAttributeValue,
    AfterAttributeValue,
}
234 | |
/// Substates used while the parser is in `State::InsideClosingTag`.
///
/// `Eq` accompanies `PartialEq` because equality on this fieldless enum is
/// total; `Debug` makes values usable in assertions and diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ClosingTagSubstate {
    CTInsideName,
    CTAfterName,
}
240 | |
/// Substates used while the parser is in `State::InsideProcessingInstruction`.
///
/// `Eq` accompanies `PartialEq` because equality on this fieldless enum is
/// total; `Debug` makes values usable in assertions and diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum ProcessingInstructionSubstate {
    PIInsideName,
    PIInsideData,
}
246 | |
/// Substates used while the parser is in `State::InsideDeclaration`.
///
/// The variants are grouped by the part of the declaration they refer to:
/// version, encoding and standalone.
///
/// `Eq` accompanies `PartialEq` because equality on this fieldless enum is
/// total; `Debug` makes values usable in assertions and diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum DeclarationSubstate {
    BeforeVersion,
    InsideVersion,
    AfterVersion,

    InsideVersionValue,
    AfterVersionValue,

    BeforeEncoding,
    InsideEncoding,
    AfterEncoding,

    InsideEncodingValue,
    AfterEncodingValue,

    BeforeStandaloneDecl,
    InsideStandaloneDecl,
    AfterStandaloneDecl,

    InsideStandaloneDeclValue,
    AfterStandaloneDeclValue,
}
270 | |
/// Kind of qualified name being read by `read_qualified_name`.
///
/// The target decides which tokens are accepted as the end of the name.
///
/// `Eq` accompanies `PartialEq` because equality on this fieldless enum is
/// total; `Debug` makes values usable in assertions and diagnostics.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum QualifiedNameTarget {
    /// Name of an attribute.
    AttributeNameTarget,
    /// Name in an opening tag.
    OpeningTagNameTarget,
    /// Name in a closing tag.
    ClosingTagNameTarget,
}
277 | |
/// The kind of quote that opened the attribute value currently being read.
#[derive(Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
    /// Counterpart of `Token::SingleQuote`.
    SingleQuoteToken,
    /// Counterpart of `Token::DoubleQuote`.
    DoubleQuoteToken,
}
283 | |
284 | impl QuoteToken { |
285 | #[inline ] |
286 | fn from_token(t: Token) -> Option<Self> { |
287 | match t { |
288 | Token::SingleQuote => Some(Self::SingleQuoteToken), |
289 | Token::DoubleQuote => Some(Self::DoubleQuoteToken), |
290 | _ => { |
291 | debug_assert!(false); |
292 | None |
293 | }, |
294 | } |
295 | } |
296 | |
297 | const fn as_token(self) -> Token { |
298 | match self { |
299 | Self::SingleQuoteToken => Token::SingleQuote, |
300 | Self::DoubleQuoteToken => Token::DoubleQuote, |
301 | } |
302 | } |
303 | } |
304 | |
305 | struct MarkupData { |
306 | name: String, // used for processing instruction name |
307 | ref_data: String, // used for reference content |
308 | |
309 | doctype: Option<String>, // keeps a copy of the original doctype |
310 | version: Option<XmlVersion>, // used for XML declaration version |
311 | encoding: Option<String>, // used for XML declaration encoding |
312 | standalone: Option<bool>, // used for XML declaration standalone parameter |
313 | |
314 | element_name: Option<OwnedName>, // used for element name |
315 | |
316 | quote: Option<QuoteToken>, // used to hold opening quote for attribute value |
317 | attr_name: Option<OwnedName>, // used to hold attribute name |
318 | attributes: AttributesSet, // used to hold all accumulated attributes |
319 | } |
320 | |
321 | impl PullParser { |
322 | /// Returns next event read from the given buffer. |
323 | /// |
324 | /// This method should be always called with the same buffer. If you call it |
325 | /// providing different buffers each time, the result will be undefined. |
326 | pub fn next<R: Read>(&mut self, r: &mut R) -> Result { |
327 | if let Some(ref ev) = self.final_result { |
328 | return ev.clone(); |
329 | } |
330 | |
331 | if let Some(ev) = self.next_event.take() { |
332 | return ev; |
333 | } |
334 | |
335 | if self.pop_namespace { |
336 | self.pop_namespace = false; |
337 | self.nst.pop(); |
338 | } |
339 | |
340 | loop { |
341 | debug_assert!(self.next_event.is_none()); |
342 | debug_assert!(!self.pop_namespace); |
343 | |
344 | // While lexer gives us Ok(maybe_token) -- we loop. |
345 | // Upon having a complete XML-event -- we return from the whole function. |
346 | match self.lexer.next_token(r) { |
347 | Ok(Token::Eof) => { |
348 | // Forward pos to the lexer head |
349 | self.next_pos(); |
350 | return self.handle_eof(); |
351 | }, |
352 | Ok(token) => match self.dispatch_token(token) { |
353 | None => continue, |
354 | Some(Ok(xml_event)) => { |
355 | self.next_pos(); |
356 | return Ok(xml_event); |
357 | }, |
358 | Some(Err(xml_error)) => { |
359 | self.next_pos(); |
360 | return self.set_final_result(Err(xml_error)); |
361 | }, |
362 | }, |
363 | Err(lexer_error) => { |
364 | self.next_pos(); |
365 | return self.set_final_result(Err(lexer_error)); |
366 | }, |
367 | } |
368 | } |
369 | } |
370 | |
371 | /// Handle end of stream |
372 | #[cold ] |
373 | fn handle_eof(&mut self) -> std::result::Result<XmlEvent, super::Error> { |
374 | let ev = if self.depth() == 0 { |
375 | if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok |
376 | Ok(XmlEvent::EndDocument) |
377 | } else if self.encountered < Encountered::Element { |
378 | self.error(SyntaxError::NoRootElement) |
379 | } else { // self.st != State::OutsideTag |
380 | self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint? |
381 | } |
382 | } else if self.config.c.ignore_end_of_stream { |
383 | self.final_result = None; |
384 | self.lexer.reset_eof_handled(); |
385 | return self.error(SyntaxError::UnbalancedRootElement); |
386 | } else { |
387 | self.error(SyntaxError::UnbalancedRootElement) |
388 | }; |
389 | self.set_final_result(ev) |
390 | } |
391 | |
392 | // This function is to be called when a terminal event is reached. |
393 | // The function sets up the `self.final_result` into `Some(result)` and return `result`. |
394 | #[inline ] |
395 | fn set_final_result(&mut self, result: Result) -> Result { |
396 | self.final_result = Some(result.clone()); |
397 | result |
398 | } |
399 | |
400 | #[cold ] |
401 | fn error(&self, e: SyntaxError) -> Result { |
402 | Err(Error { |
403 | pos: self.lexer.position(), |
404 | kind: ErrorKind::Syntax(e.to_cow()), |
405 | }) |
406 | } |
407 | |
408 | #[inline ] |
409 | fn next_pos(&mut self) { |
410 | // unfortunately calls to next_pos will never be perfectly balanced with push_pos, |
411 | // at very least because parse errors and EOF can happen unexpectedly without a prior push. |
412 | if !self.pos.is_empty() { |
413 | if self.pos.len() > 1 { |
414 | self.pos.remove(0); |
415 | } else { |
416 | self.pos[0] = self.lexer.position(); |
417 | } |
418 | } |
419 | } |
420 | |
421 | #[inline ] |
422 | #[track_caller ] |
423 | fn push_pos(&mut self) { |
424 | debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events. |
425 | This case is ignored in release mode, and merely causes document positions to be out of sync. |
426 | Please file a bug and include the XML document that triggers this assert." ); |
427 | |
428 | // it has capacity preallocated for more than it ever needs, so this reduces code size |
429 | if self.pos.len() != self.pos.capacity() { |
430 | self.pos.push(self.lexer.position()); |
431 | } else if self.pos.len() > 1 { |
432 | self.pos.remove(0); // this mitigates the excessive push_pos() call |
433 | } |
434 | } |
435 | |
436 | #[inline (never)] |
437 | fn dispatch_token(&mut self, t: Token) -> Option<Result> { |
438 | match self.st { |
439 | State::OutsideTag => self.outside_tag(t), |
440 | State::InsideOpeningTag(s) => self.inside_opening_tag(t, s), |
441 | State::InsideClosingTag(s) => self.inside_closing_tag_name(t, s), |
442 | State::InsideReference => self.inside_reference(t), |
443 | State::InsideComment => self.inside_comment(t), |
444 | State::InsideCData => self.inside_cdata(t), |
445 | State::InsideProcessingInstruction(s) => self.inside_processing_instruction(t, s), |
446 | State::InsideDoctype(s) => self.inside_doctype(t, s), |
447 | State::InsideDeclaration(s) => self.inside_declaration(t, s), |
448 | State::DocumentStart => self.document_start(t), |
449 | } |
450 | } |
451 | |
452 | #[inline ] |
453 | fn depth(&self) -> usize { |
454 | self.est.len() |
455 | } |
456 | |
457 | #[inline ] |
458 | fn buf_has_data(&self) -> bool { |
459 | !self.buf.is_empty() |
460 | } |
461 | |
462 | #[inline ] |
463 | fn take_buf(&mut self) -> String { |
464 | std::mem::take(&mut self.buf) |
465 | } |
466 | |
467 | #[inline ] |
468 | fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> { |
469 | self.st = st; |
470 | ev |
471 | } |
472 | |
473 | #[inline ] |
474 | fn into_state_continue(&mut self, st: State) -> Option<Result> { |
475 | self.into_state(st, None) |
476 | } |
477 | |
478 | #[inline ] |
479 | fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> { |
480 | self.into_state(st, Some(ev)) |
481 | } |
482 | |
483 | /// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed, |
484 | /// an error is returned. |
485 | /// |
486 | /// # Parameters |
487 | /// * `t` --- next token; |
488 | /// * `on_name` --- a callback which is executed when whitespace is encountered. |
489 | fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result> |
490 | where F: Fn(&mut Self, Token, OwnedName) -> Option<Result> { |
491 | // We can get here for the first time only when self.data.name contains zero or one character, |
492 | // but first character cannot be a colon anyway |
493 | if self.buf.len() <= 1 { |
494 | self.read_prefix_separator = false; |
495 | } |
496 | |
497 | let invoke_callback = move |this: &mut Self, t| { |
498 | let name = this.take_buf(); |
499 | match name.parse() { |
500 | Ok(name) => on_name(this, t, name), |
501 | Err(()) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))), |
502 | } |
503 | }; |
504 | |
505 | match t { |
506 | // There can be only one colon, and not as the first character |
507 | Token::Character(':' ) if self.buf_has_data() && !self.read_prefix_separator => { |
508 | self.buf.push(':' ); |
509 | self.read_prefix_separator = true; |
510 | None |
511 | }, |
512 | |
513 | Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) || |
514 | self.buf_has_data() && is_name_char(c)) => { |
515 | if self.buf.len() > self.config.max_name_length { |
516 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
517 | } |
518 | self.buf.push(c); |
519 | None |
520 | }, |
521 | |
522 | Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t), |
523 | |
524 | Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t), |
525 | |
526 | Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget || |
527 | target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t), |
528 | |
529 | Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t), |
530 | |
531 | _ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))), |
532 | } |
533 | } |
534 | |
535 | /// Dispatches tokens in order to process attribute value. |
536 | /// |
537 | /// # Parameters |
538 | /// * `t` --- next token; |
539 | /// * `on_value` --- a callback which is called when terminating quote is encountered. |
540 | fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result> |
541 | where F: Fn(&mut Self, String) -> Option<Result> { |
542 | match t { |
543 | Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace |
544 | |
545 | Token::DoubleQuote | Token::SingleQuote => match self.data.quote { |
546 | None => { // Entered attribute value |
547 | self.data.quote = QuoteToken::from_token(t); |
548 | None |
549 | }, |
550 | Some(q) if q.as_token() == t => { |
551 | self.data.quote = None; |
552 | let value = self.take_buf(); |
553 | on_value(self, value) |
554 | }, |
555 | _ => { |
556 | if let Token::Character(c) = t { |
557 | if !self.is_valid_xml_char_not_restricted(c) { |
558 | return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))); |
559 | } |
560 | } |
561 | if self.buf.len() > self.config.max_attribute_length { |
562 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
563 | } |
564 | t.push_to_string(&mut self.buf); |
565 | None |
566 | }, |
567 | }, |
568 | |
569 | Token::ReferenceStart if self.data.quote.is_some() => { |
570 | self.state_after_reference = self.st; |
571 | self.into_state_continue(State::InsideReference) |
572 | }, |
573 | |
574 | Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)), |
575 | |
576 | Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => { |
577 | Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32))) |
578 | }, |
579 | |
580 | // Every character except " and ' and < is okay |
581 | _ if self.data.quote.is_some() => { |
582 | if self.buf.len() > self.config.max_attribute_length { |
583 | return Some(self.error(SyntaxError::ExceededConfiguredLimit)); |
584 | } |
585 | t.push_to_string(&mut self.buf); |
586 | None |
587 | }, |
588 | |
589 | _ => Some(self.error(SyntaxError::UnexpectedToken(t))), |
590 | } |
591 | } |
592 | |
593 | fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> { |
594 | let mut name = self.data.take_element_name()?; |
595 | let mut attributes = self.data.take_attributes().into_vec(); |
596 | |
597 | // check whether the name prefix is bound and fix its namespace |
598 | match self.nst.get(name.borrow().prefix_repr()) { |
599 | Some("" ) => name.namespace = None, // default namespace |
600 | Some(ns) => name.namespace = Some(ns.into()), |
601 | None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))), |
602 | } |
603 | |
604 | // check and fix accumulated attributes prefixes |
605 | for attr in &mut attributes { |
606 | if let Some(ref pfx) = attr.name.prefix { |
607 | let new_ns = match self.nst.get(pfx) { |
608 | Some("" ) => None, // default namespace |
609 | Some(ns) => Some(ns.into()), |
610 | None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into()))), |
611 | }; |
612 | attr.name.namespace = new_ns; |
613 | } |
614 | } |
615 | |
616 | if emit_end_element { |
617 | self.pop_namespace = true; |
618 | self.next_event = Some(Ok(XmlEvent::EndElement { |
619 | name: name.clone() |
620 | })); |
621 | } else { |
622 | self.est.push(name.clone()); |
623 | } |
624 | let namespace = self.nst.squash(); |
625 | self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement { |
626 | name, |
627 | attributes, |
628 | namespace |
629 | })) |
630 | } |
631 | |
632 | fn emit_end_element(&mut self) -> Option<Result> { |
633 | let mut name = self.data.take_element_name()?; |
634 | |
635 | // check whether the name prefix is bound and fix its namespace |
636 | match self.nst.get(name.borrow().prefix_repr()) { |
637 | Some("" ) => name.namespace = None, // default namespace |
638 | Some(ns) => name.namespace = Some(ns.into()), |
639 | None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into()))), |
640 | } |
641 | |
642 | let op_name = self.est.pop()?; |
643 | |
644 | if name == op_name { |
645 | self.pop_namespace = true; |
646 | self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name })) |
647 | } else { |
648 | Some(self.error(SyntaxError::UnexpectedClosingTag(format!(" {name} != {op_name}" ).into()))) |
649 | } |
650 | } |
651 | |
652 | #[inline ] |
653 | fn is_valid_xml_char(&self, c: char) -> bool { |
654 | if Some(XmlVersion::Version11) == self.data.version { |
655 | is_xml11_char(c) |
656 | } else { |
657 | is_xml10_char(c) |
658 | } |
659 | } |
660 | |
661 | #[inline ] |
662 | fn is_valid_xml_char_not_restricted(&self, c: char) -> bool { |
663 | if Some(XmlVersion::Version11) == self.data.version { |
664 | is_xml11_char_not_restricted(c) |
665 | } else { |
666 | is_xml10_char(c) |
667 | } |
668 | } |
669 | } |
670 | |
#[cfg(test)]
mod tests {
    use crate::attribute::OwnedAttribute;
    use crate::common::TextPosition;
    use crate::name::OwnedName;
    use crate::reader::events::XmlEvent;
    use crate::reader::parser::PullParser;
    use crate::reader::ParserConfig;
    use std::io::BufReader;

    /// Parser with the default configuration.
    fn new_parser() -> PullParser {
        PullParser::new(ParserConfig::new())
    }

    /// Pulls the next event from parser `$p` reading from `$r` and panics unless
    /// it matches pattern `$t` (and, in the second form, satisfies condition `$c`).
    macro_rules! expect_event(
        ($r:expr, $p:expr, $t:pat) => (
            match $p.next(&mut $r) {
                $t => {}
                e => panic!("Unexpected event: {e:?} \nExpected: {}", stringify!($t))
            }
        );
        ($r:expr, $p:expr, $t:pat => $c:expr ) => (
            match $p.next(&mut $r) {
                $t if $c => {}
                e => panic!("Unexpected event: {e:?} \nExpected: {} if {}", stringify!($t), stringify!($c))
            }
        )
    );

    /// Creates a `(reader, parser)` pair for the given document text.
    macro_rules! test_data(
        ($d:expr) => ({
            static DATA: &str = $d;
            let r = BufReader::new(DATA.as_bytes());
            let p = new_parser();
            (r, p)
        })
    );

    #[test]
    fn issue_3_semicolon_in_attribute_value() {
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz;zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
            *name == OwnedName::local("a") &&
             attributes.len() == 1 &&
             attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") &&
             namespace.is_essentially_empty()
        );
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn issue_140_entity_reference_inside_tag() {
        // The document must contain a character reference (not the literal
        // character), otherwise the reference handling is not exercised.
        let (mut r, mut p) = test_data!(r"
            <bla>&#9835;</bla>
        ");

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
        expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn issue_220_comment() {
        let (mut r, mut p) = test_data!(r"<x><!-- <!--></x>");
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));

        let (mut r, mut p) = test_data!(r"<x><!-- <!---></x>");
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Err(_)); // ---> is forbidden in comments

        let (mut r, mut p) = test_data!(r"<x><!--<text&x;> <!--></x>");
        p.config.c.ignore_comments = false;
        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == "<text&x;> <!");
        expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
        expect_event!(r, p, Ok(XmlEvent::EndDocument));
    }

    #[test]
    fn malformed_declaration_attrs() {
        let (mut r, mut p) = test_data!(r#"<?xml version x="1.0"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0" version="1.0"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0"encoding="utf-8"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0"standalone="yes"?>"#);
        expect_event!(r, p, Err(_));

        let (mut r, mut p) = test_data!(r#"<?xml version="1.0" encoding="utf-8"standalone="yes"?>"#);
        expect_event!(r, p, Err(_));
    }

    #[test]
    fn opening_tag_in_attribute_value() {
        use crate::reader::error::{SyntaxError, Error, ErrorKind};

        // NOTE: the asserted column depends on the 12 spaces of indentation
        // inside the raw string; do not re-indent the document line.
        let (mut r, mut p) = test_data!(r#"
            <a attr="zzz<zzz" />
        "#);

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Err(ref e) =>
            *e == Error {
                kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()),
                pos: TextPosition { row: 1, column: 24 }
            }
        );
    }

    #[test]
    fn reference_err() {
        let (mut r, mut p) = test_data!(r"
            <a>&&</a>
        ");

        expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
        expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
        expect_event!(r, p, Err(_));
    }

    /// Guards the compact representation of the parser state.
    #[test]
    fn state_size() {
        assert_eq!(2, std::mem::size_of::<super::State>());
        assert_eq!(1, std::mem::size_of::<super::DoctypeSubstate>());
    }
}
813 | |