1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::ops::Range;
6
7use crate::encoding::Decoder;
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::reader::parser::Parser;
11
12use memchr;
13
/// Generates the configuration setters of a reader.
///
/// `$holder` is an optional field name: when it is given, the parser is reached
/// as `self.$holder.parser`, otherwise as `self.parser`. Every generated setter
/// returns `&mut Self`, so calls can be chained.
macro_rules! configure_methods {
    ($($holder:ident)?) => {
        /// Changes whether empty elements should be split into a [`Start`] and an [`End`] event.
        ///
        /// When set to `true`, all [`Empty`] events produced by a self-closing tag like `<tag/>` are
        /// expanded into a [`Start`] event followed by an [`End`] event. When set to `false` (the
        /// default), those tags are represented by an [`Empty`] event instead.
        ///
        /// Note that setting this to `true` will lead to additional allocations that
        /// are needed to store the tag name for an [`End`] event. However, if [`check_end_names`]
        /// is also set, only one additional allocation will be performed that supports
        /// both of these options.
        ///
        /// (`false` by default)
        ///
        /// [`Empty`]: Event::Empty
        /// [`Start`]: Event::Start
        /// [`End`]: Event::End
        /// [`check_end_names`]: Self::check_end_names
        pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.expand_empty_elements = val;
            self
        }

        /// Changes whether whitespace before and after character data should be removed.
        ///
        /// When set to `true`, all [`Text`] events are trimmed.
        /// If the event is empty after that, it will not be pushed.
        ///
        /// Changing this option automatically changes the [`trim_text_end`] option.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`trim_text_end`]: Self::trim_text_end
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.trim_text_start = val;
            self $(.$holder)? .parser.trim_text_end = val;
            self
        }

        /// Changes whether whitespace after character data should be removed.
        ///
        /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
        /// If the event is empty after that, it will not be pushed.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text_end(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.trim_text_end = val;
            self
        }

        /// Changes whether trailing whitespace after the markup name is trimmed in closing tags
        /// `</a >`.
        ///
        /// If `true`, the emitted [`End`] event is stripped of trailing whitespace after the
        /// markup name.
        ///
        /// Note that if set to `false` and `check_end_names` is `true`, the comparison of markup
        /// names is going to fail erroneously if a closing tag contains trailing whitespace.
        ///
        /// (`true` by default)
        ///
        /// [`End`]: Event::End
        pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.trim_markup_names_in_closing_tags = val;
            self
        }

        /// Changes whether mismatched closing tag names should be detected.
        ///
        /// Note that start and end tags [should match literally][spec]; they cannot
        /// have different prefixes even if both prefixes resolve to the same namespace.
        /// The XML
        ///
        /// ```xml
        /// <outer xmlns="namespace" xmlns:p="namespace">
        /// </p:outer>
        /// ```
        ///
        /// is not valid, even though semantically the start tag is the same as the
        /// end tag. The reason is that namespaces are an extension of the original
        /// XML specification (without namespaces) and it should be backward-compatible.
        ///
        /// When set to `false`, it won't check if a closing tag matches the corresponding opening tag.
        /// For example, `<mytag></different_tag>` will be permitted.
        ///
        /// If the XML is known to be sane (already processed, etc.) this saves extra time.
        ///
        /// Note that the emitted [`End`] event will not be modified if this is disabled, i.e. it
        /// will contain the data of the mismatched end tag.
        ///
        /// Note that setting this to `true` will lead to additional allocations that
        /// are needed to store the tag name for an [`End`] event. However, if
        /// [`expand_empty_elements`] is also set, only one additional allocation will be
        /// performed that supports both of these options.
        ///
        /// (`true` by default)
        ///
        /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
        /// [`End`]: Event::End
        /// [`expand_empty_elements`]: Self::expand_empty_elements
        pub fn check_end_names(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.check_end_names = val;
            self
        }

        /// Changes whether comments should be validated.
        ///
        /// When set to `true`, every [`Comment`] event will be checked for not containing `--`, which
        /// is not allowed in XML comments. Most of the time we don't want comments at all so we don't
        /// really care about comment correctness, thus the default value is `false` to improve
        /// performance.
        ///
        /// (`false` by default)
        ///
        /// [`Comment`]: Event::Comment
        pub fn check_comments(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .parser.check_comments = val;
            self
        }
    };
}
160
/// Body of the event-reading loop, driven by `$self.parser.state`.
///
/// # Parameters
/// - `$self`: the reader; its `parser` field holds the state and configuration
/// - `$buf`: buffer that is handed to the `$read_until_*` methods
/// - `$reader`: expression that evaluates to the underlying XML source
/// - `$read_until_open`: method of `$self` used in the `Init` and `ClosedTag` states
/// - `$read_until_close`: method of `$self` used in the `OpenedTag` state
/// - `$await`: optional `await` token appended to every call on the source
///
/// Evaluates to the `Result` with the event that was read. When that result is
/// an error or `Event::Eof`, the parser is moved to the `Exit` state first.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_open:ident,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        // Loop until some state produces an event (or an error)
        let event = loop {
            match $self.parser.state {
                ParseState::Init => { // Go to OpenedTag state
                    // If the encoding was set explicitly, we do not need to detect it. For
                    // example, explicit UTF-8 is set automatically if the Reader was created
                    // using `from_str`. But we still need to remove the BOM for consistency
                    // with the code path used when the `encoding` feature is disabled.
                    //
                    // NOTE: an error propagated by `?` in this arm leaves the enclosing
                    // function immediately, so the `Exit` state update below is skipped
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.parser.encoding.can_be_refined() {
                            $self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        // An event was produced, return it
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event, the buffer is returned back: continue the loop with it
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                ParseState::ClosedTag => { // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        // An event was produced, return it
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event, the buffer is returned back: continue the loop with it
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                // Go to ClosedTag state in next two arms
                ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?,
                ParseState::Empty => break $self.parser.close_expanded_empty(),
                // Once in the Exit state, only Eof is returned
                ParseState::Exit => break Ok(Event::Eof),
            };
        };
        // Both an error and the end of input terminate the parsing
        match event {
            Err(_) | Ok(Event::Eof) => $self.parser.state = ParseState::Exit,
            _ => {}
        }
        event
    }};
}
214
/// Reads bytes up to the `<` symbol and moves the parser to the `OpenedTag` state.
///
/// If the current byte (after skipping all space characters if
/// [`Parser::trim_text_start`] is `true`) is already `<`, that byte is consumed
/// and no event is produced: the enclosing function returns `Ok(Err($buf))`,
/// handing the buffer back so the parsing loop can continue with it.
/// Otherwise the bytes before the next `<` are read and passed to
/// `emit_text`; if the input is exhausted, `Event::Eof` is returned.
///
/// This code is executed in two cases:
/// - after start of parsing just after skipping BOM if it is present
/// - after parsing `</tag>` or `<tag>`
///
/// # Parameters
/// - `$self`: the reader; its `parser` field holds the state and configuration
/// - `$buf`: buffer that is handed to the source
/// - `$reader`: expression that evaluates to the underlying XML source
/// - `$read_event`: accepted, but not used by the expansion
/// - `$await`: optional `await` token appended to every call on the source
///
/// NOTE: the expansion contains a `return` and `?`, so it must be used as the
/// body of a function that returns `Result<Result<Event, B>>`.
macro_rules! read_until_open {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_event:ident
        $(, $await:ident)?
    ) => {{
        $self.parser.state = ParseState::OpenedTag;

        if $self.parser.trim_text_start {
            $reader.skip_whitespace(&mut $self.parser.offset) $(.$await)? ?;
        }

        // If we are already at the `<` symbol, do not try to return an empty Text event
        if $reader.skip_one(b'<', &mut $self.parser.offset) $(.$await)? ? {
            // Pass $buf to the next iteration of the parsing loop
            return Ok(Err($buf));
        }

        match $reader
            .read_bytes_until(b'<', $buf, &mut $self.parser.offset)
            $(.$await)?
        {
            // Return Text event with `bytes` content
            Ok(Some(bytes)) => $self.parser.emit_text(bytes).map(Ok),
            // Input is exhausted
            Ok(None) => Ok(Ok(Event::Eof)),
            Err(e) => Err(e),
        }
    }};
}
254
/// Read bytes up to the `>` and skip it. This macro is expected to be expanded
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol without consuming it and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// If the input is exhausted, `Event::Eof` is returned.
///
/// Moves parser to the `ClosedTag` state.
///
/// # Parameters
/// - `$self`: the reader; its `parser` field holds the state and configuration
/// - `$buf`: buffer that is handed to the source
/// - `$reader`: expression that evaluates to the underlying XML source
/// - `$await`: optional `await` token appended to every call on the source
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.parser.state = ParseState::ClosedTag;

        // The first byte after `<` determines the kind of markup
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some((bang_type, bytes))) => $self.parser.emit_bang(bang_type, bytes),
                Err(e) => Err(e),
            },
            // `</` - closing tag
            Ok(Some(b'/')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.parser.emit_end(bytes),
                Err(e) => Err(e),
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.parser.emit_question_mark(bytes),
                Err(e) => Err(e),
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_element($buf, &mut $self.parser.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.parser.emit_start(bytes),
                Err(e) => Err(e),
            },
            // Nothing after `<`: input is exhausted
            Ok(None) => Ok(Event::Eof),
            Err(e) => Err(e),
        }
    }};
}
325
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events with `$read_event` until an `End` event named `$end` is found
/// that is not paired with a `Start` event of the same name read by this
/// expansion. Evaluates to the range `start..end`, where `start` is the
/// position at which the expansion began reading and `end` is the position
/// just before the matching `End` event was read.
///
/// The expansion returns from the enclosing function with an error if reading
/// of an event fails, or with `Error::UnexpectedEof` if the input is exhausted
/// before the matching `End` event is found.
///
/// # Parameters
/// - `$self`: the reader
/// - `$end`: name of the end tag to search for
/// - `$buf`: buffer that is passed to `$read_event`
/// - `$read_event`: method of `$self` used to read each event
/// - `$clear`: block executed before each event is read (see below)
/// - `$await`: optional `await` token appended to the `$read_event` call
macro_rules! read_to_end {
    (
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        let start = $self.buffer_position();
        // Count of the seen `Start` events named `$end` that are not closed yet
        let mut depth = 0;
        loop {
            $clear
            // Position before the event, i.e. the end of the span if that
            // event turns out to be the matching `End`
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => return Err(e),

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    let name = $self.decoder().decode($end.as_ref());
                    return Err(Error::UnexpectedEof(format!("</{:?}>", name)));
                }
                _ => (),
            }
        }
    }};
}
359
360#[cfg(feature = "async-tokio")]
361mod async_tokio;
362mod buffered_reader;
363mod ns_reader;
364mod parser;
365mod slice_reader;
366
367pub use ns_reader::NsReader;
368
/// Half-open range of byte offsets (`start..end`) in the input that corresponds
/// to some piece of XML
pub type Span = Range<usize>;
371
372////////////////////////////////////////////////////////////////////////////////////////////////////
373
/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Reader::expand_empty_elements()`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init -- "(no event)"\n --> OpenedTag
///     OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
///     ClosedTag -- "#lt;false#gt;\n(no event)"\nText --> OpenedTag
///   end
///   ClosedTag -- "#lt;true#gt;"\nStart --> Empty
///   Empty -- End --> ClosedTag
///   _ -. Eof .-> Exit
/// ```
#[derive(Clone)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Transition from that
    /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
    /// state is always `OpenedTag`. The reader will never return to this state. The
    /// event emitted during the transition to `OpenedTag` is a `Text` event if the
    /// first symbol is not `<`, otherwise no events are emitted.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `ClosedTag` state.
    OpenedTag,
    /// State in which the reader searches for the `<` symbol of a markup. All bytes
    /// before that symbol will be returned in the [`Event::Text`] event. After that
    /// the reader moves to the `OpenedTag` state.
    ClosedTag,
    /// This state is used only if the option [`expand_empty_elements`] is set to `true`.
    /// The reader enters this state when it is in a `ClosedTag` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `ClosedTag` state.
    ///
    /// [`expand_empty_elements`]: Parser::expand_empty_elements
    Empty,
    /// The reader enters this state when an `Eof` event is generated or an error
    /// occurs. This is the last state, the reader stays in it forever.
    Exit,
}
418
/// An encoding paired with the knowledge of where it came from.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
enum EncodingRef {
    /// The encoding was only assumed. A BOM or the XML declaration event
    /// (`<?xml encoding=... ?>`) can still refine it
    Implicit(&'static Encoding),
    /// The encoding was requested explicitly. Neither a BOM nor the XML
    /// declaration (`<?xml encoding=... ?>`) can change it
    Explicit(&'static Encoding),
    /// The encoding was derived from a byte order mark (BOM) or from the first
    /// bytes of the content. The XML declaration event (`<?xml encoding=... ?>`)
    /// can still refine it
    BomDetected(&'static Encoding),
    /// The encoding was taken from the XML declaration event
    /// (`<?xml encoding=... ?>`). It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding regardless of how it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(enc)
            | Self::Explicit(enc)
            | Self::BomDetected(enc)
            | Self::XmlDetected(enc) => enc,
        }
    }
    /// Tells whether a more reliable source is still allowed to replace this encoding.
    #[inline]
    fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}
465
466////////////////////////////////////////////////////////////////////////////////////////////////////
467
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // When the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Clone)]
pub struct Reader<R> {
    /// Source of the data to parse
    reader: R,
    /// Configuration and current parse state
    parser: Parser,
}
529
530/// Builder methods
531impl<R> Reader<R> {
532 /// Creates a `Reader` that reads from a given reader.
533 pub fn from_reader(reader: R) -> Self {
534 Self {
535 reader,
536 parser: Parser::default(),
537 }
538 }
539
540 configure_methods!();
541}
542
543/// Getters
544impl<R> Reader<R> {
545 /// Consumes `Reader` returning the underlying reader
546 ///
547 /// Can be used to compute line and column of a parsing error position
548 ///
549 /// # Examples
550 ///
551 /// ```
552 /// # use pretty_assertions::assert_eq;
553 /// use std::{str, io::Cursor};
554 /// use quick_xml::events::Event;
555 /// use quick_xml::reader::Reader;
556 ///
557 /// let xml = r#"<tag1 att1 = "test">
558 /// <tag2><!--Test comment-->Test</tag2>
559 /// <tag3>Test 2</tag3>
560 /// </tag1>"#;
561 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
562 /// let mut buf = Vec::new();
563 ///
564 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
565 /// let end_pos = reader.buffer_position();
566 /// let mut cursor = reader.into_inner();
567 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
568 /// .expect("can't make a string");
569 /// let mut line = 1;
570 /// let mut column = 0;
571 /// for c in s.chars() {
572 /// if c == '\n' {
573 /// line += 1;
574 /// column = 0;
575 /// } else {
576 /// column += 1;
577 /// }
578 /// }
579 /// (line, column)
580 /// }
581 ///
582 /// loop {
583 /// match reader.read_event_into(&mut buf) {
584 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
585 /// b"tag1" | b"tag2" => (),
586 /// tag => {
587 /// assert_eq!(b"tag3", tag);
588 /// assert_eq!((3, 22), into_line_and_column(reader));
589 /// break;
590 /// }
591 /// },
592 /// Ok(Event::Eof) => unreachable!(),
593 /// _ => (),
594 /// }
595 /// buf.clear();
596 /// }
597 /// ```
598 pub fn into_inner(self) -> R {
599 self.reader
600 }
601
602 /// Gets a reference to the underlying reader.
603 pub fn get_ref(&self) -> &R {
604 &self.reader
605 }
606
607 /// Gets a mutable reference to the underlying reader.
608 pub fn get_mut(&mut self) -> &mut R {
609 &mut self.reader
610 }
611
612 /// Gets the current byte position in the input data.
613 ///
614 /// Useful when debugging errors.
615 pub fn buffer_position(&self) -> usize {
616 // when internal state is OpenedTag, we have actually read until '<',
617 // which we don't want to show
618 if let ParseState::OpenedTag = self.parser.state {
619 self.parser.offset - 1
620 } else {
621 self.parser.offset
622 }
623 }
624
625 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
626 ///
627 /// If `encoding` feature is enabled, the used encoding may change after
628 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
629 ///
630 /// If `encoding` feature is enabled and no encoding is specified in declaration,
631 /// defaults to UTF-8.
632 #[inline]
633 pub fn decoder(&self) -> Decoder {
634 self.parser.decoder()
635 }
636}
637
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// Expands [`read_event_impl!`] with the synchronous `read_until_open` and
    /// `read_until_close` methods defined below.
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
    }

    /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
    ///
    /// Returns inner `Ok` if the loop should be broken and an event returned.
    /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular.
    ///
    /// The body must stay a bare expansion of [`read_until_open!`]: the macro
    /// contains a `return` that has to leave this function.
    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<std::result::Result<Event<'i>, B>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_open!(self, buf, self.reader, read_event_impl)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
670
671////////////////////////////////////////////////////////////////////////////////////////////////////
672
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait. The generic one reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]`: instead of copying data it returns
/// data borrowed from `Self`. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;

    /// Read input until `byte` is found or end of input is reached.
    ///
    /// Returns a slice of data read up to `byte`; the `byte` itself is not
    /// included in the result.
    /// If input (`Self`) is exhausted, returns `None`.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let mut position = 0;
    /// let mut input = b"abc*def".as_ref();
    /// //                    ^= 4
    ///
    /// assert_eq!(
    ///     input.read_bytes_until(b'*', (), &mut position).unwrap(),
    ///     Some(b"abc".as_ref())
    /// );
    /// assert_eq!(position, 4); // position after the symbol matched
    /// ```
    ///
    /// # Parameters
    /// - `byte`: Byte to search for
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bytes_until(
        &mut self,
        byte: u8,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<&'r [u8]>>;

    /// Read input until a comment, CDATA section or DOCTYPE declaration is finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the kind of the element together with a slice of data read up to
    /// the end of the element (`>`); the `>` itself is not included in the result.
    ///
    /// If input (`Self`) is exhausted and nothing was read, returns `None`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<(BangType, &'r [u8])>>;

    /// Read input until the XML element is closed by reaching a `>` symbol.
    /// Returns `Some(buffer)` that contains the data between `<` and `>` or
    /// `None` if end-of-input was reached and nothing was read.
    ///
    /// Derived from `read_until`, but modified to handle XML attributes
    /// using a minimal state machine.
    ///
    /// Attribute values are [defined] as follows:
    /// ```plain
    /// AttValue := '"' (([^<&"]) | Reference)* '"'
    ///           | "'" (([^<&']) | Reference)* "'"
    /// ```
    /// (`Reference` is something like `&quot;`, but we don't care about
    /// escaped characters at this level)
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
    /// [events]: crate::events::Event
    fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by the amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;

    /// Consume and discard one character if it matches the given byte. Return
    /// `true` if it matched.
    ///
    /// # Parameters
    /// - `position`: Will be increased by 1 if byte is matched
    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> Result<Option<u8>>;
}
792
793/// Possible elements started with `<!`
794#[derive(Debug, PartialEq)]
795enum BangType {
796 /// <![CDATA[...]]>
797 CData,
798 /// <!--...-->
799 Comment,
800 /// <!DOCTYPE...>
801 DocType,
802}
803impl BangType {
804 #[inline(always)]
805 fn new(byte: Option<u8>) -> Result<Self> {
806 Ok(match byte {
807 Some(b'[') => Self::CData,
808 Some(b'-') => Self::Comment,
809 Some(b'D') | Some(b'd') => Self::DocType,
810 Some(b) => return Err(Error::UnexpectedBang(b)),
811 None => return Err(Error::UnexpectedEof("Bang".to_string())),
812 })
813 }
814
815 /// If element is finished, returns its content up to `>` symbol and
816 /// an index of this symbol, otherwise returns `None`
817 ///
818 /// # Parameters
819 /// - `buf`: buffer with data consumed on previous iterations
820 /// - `chunk`: data read on current iteration and not yet consumed from reader
821 #[inline(always)]
822 fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
823 for i in memchr::memchr_iter(b'>', chunk) {
824 match self {
825 // Need to read at least 6 symbols (`!---->`) for properly finished comment
826 // <!----> - XML comment
827 // 012345 - i
828 Self::Comment if buf.len() + i > 4 => {
829 if chunk[..i].ends_with(b"--") {
830 // We cannot strip last `--` from the buffer because we need it in case of
831 // check_comments enabled option. XML standard requires that comment
832 // will not end with `--->` sequence because this is a special case of
833 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
834 return Some((&chunk[..i], i + 1)); // +1 for `>`
835 }
836 // End sequence `-|->` was splitted at |
837 // buf --/ \-- chunk
838 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
839 return Some((&chunk[..i], i + 1)); // +1 for `>`
840 }
841 // End sequence `--|>` was splitted at |
842 // buf --/ \-- chunk
843 if i == 0 && buf.ends_with(b"--") {
844 return Some((&[], i + 1)); // +1 for `>`
845 }
846 }
847 Self::Comment => {}
848 Self::CData => {
849 if chunk[..i].ends_with(b"]]") {
850 return Some((&chunk[..i], i + 1)); // +1 for `>`
851 }
852 // End sequence `]|]>` was splitted at |
853 // buf --/ \-- chunk
854 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
855 return Some((&chunk[..i], i + 1)); // +1 for `>`
856 }
857 // End sequence `]]|>` was splitted at |
858 // buf --/ \-- chunk
859 if i == 0 && buf.ends_with(b"]]") {
860 return Some((&[], i + 1)); // +1 for `>`
861 }
862 }
863 Self::DocType => {
864 let content = &chunk[..i];
865 let balance = memchr::memchr2_iter(b'<', b'>', content)
866 .map(|p| if content[p] == b'<' { 1i32 } else { -1 })
867 .sum::<i32>();
868 if balance == 0 {
869 return Some((content, i + 1)); // +1 for `>`
870 }
871 }
872 }
873 }
874 None
875 }
876 #[inline]
877 fn to_err(&self) -> Error {
878 let bang_str = match self {
879 Self::CData => "CData",
880 Self::Comment => "Comment",
881 Self::DocType => "DOCTYPE",
882 };
883 Error::UnexpectedEof(bang_str.to_string())
884 }
885}
886
887/// State machine for the [`XmlSource::read_element`]
888#[derive(Clone, Copy)]
889enum ReadElementState {
890 /// The initial state (inside element, but outside of attribute value)
891 Elem,
892 /// Inside a single-quoted attribute value
893 SingleQ,
894 /// Inside a double-quoted attribute value
895 DoubleQ,
896}
897impl ReadElementState {
898 /// Changes state by analyzing part of input.
899 /// Returns a tuple with part of chunk up to element closing symbol `>`
900 /// and a position after that symbol or `None` if such symbol was not found
901 #[inline(always)]
902 fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
903 for i: usize in memchr::memchr3_iter(needle1:b'>', needle2:b'\'', needle3:b'"', haystack:chunk) {
904 *self = match (*self, chunk[i]) {
905 // only allowed to match `>` while we are in state `Elem`
906 (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)),
907 (Self::Elem, b'\'') => Self::SingleQ,
908 (Self::Elem, b'\"') => Self::DoubleQ,
909
910 // the only end_byte that gets us out if the same character
911 (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem,
912
913 // all other bytes: no state change
914 _ => *self,
915 };
916 }
917 None
918 }
919}
920
/// Tells whether the byte is one of the whitespace bytes: blank, carriage
/// return, new line or tab
#[inline]
pub(crate) const fn is_whitespace(b: u8) -> bool {
    match b {
        b' ' | b'\r' | b'\n' | b'\t' => true,
        _ => false,
    }
}
926
927////////////////////////////////////////////////////////////////////////////////////////////////////
928
929#[cfg(test)]
930mod test {
931 /// Checks the internal implementation of the various reader methods
932 macro_rules! check {
933 (
934 #[$test:meta]
935 $read_event:ident,
936 $read_until_close:ident,
937 // constructor of the XML source on which internal functions will be called
938 $source:path,
            // constructor of the buffer to which read data will be stored
940 $buf:expr
941 $(, $async:ident, $await:ident)?
942 ) => {
943 mod read_bytes_until {
944 use super::*;
945 // Use Bytes for printing bytes as strings for ASCII range
946 use crate::utils::Bytes;
947 use pretty_assertions::assert_eq;
948
949 /// Checks that search in the empty buffer returns `None`
950 #[$test]
951 $($async)? fn empty() {
952 let buf = $buf;
953 let mut position = 0;
954 let mut input = b"".as_ref();
955 // ^= 0
956
957 assert_eq!(
958 $source(&mut input)
959 .read_bytes_until(b'*', buf, &mut position)
960 $(.$await)?
961 .unwrap()
962 .map(Bytes),
963 None
964 );
965 assert_eq!(position, 0);
966 }
967
                /// Checks that a search for a value that is absent from the buffer returns
                /// the entire buffer as a result and sets `position` to `len()`
970 #[$test]
971 $($async)? fn non_existent() {
972 let buf = $buf;
973 let mut position = 0;
974 let mut input = b"abcdef".as_ref();
975 // ^= 6
976
977 assert_eq!(
978 $source(&mut input)
979 .read_bytes_until(b'*', buf, &mut position)
980 $(.$await)?
981 .unwrap()
982 .map(Bytes),
983 Some(Bytes(b"abcdef"))
984 );
985 assert_eq!(position, 6);
986 }
987
988 /// Checks that search in the buffer an element that is located in the front of
989 /// buffer returns empty slice as a result and set `position` to one symbol
990 /// after match (`1`)
991 #[$test]
992 $($async)? fn at_the_start() {
993 let buf = $buf;
994 let mut position = 0;
995 let mut input = b"*abcdef".as_ref();
996 // ^= 1
997
998 assert_eq!(
999 $source(&mut input)
1000 .read_bytes_until(b'*', buf, &mut position)
1001 $(.$await)?
1002 .unwrap()
1003 .map(Bytes),
1004 Some(Bytes(b""))
1005 );
1006 assert_eq!(position, 1); // position after the symbol matched
1007 }
1008
1009 /// Checks that search in the buffer an element that is located in the middle of
1010 /// buffer returns slice before that symbol as a result and set `position` to one
1011 /// symbol after match
1012 #[$test]
1013 $($async)? fn inside() {
1014 let buf = $buf;
1015 let mut position = 0;
1016 let mut input = b"abc*def".as_ref();
1017 // ^= 4
1018
1019 assert_eq!(
1020 $source(&mut input)
1021 .read_bytes_until(b'*', buf, &mut position)
1022 $(.$await)?
1023 .unwrap()
1024 .map(Bytes),
1025 Some(Bytes(b"abc"))
1026 );
1027 assert_eq!(position, 4); // position after the symbol matched
1028 }
1029
1030 /// Checks that search in the buffer an element that is located in the end of
1031 /// buffer returns slice before that symbol as a result and set `position` to one
1032 /// symbol after match (`len()`)
1033 #[$test]
1034 $($async)? fn in_the_end() {
1035 let buf = $buf;
1036 let mut position = 0;
1037 let mut input = b"abcdef*".as_ref();
1038 // ^= 7
1039
1040 assert_eq!(
1041 $source(&mut input)
1042 .read_bytes_until(b'*', buf, &mut position)
1043 $(.$await)?
1044 .unwrap()
1045 .map(Bytes),
1046 Some(Bytes(b"abcdef"))
1047 );
1048 assert_eq!(position, 7); // position after the symbol matched
1049 }
1050 }
1051
1052 mod read_bang_element {
1053 use super::*;
1054
1055 /// Checks that reading CDATA content works correctly
1056 mod cdata {
1057 use super::*;
1058 use crate::errors::Error;
1059 use crate::reader::BangType;
1060 use crate::utils::Bytes;
1061 use pretty_assertions::assert_eq;
1062
1063 /// Checks that if input begins like CDATA element, but CDATA start sequence
1064 /// is not finished, parsing ends with an error
1065 #[$test]
1066 #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1067 $($async)? fn not_properly_start() {
1068 let buf = $buf;
1069 let mut position = 0;
1070 let mut input = b"![]]>other content".as_ref();
1071 // ^= 0
1072
1073 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1074 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1075 x => assert!(
1076 false,
1077 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1078 x
1079 ),
1080 }
1081 assert_eq!(position, 0);
1082 }
1083
1084 /// Checks that if CDATA startup sequence was matched, but an end sequence
1085 /// is not found, parsing ends with an error
1086 #[$test]
1087 $($async)? fn not_closed() {
1088 let buf = $buf;
1089 let mut position = 0;
1090 let mut input = b"![CDATA[other content".as_ref();
1091 // ^= 0
1092
1093 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1094 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1095 x => assert!(
1096 false,
1097 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1098 x
1099 ),
1100 }
1101 assert_eq!(position, 0);
1102 }
1103
1104 /// Checks that CDATA element without content inside parsed successfully
1105 #[$test]
1106 $($async)? fn empty() {
1107 let buf = $buf;
1108 let mut position = 0;
1109 let mut input = b"![CDATA[]]>other content".as_ref();
1110 // ^= 11
1111
1112 assert_eq!(
1113 $source(&mut input)
1114 .read_bang_element(buf, &mut position)
1115 $(.$await)?
1116 .unwrap()
1117 .map(|(ty, data)| (ty, Bytes(data))),
1118 Some((BangType::CData, Bytes(b"![CDATA[]]")))
1119 );
1120 assert_eq!(position, 11);
1121 }
1122
1123 /// Checks that CDATA element with content parsed successfully.
1124 /// Additionally checks that sequences inside CDATA that may look like
1125 /// a CDATA end sequence do not interrupt CDATA parsing
1126 #[$test]
1127 $($async)? fn with_content() {
1128 let buf = $buf;
1129 let mut position = 0;
1130 let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1131 // ^= 28
1132
1133 assert_eq!(
1134 $source(&mut input)
1135 .read_bang_element(buf, &mut position)
1136 $(.$await)?
1137 .unwrap()
1138 .map(|(ty, data)| (ty, Bytes(data))),
1139 Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
1140 );
1141 assert_eq!(position, 28);
1142 }
1143 }
1144
1145 /// Checks that reading XML comments works correctly. According to the [specification],
1146 /// comment data can contain any sequence except `--`:
1147 ///
1148 /// ```peg
                /// comment = '<!--' (!'--' char)* '-->';
1150 /// char = [#x1-#x2C]
1151 /// / [#x2E-#xD7FF]
1152 /// / [#xE000-#xFFFD]
1153 /// / [#x10000-#x10FFFF]
1154 /// ```
1155 ///
                /// The presence of this limitation, however, is simply a poorly designed specification
                /// (maybe for the purpose of building an LL(1) XML parser) and quick-xml does not check
                /// for the presence of these sequences by default. These tests allow such content.
1159 ///
1160 /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1161 mod comment {
1162 use super::*;
1163 use crate::errors::Error;
1164 use crate::reader::BangType;
1165 use crate::utils::Bytes;
1166 use pretty_assertions::assert_eq;
1167
1168 #[$test]
1169 #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1170 $($async)? fn not_properly_start() {
1171 let buf = $buf;
1172 let mut position = 0;
1173 let mut input = b"!- -->other content".as_ref();
1174 // ^= 0
1175
1176 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1177 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1178 x => assert!(
1179 false,
1180 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1181 x
1182 ),
1183 }
1184 assert_eq!(position, 0);
1185 }
1186
1187 #[$test]
1188 $($async)? fn not_properly_end() {
1189 let buf = $buf;
1190 let mut position = 0;
1191 let mut input = b"!->other content".as_ref();
1192 // ^= 0
1193
1194 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1195 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1196 x => assert!(
1197 false,
1198 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1199 x
1200 ),
1201 }
1202 assert_eq!(position, 0);
1203 }
1204
1205 #[$test]
1206 $($async)? fn not_closed1() {
1207 let buf = $buf;
1208 let mut position = 0;
1209 let mut input = b"!--other content".as_ref();
1210 // ^= 0
1211
1212 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1213 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1214 x => assert!(
1215 false,
1216 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1217 x
1218 ),
1219 }
1220 assert_eq!(position, 0);
1221 }
1222
1223 #[$test]
1224 $($async)? fn not_closed2() {
1225 let buf = $buf;
1226 let mut position = 0;
1227 let mut input = b"!-->other content".as_ref();
1228 // ^= 0
1229
1230 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1231 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1232 x => assert!(
1233 false,
1234 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1235 x
1236 ),
1237 }
1238 assert_eq!(position, 0);
1239 }
1240
1241 #[$test]
1242 $($async)? fn not_closed3() {
1243 let buf = $buf;
1244 let mut position = 0;
1245 let mut input = b"!--->other content".as_ref();
1246 // ^= 0
1247
1248 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1249 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1250 x => assert!(
1251 false,
1252 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1253 x
1254 ),
1255 }
1256 assert_eq!(position, 0);
1257 }
1258
1259 #[$test]
1260 $($async)? fn empty() {
1261 let buf = $buf;
1262 let mut position = 0;
1263 let mut input = b"!---->other content".as_ref();
1264 // ^= 6
1265
1266 assert_eq!(
1267 $source(&mut input)
1268 .read_bang_element(buf, &mut position)
1269 $(.$await)?
1270 .unwrap()
1271 .map(|(ty, data)| (ty, Bytes(data))),
1272 Some((BangType::Comment, Bytes(b"!----")))
1273 );
1274 assert_eq!(position, 6);
1275 }
1276
1277 #[$test]
1278 $($async)? fn with_content() {
1279 let buf = $buf;
1280 let mut position = 0;
1281 let mut input = b"!--->comment<--->other content".as_ref();
1282 // ^= 17
1283
1284 assert_eq!(
1285 $source(&mut input)
1286 .read_bang_element(buf, &mut position)
1287 $(.$await)?
1288 .unwrap()
1289 .map(|(ty, data)| (ty, Bytes(data))),
1290 Some((BangType::Comment, Bytes(b"!--->comment<---")))
1291 );
1292 assert_eq!(position, 17);
1293 }
1294 }
1295
1296 /// Checks that reading DOCTYPE definition works correctly
1297 mod doctype {
1298 use super::*;
1299
1300 mod uppercase {
1301 use super::*;
1302 use crate::errors::Error;
1303 use crate::reader::BangType;
1304 use crate::utils::Bytes;
1305 use pretty_assertions::assert_eq;
1306
1307 #[$test]
1308 $($async)? fn not_properly_start() {
1309 let buf = $buf;
1310 let mut position = 0;
1311 let mut input = b"!D other content".as_ref();
1312 // ^= 0
1313
1314 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1315 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1316 x => assert!(
1317 false,
1318 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1319 x
1320 ),
1321 }
1322 assert_eq!(position, 0);
1323 }
1324
1325 #[$test]
1326 $($async)? fn without_space() {
1327 let buf = $buf;
1328 let mut position = 0;
1329 let mut input = b"!DOCTYPEother content".as_ref();
1330 // ^= 0
1331
1332 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1333 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1334 x => assert!(
1335 false,
1336 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1337 x
1338 ),
1339 }
1340 assert_eq!(position, 0);
1341 }
1342
1343 #[$test]
1344 $($async)? fn empty() {
1345 let buf = $buf;
1346 let mut position = 0;
1347 let mut input = b"!DOCTYPE>other content".as_ref();
1348 // ^= 9
1349
1350 assert_eq!(
1351 $source(&mut input)
1352 .read_bang_element(buf, &mut position)
1353 $(.$await)?
1354 .unwrap()
1355 .map(|(ty, data)| (ty, Bytes(data))),
1356 Some((BangType::DocType, Bytes(b"!DOCTYPE")))
1357 );
1358 assert_eq!(position, 9);
1359 }
1360
1361 #[$test]
1362 $($async)? fn not_closed() {
1363 let buf = $buf;
1364 let mut position = 0;
1365 let mut input = b"!DOCTYPE other content".as_ref();
1366 // ^= 0
1367
1368 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1369 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1370 x => assert!(
1371 false,
1372 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1373 x
1374 ),
1375 }
1376 assert_eq!(position, 0);
1377 }
1378 }
1379
1380 mod lowercase {
1381 use super::*;
1382 use crate::errors::Error;
1383 use crate::reader::BangType;
1384 use crate::utils::Bytes;
1385 use pretty_assertions::assert_eq;
1386
1387 #[$test]
1388 $($async)? fn not_properly_start() {
1389 let buf = $buf;
1390 let mut position = 0;
1391 let mut input = b"!d other content".as_ref();
1392 // ^= 0
1393
1394 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1395 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1396 x => assert!(
1397 false,
1398 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1399 x
1400 ),
1401 }
1402 assert_eq!(position, 0);
1403 }
1404
1405 #[$test]
1406 $($async)? fn without_space() {
1407 let buf = $buf;
1408 let mut position = 0;
1409 let mut input = b"!doctypeother content".as_ref();
1410 // ^= 0
1411
1412 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1413 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1414 x => assert!(
1415 false,
1416 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1417 x
1418 ),
1419 }
1420 assert_eq!(position, 0);
1421 }
1422
1423 #[$test]
1424 $($async)? fn empty() {
1425 let buf = $buf;
1426 let mut position = 0;
1427 let mut input = b"!doctype>other content".as_ref();
1428 // ^= 9
1429
1430 assert_eq!(
1431 $source(&mut input)
1432 .read_bang_element(buf, &mut position)
1433 $(.$await)?
1434 .unwrap()
1435 .map(|(ty, data)| (ty, Bytes(data))),
1436 Some((BangType::DocType, Bytes(b"!doctype")))
1437 );
1438 assert_eq!(position, 9);
1439 }
1440
1441 #[$test]
1442 $($async)? fn not_closed() {
1443 let buf = $buf;
1444 let mut position = 0;
1445 let mut input = b"!doctype other content".as_ref();
1446 // ^= 0
1447
1448 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1449 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1450 x => assert!(
1451 false,
1452 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1453 x
1454 ),
1455 }
1456 assert_eq!(position, 0);
1457 }
1458 }
1459 }
1460 }
1461
1462 mod read_element {
1463 use super::*;
1464 use crate::utils::Bytes;
1465 use pretty_assertions::assert_eq;
1466
1467 /// Checks that nothing was read from empty buffer
1468 #[$test]
1469 $($async)? fn empty() {
1470 let buf = $buf;
1471 let mut position = 0;
1472 let mut input = b"".as_ref();
1473 // ^= 0
1474
1475 assert_eq!(
1476 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1477 None
1478 );
1479 assert_eq!(position, 0);
1480 }
1481
1482 mod open {
1483 use super::*;
1484 use crate::utils::Bytes;
1485 use pretty_assertions::assert_eq;
1486
1487 #[$test]
1488 $($async)? fn empty_tag() {
1489 let buf = $buf;
1490 let mut position = 0;
1491 let mut input = b">".as_ref();
1492 // ^= 1
1493
1494 assert_eq!(
1495 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1496 Some(Bytes(b""))
1497 );
1498 assert_eq!(position, 1);
1499 }
1500
1501 #[$test]
1502 $($async)? fn normal() {
1503 let buf = $buf;
1504 let mut position = 0;
1505 let mut input = b"tag>".as_ref();
1506 // ^= 4
1507
1508 assert_eq!(
1509 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1510 Some(Bytes(b"tag"))
1511 );
1512 assert_eq!(position, 4);
1513 }
1514
1515 #[$test]
1516 $($async)? fn empty_ns_empty_tag() {
1517 let buf = $buf;
1518 let mut position = 0;
1519 let mut input = b":>".as_ref();
1520 // ^= 2
1521
1522 assert_eq!(
1523 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1524 Some(Bytes(b":"))
1525 );
1526 assert_eq!(position, 2);
1527 }
1528
1529 #[$test]
1530 $($async)? fn empty_ns() {
1531 let buf = $buf;
1532 let mut position = 0;
1533 let mut input = b":tag>".as_ref();
1534 // ^= 5
1535
1536 assert_eq!(
1537 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1538 Some(Bytes(b":tag"))
1539 );
1540 assert_eq!(position, 5);
1541 }
1542
1543 #[$test]
1544 $($async)? fn with_attributes() {
1545 let buf = $buf;
1546 let mut position = 0;
1547 let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1548 // ^= 38
1549
1550 assert_eq!(
1551 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1552 Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#))
1553 );
1554 assert_eq!(position, 38);
1555 }
1556 }
1557
1558 mod self_closed {
1559 use super::*;
1560 use crate::utils::Bytes;
1561 use pretty_assertions::assert_eq;
1562
1563 #[$test]
1564 $($async)? fn empty_tag() {
1565 let buf = $buf;
1566 let mut position = 0;
1567 let mut input = b"/>".as_ref();
1568 // ^= 2
1569
1570 assert_eq!(
1571 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1572 Some(Bytes(b"/"))
1573 );
1574 assert_eq!(position, 2);
1575 }
1576
1577 #[$test]
1578 $($async)? fn normal() {
1579 let buf = $buf;
1580 let mut position = 0;
1581 let mut input = b"tag/>".as_ref();
1582 // ^= 5
1583
1584 assert_eq!(
1585 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1586 Some(Bytes(b"tag/"))
1587 );
1588 assert_eq!(position, 5);
1589 }
1590
1591 #[$test]
1592 $($async)? fn empty_ns_empty_tag() {
1593 let buf = $buf;
1594 let mut position = 0;
1595 let mut input = b":/>".as_ref();
1596 // ^= 3
1597
1598 assert_eq!(
1599 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1600 Some(Bytes(b":/"))
1601 );
1602 assert_eq!(position, 3);
1603 }
1604
1605 #[$test]
1606 $($async)? fn empty_ns() {
1607 let buf = $buf;
1608 let mut position = 0;
1609 let mut input = b":tag/>".as_ref();
1610 // ^= 6
1611
1612 assert_eq!(
1613 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1614 Some(Bytes(b":tag/"))
1615 );
1616 assert_eq!(position, 6);
1617 }
1618
1619 #[$test]
1620 $($async)? fn with_attributes() {
1621 let buf = $buf;
1622 let mut position = 0;
1623 let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1624 // ^= 41
1625
1626 assert_eq!(
1627 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1628 Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#))
1629 );
1630 assert_eq!(position, 41);
1631 }
1632 }
1633 }
1634
1635 mod issue_344 {
1636 use crate::errors::Error;
1637 use crate::reader::Reader;
1638
1639 #[$test]
1640 $($async)? fn cdata() {
1641 let mut reader = Reader::from_str("![]]>");
1642
1643 match reader.$read_until_close($buf) $(.$await)? {
1644 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1645 x => assert!(
1646 false,
1647 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1648 x
1649 ),
1650 }
1651 }
1652
1653 #[$test]
1654 $($async)? fn comment() {
1655 let mut reader = Reader::from_str("!- -->");
1656
1657 match reader.$read_until_close($buf) $(.$await)? {
1658 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1659 x => assert!(
1660 false,
1661 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1662 x
1663 ),
1664 }
1665 }
1666
1667 #[$test]
1668 $($async)? fn doctype_uppercase() {
1669 let mut reader = Reader::from_str("!D>");
1670
1671 match reader.$read_until_close($buf) $(.$await)? {
1672 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1673 x => assert!(
1674 false,
1675 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1676 x
1677 ),
1678 }
1679 }
1680
1681 #[$test]
1682 $($async)? fn doctype_lowercase() {
1683 let mut reader = Reader::from_str("!d>");
1684
1685 match reader.$read_until_close($buf) $(.$await)? {
1686 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1687 x => assert!(
1688 false,
1689 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1690 x
1691 ),
1692 }
1693 }
1694 }
1695
1696 /// Ensures, that no empty `Text` events are generated
1697 mod $read_event {
1698 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
1699 use crate::reader::Reader;
1700 use pretty_assertions::assert_eq;
1701
1702 /// When `encoding` feature is enabled, encoding should be detected
1703 /// from BOM (UTF-8) and BOM should be stripped.
1704 ///
1705 /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1706 /// character should be stripped for consistency
1707 #[$test]
1708 $($async)? fn bom_from_reader() {
1709 let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
1710
1711 assert_eq!(
1712 reader.$read_event($buf) $(.$await)? .unwrap(),
1713 Event::Text(BytesText::from_escaped("\u{feff}"))
1714 );
1715
1716 assert_eq!(
1717 reader.$read_event($buf) $(.$await)? .unwrap(),
1718 Event::Eof
1719 );
1720 }
1721
                /// When parsing from &str, encoding is fixed (UTF-8), so
                /// - when `encoding` feature is disabled, the behavior is the
                ///   same as in the `bom_from_reader` test
                /// - when `encoding` feature is enabled, the behavior should
                ///   stay consistent, so the first BOM character is stripped
1727 #[$test]
1728 $($async)? fn bom_from_str() {
1729 let mut reader = Reader::from_str("\u{feff}\u{feff}");
1730
1731 assert_eq!(
1732 reader.$read_event($buf) $(.$await)? .unwrap(),
1733 Event::Text(BytesText::from_escaped("\u{feff}"))
1734 );
1735
1736 assert_eq!(
1737 reader.$read_event($buf) $(.$await)? .unwrap(),
1738 Event::Eof
1739 );
1740 }
1741
1742 #[$test]
1743 $($async)? fn declaration() {
1744 let mut reader = Reader::from_str("<?xml ?>");
1745
1746 assert_eq!(
1747 reader.$read_event($buf) $(.$await)? .unwrap(),
1748 Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1749 );
1750 }
1751
1752 #[$test]
1753 $($async)? fn doctype() {
1754 let mut reader = Reader::from_str("<!DOCTYPE x>");
1755
1756 assert_eq!(
1757 reader.$read_event($buf) $(.$await)? .unwrap(),
1758 Event::DocType(BytesText::from_escaped("x"))
1759 );
1760 }
1761
1762 #[$test]
1763 $($async)? fn processing_instruction() {
1764 let mut reader = Reader::from_str("<?xml-stylesheet?>");
1765
1766 assert_eq!(
1767 reader.$read_event($buf) $(.$await)? .unwrap(),
1768 Event::PI(BytesText::from_escaped("xml-stylesheet"))
1769 );
1770 }
1771
1772 #[$test]
1773 $($async)? fn start() {
1774 let mut reader = Reader::from_str("<tag>");
1775
1776 assert_eq!(
1777 reader.$read_event($buf) $(.$await)? .unwrap(),
1778 Event::Start(BytesStart::new("tag"))
1779 );
1780 }
1781
1782 #[$test]
1783 $($async)? fn end() {
1784 let mut reader = Reader::from_str("</tag>");
1785 // Because we expect invalid XML, do not check that
1786 // the end name paired with the start name
1787 reader.check_end_names(false);
1788
1789 assert_eq!(
1790 reader.$read_event($buf) $(.$await)? .unwrap(),
1791 Event::End(BytesEnd::new("tag"))
1792 );
1793 }
1794
1795 #[$test]
1796 $($async)? fn empty() {
1797 let mut reader = Reader::from_str("<tag/>");
1798
1799 assert_eq!(
1800 reader.$read_event($buf) $(.$await)? .unwrap(),
1801 Event::Empty(BytesStart::new("tag"))
1802 );
1803 }
1804
1805 #[$test]
1806 $($async)? fn text() {
1807 let mut reader = Reader::from_str("text");
1808
1809 assert_eq!(
1810 reader.$read_event($buf) $(.$await)? .unwrap(),
1811 Event::Text(BytesText::from_escaped("text"))
1812 );
1813 }
1814
1815 #[$test]
1816 $($async)? fn cdata() {
1817 let mut reader = Reader::from_str("<![CDATA[]]>");
1818
1819 assert_eq!(
1820 reader.$read_event($buf) $(.$await)? .unwrap(),
1821 Event::CData(BytesCData::new(""))
1822 );
1823 }
1824
1825 #[$test]
1826 $($async)? fn comment() {
1827 let mut reader = Reader::from_str("<!---->");
1828
1829 assert_eq!(
1830 reader.$read_event($buf) $(.$await)? .unwrap(),
1831 Event::Comment(BytesText::from_escaped(""))
1832 );
1833 }
1834
1835 #[$test]
1836 $($async)? fn eof() {
1837 let mut reader = Reader::from_str("");
1838
1839 assert_eq!(
1840 reader.$read_event($buf) $(.$await)? .unwrap(),
1841 Event::Eof
1842 );
1843 }
1844 }
1845 };
1846 }
1847
1848 /// Tests for https://github.com/tafia/quick-xml/issues/469
1849 macro_rules! small_buffers {
1850 (
1851 #[$test:meta]
1852 $read_event:ident: $BufReader:ty
1853 $(, $async:ident, $await:ident)?
1854 ) => {
1855 mod small_buffers {
1856 use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
1857 use crate::reader::Reader;
1858 use pretty_assertions::assert_eq;
1859
1860 #[$test]
1861 $($async)? fn decl() {
1862 let xml = "<?xml ?>";
1863 // ^^^^^^^ data that fit into buffer
1864 let size = xml.match_indices("?>").next().unwrap().0 + 1;
1865 let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1866 let mut reader = Reader::from_reader(br);
1867 let mut buf = Vec::new();
1868
1869 assert_eq!(
1870 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1871 Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1872 );
1873 assert_eq!(
1874 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1875 Event::Eof
1876 );
1877 }
1878
1879 #[$test]
1880 $($async)? fn pi() {
1881 let xml = "<?pi?>";
1882 // ^^^^^ data that fit into buffer
1883 let size = xml.match_indices("?>").next().unwrap().0 + 1;
1884 let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1885 let mut reader = Reader::from_reader(br);
1886 let mut buf = Vec::new();
1887
1888 assert_eq!(
1889 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1890 Event::PI(BytesText::new("pi"))
1891 );
1892 assert_eq!(
1893 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1894 Event::Eof
1895 );
1896 }
1897
1898 #[$test]
1899 $($async)? fn empty() {
1900 let xml = "<empty/>";
1901 // ^^^^^^^ data that fit into buffer
1902 let size = xml.match_indices("/>").next().unwrap().0 + 1;
1903 let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1904 let mut reader = Reader::from_reader(br);
1905 let mut buf = Vec::new();
1906
1907 assert_eq!(
1908 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1909 Event::Empty(BytesStart::new("empty"))
1910 );
1911 assert_eq!(
1912 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1913 Event::Eof
1914 );
1915 }
1916
1917 #[$test]
1918 $($async)? fn cdata1() {
1919 let xml = "<![CDATA[cdata]]>";
1920 // ^^^^^^^^^^^^^^^ data that fit into buffer
1921 let size = xml.match_indices("]]>").next().unwrap().0 + 1;
1922 let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1923 let mut reader = Reader::from_reader(br);
1924 let mut buf = Vec::new();
1925
1926 assert_eq!(
1927 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1928 Event::CData(BytesCData::new("cdata"))
1929 );
1930 assert_eq!(
1931 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1932 Event::Eof
1933 );
1934 }
1935
1936 #[$test]
1937 $($async)? fn cdata2() {
1938 let xml = "<![CDATA[cdata]]>";
1939 // ^^^^^^^^^^^^^^^^ data that fit into buffer
1940 let size = xml.match_indices("]]>").next().unwrap().0 + 2;
1941 let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1942 let mut reader = Reader::from_reader(br);
1943 let mut buf = Vec::new();
1944
1945 assert_eq!(
1946 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1947 Event::CData(BytesCData::new("cdata"))
1948 );
1949 assert_eq!(
1950 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1951 Event::Eof
1952 );
1953 }
1954
1955 #[$test]
1956 $($async)? fn comment1() {
1957 let xml = "<!--comment-->";
1958 // ^^^^^^^^^^^^ data that fit into buffer
1959 let size = xml.match_indices("-->").next().unwrap().0 + 1;
1960 let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1961 let mut reader = Reader::from_reader(br);
1962 let mut buf = Vec::new();
1963
1964 assert_eq!(
1965 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1966 Event::Comment(BytesText::new("comment"))
1967 );
1968 assert_eq!(
1969 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1970 Event::Eof
1971 );
1972 }
1973
1974 #[$test]
1975 $($async)? fn comment2() {
1976 let xml = "<!--comment-->";
1977 // ^^^^^^^^^^^^^ data that fit into buffer
1978 let size = xml.match_indices("-->").next().unwrap().0 + 2;
1979 let br = <$BufReader>::with_capacity(size, xml.as_bytes());
1980 let mut reader = Reader::from_reader(br);
1981 let mut buf = Vec::new();
1982
1983 assert_eq!(
1984 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1985 Event::Comment(BytesText::new("comment"))
1986 );
1987 assert_eq!(
1988 reader.$read_event(&mut buf) $(.$await)? .unwrap(),
1989 Event::Eof
1990 );
1991 }
1992 }
1993 };
1994 }
1995
1996 // Export macros for the child modules:
1997 // - buffered_reader
1998 // - slice_reader
1999 pub(super) use check;
2000 pub(super) use small_buffers;
2001}
2002