1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::ops::Range;
6
7use crate::encoding::Decoder;
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::reader::state::ReaderState;
11
12use memchr;
13
/// Generates the configuration setters shared by the reader types.
///
/// The optional `$holder` identifier names a field of `self` through which the
/// reader state is reached (`self.$holder.state`); when it is omitted the
/// setters write to `self.state` directly. Every generated setter returns
/// `&mut Self`, so calls can be chained.
macro_rules! configure_methods {
    ($($holder:ident)?) => {
        /// Changes whether empty elements should be split into an `Open` and a `Close` event.
        ///
        /// When set to `true`, all [`Empty`] events produced by a self-closing tag like `<tag/>` are
        /// expanded into a [`Start`] event followed by an [`End`] event. When set to `false` (the
        /// default), those tags are represented by an [`Empty`] event instead.
        ///
        /// Note that setting this to `true` will lead to additional allocations that
        /// are needed to store the tag name for an [`End`] event. However, if [`check_end_names`]
        /// is also set, only one additional allocation will be performed that supports
        /// both options.
        ///
        /// (`false` by default)
        ///
        /// [`Empty`]: Event::Empty
        /// [`Start`]: Event::Start
        /// [`End`]: Event::End
        /// [`check_end_names`]: Self::check_end_names
        pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.expand_empty_elements = val;
            self
        }

        /// Changes whether whitespace before and after character data should be removed.
        ///
        /// When set to `true`, all [`Text`] events are trimmed.
        /// If the event is empty after trimming, it will not be pushed.
        ///
        /// Changing this option automatically changes the [`trim_text_end`] option.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`trim_text_end`]: Self::trim_text_end
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text(&mut self, val: bool) -> &mut Self {
            // Both the leading and the trailing flag are driven by this single option
            self $(.$holder)? .state.trim_text_start = val;
            self $(.$holder)? .state.trim_text_end = val;
            self
        }

        /// Changes whether whitespace after character data should be removed.
        ///
        /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
        /// If the event is empty after trimming, it will not be pushed.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event will be trimmed, which is
        /// incorrect behavior when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the necessary events.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text_end(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.trim_text_end = val;
            self
        }

        /// Changes whether trailing whitespace after the markup name is trimmed in closing tags
        /// `</a >`.
        ///
        /// If `true`, the emitted [`End`] event is stripped of trailing whitespace after the markup name.
        ///
        /// Note that if set to `false` and `check_end_names` is `true`, the comparison of markup names is
        /// going to fail erroneously if a closing tag contains trailing whitespace.
        ///
        /// (`true` by default)
        ///
        /// [`End`]: Event::End
        pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.trim_markup_names_in_closing_tags = val;
            self
        }

        /// Changes whether mismatched closing tag names should be detected.
        ///
        /// Note that start and end tags [should match literally][spec]; they cannot
        /// have different prefixes even if both prefixes resolve to the same namespace.
        /// The XML
        ///
        /// ```xml
        /// <outer xmlns="namespace" xmlns:p="namespace">
        /// </p:outer>
        /// ```
        ///
        /// is not valid, even though semantically the start tag is the same as the
        /// end tag. The reason is that namespaces are an extension of the original
        /// XML specification (without namespaces) and it should be backward-compatible.
        ///
        /// When set to `false`, it won't check if a closing tag matches the corresponding opening tag.
        /// For example, `<mytag></different_tag>` will be permitted.
        ///
        /// If the XML is known to be sane (already processed, etc.) this saves extra time.
        ///
        /// Note that the emitted [`End`] event will not be modified if this is disabled, i.e. it will
        /// contain the data of the mismatched end tag.
        ///
        /// Note that setting this to `true` will lead to additional allocations that
        /// are needed to store the tag name for an [`End`] event. However, if [`expand_empty_elements`]
        /// is also set, only one additional allocation will be performed that supports
        /// both options.
        ///
        /// (`true` by default)
        ///
        /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
        /// [`End`]: Event::End
        /// [`expand_empty_elements`]: Self::expand_empty_elements
        pub fn check_end_names(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.check_end_names = val;
            self
        }

        /// Changes whether comments should be validated.
        ///
        /// When set to `true`, every [`Comment`] event will be checked for not containing `--`, which
        /// is not allowed in XML comments. Most of the time we don't want comments at all so we don't
        /// really care about comment correctness, thus the default value is `false` to improve
        /// performance.
        ///
        /// (`false` by default)
        ///
        /// [`Comment`]: Event::Comment
        pub fn check_comments(&mut self, val: bool) -> &mut Self {
            self $(.$holder)? .state.check_comments = val;
            self
        }
    };
}
160
/// Generalization of the event-reading loop: drives the `ParseState` machine
/// kept in `$self.state.state` until an event or an error is produced.
///
/// Once the loop has finished, an error or an `Event::Eof` moves the parser to
/// `ParseState::Exit`, so every later expansion yields `Event::Eof`.
///
/// # Parameters
/// - `$self`: value whose `state` field holds the parser state and configuration
/// - `$buf`: buffer forwarded to `$read_until_open` / `$read_until_close`; it is
///   re-assigned when `$read_until_open` hands it back, so it must be a mutable binding
/// - `$reader`: expression for the source on which encoding detection or
///   BOM removal is performed
/// - `$read_until_open`: name of the `$self` method that searches for `<`
/// - `$read_until_close`: name of the `$self` method that reads the markup after `<`
/// - `$await`: optional identifier appended as `.$await` to every call
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_open:ident,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            match $self.state.state {
                ParseState::Init => { // Go to OpenedTag state
                    // If the encoding was set explicitly, we do not need to detect it. For example,
                    // explicit UTF-8 is set automatically if the Reader was created using `from_str`.
                    // But we still need to remove the BOM for consistency with the code path
                    // used when the `encoding` feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event was produced: take the buffer back and run the loop again
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                ParseState::ClosedTag => { // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event was produced: take the buffer back and run the loop again
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                // Go to ClosedTag state in next two arms
                ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?,
                ParseState::Empty => break $self.state.close_expanded_empty(),
                ParseState::Exit => break Ok(Event::Eof),
            };
        };
        // Errors and end of input are terminal: all following reads return `Eof`
        match event {
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Exit,
            _ => {}
        }
        event
    }};
}
214
/// Read bytes up to `<` and skip it.
///
/// If the current byte (after skipping all whitespace when
/// [`ReaderState::trim_text_start`] is `true`) is already `<`, it is consumed,
/// no event is produced and `$buf` is handed back to the caller via
/// `return Ok(Err($buf))`. Otherwise the bytes up to the next `<` are read and
/// emitted as a `Text` event, or `Event::Eof` is produced if the input is exhausted.
///
/// Moves parser to the `OpenedTag` state.
///
/// This code is executed in two cases:
/// - after start of parsing just after skipping BOM if it is present
/// - after parsing `</tag>` or `<tag>`
///
/// Because the expansion contains a `return`, it must be placed directly in the
/// body of a function whose return type accepts `Ok(Err($buf))`.
///
/// # Parameters
/// - `$self`: value whose `state` field holds the parser state and configuration
/// - `$buf`: buffer passed to `read_bytes_until`, or returned unchanged
/// - `$reader`: expression for the source to read from
/// - `$read_event`: accepted by the matcher but not used in the expansion
/// - `$await`: optional identifier appended as `.$await` to every call
macro_rules! read_until_open {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_event:ident
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::OpenedTag;

        if $self.state.trim_text_start {
            $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
        }

        // If we are already at the `<` symbol, do not try to return an empty Text event
        if $reader.skip_one(b'<', &mut $self.state.offset) $(.$await)? ? {
            // Pass $buf to the next iteration of the parsing loop
            return Ok(Err($buf));
        }

        match $reader
            .read_bytes_until(b'<', $buf, &mut $self.state.offset)
            $(.$await)?
        {
            // Return Text event with `bytes` content
            Ok(Some(bytes)) => $self.state.emit_text(bytes).map(Ok),
            Ok(None) => Ok(Ok(Event::Eof)),
            Err(e) => Err(e),
        }
    }};
}
254
/// Read bytes up to the `>` and skip it. This macro is expected to be expanded
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// If the input is exhausted, either at the peeked symbol or while reading the
/// markup, `Event::Eof` is returned instead.
///
/// Moves parser to the `ClosedTag` state.
///
/// # Parameters
/// - `$self`: value whose `state` field holds the parser state and the `emit_*` methods
/// - `$buf`: buffer passed to the reading method selected by the peeked symbol
/// - `$reader`: expression for the source to read from
/// - `$await`: optional identifier appended as `.$await` to every call
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::ClosedTag;

        // The symbol is only peeked, so the reading methods below still see it
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some((bang_type, bytes))) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => Err(e),
            },
            // `</` - closing tag
            Ok(Some(b'/')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.state.emit_end(bytes),
                Err(e) => Err(e),
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.state.emit_question_mark(bytes),
                Err(e) => Err(e),
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(None) => Ok(Event::Eof),
                Ok(Some(bytes)) => $self.state.emit_start(bytes),
                Err(e) => Err(e),
            },
            Ok(None) => Ok(Event::Eof),
            Err(e) => Err(e),
        }
    }};
}
325
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the `End` event that closes the element named `$end` is
/// found. `Start` events with the same name increase a nesting counter, so only
/// the `End` event seen at depth zero terminates the loop.
///
/// Evaluates to the range `start..end`, where `start` is `buffer_position()` at
/// the moment the macro is entered and `end` is `buffer_position()` taken just
/// before the terminating `End` event was read.
///
/// Errors are propagated with `return`, so the expansion must be placed
/// directly in the body of a function returning a compatible `Result`. When
/// `Event::Eof` is reached first, `Error::UnexpectedEof` is returned with the
/// decoded name of the expected closing tag.
///
/// # Parameters
/// - `$self`: reader that provides `buffer_position()`, `decoder()` and `$read_event`
/// - `$end`: name of the element to close; compared with `name()` of the events
/// - `$buf`: expression passed to `$read_event` on every iteration
/// - `$read_event`: name of the `$self` method used to read one event
/// - `$clear`: block executed at the start of every iteration
/// - `$await`: optional identifier appended as `.$await` to the `$read_event` call
macro_rules! read_to_end {
    (
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        let start = $self.buffer_position();
        let mut depth = 0;
        loop {
            $clear
            // Remember the position before the event, so the resulting span
            // does not include the closing tag itself
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => return Err(e),

                // Nested element with the same name: its `End` must not stop the loop
                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    let name = $self.decoder().decode($end.as_ref());
                    return Err(Error::UnexpectedEof(format!("</{:?}>", name)));
                }
                _ => (),
            }
        }
    }};
}
359
360#[cfg(feature = "async-tokio")]
361mod async_tokio;
362mod buffered_reader;
363mod ns_reader;
364mod slice_reader;
365mod state;
366
367pub use ns_reader::NsReader;
368
/// Range of byte offsets in the input that corresponds to some piece of XML.
///
/// As with any [`Range`], the start offset is inclusive and the end offset is exclusive.
pub type Span = Range<usize>;
371
372////////////////////////////////////////////////////////////////////////////////////////////////////
373
/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Reader::expand_empty_elements()`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init      -- "(no event)"\n                                       --> OpenedTag
///     OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
///     ClosedTag -- "#lt;false#gt;\n(no event)"\nText                    --> OpenedTag
///   end
///   ClosedTag -- "#lt;true#gt;"\nStart --> Empty
///   Empty     -- End                   --> ClosedTag
///   _ -. Eof .-> Exit
/// ```
#[derive(Clone)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Transition from that
    /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
    /// state is always `OpenedTag`. The reader will never return to this state. The
    /// event emitted during transition to `OpenedTag` is a `Text` event if the
    /// first symbol is not `<`, otherwise no event is emitted.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `ClosedTag` state.
    OpenedTag,
    /// State in which the reader searches for the `<` symbol of a markup. All bytes
    /// before that symbol will be returned in the [`Event::Text`] event. After that
    /// the reader moves to the `OpenedTag` state.
    ClosedTag,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// The reader enters this state when it is in a `ClosedTag` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `ClosedTag` state.
    ///
    /// [`expand_empty_elements`]: ReaderState::expand_empty_elements
    Empty,
    /// The reader enters this state when an `Eof` event is generated or an error occurs.
    /// This is the last state; the reader stays in it forever.
    Exit,
}
418
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using a BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by a BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of the way it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(enc)
            | Self::Explicit(enc)
            | Self::BomDetected(enc)
            | Self::XmlDetected(enc) => enc,
        }
    }

    /// Returns `true` for the `Implicit` and `BomDetected` variants, i.e. when
    /// the encoding may still be replaced by a more precise one, and `false`
    /// for the `Explicit` and `XmlDetected` variants.
    #[inline]
    fn can_be_refined(&self) -> bool {
        matches!(self, Self::Implicit(_) | Self::BomDetected(_))
    }
}
465
466////////////////////////////////////////////////////////////////////////////////////////////////////
467
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Clone)]
pub struct Reader<R> {
    /// Source of data to parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
529
530/// Builder methods
531impl<R> Reader<R> {
532 /// Creates a `Reader` that reads from a given reader.
533 pub fn from_reader(reader: R) -> Self {
534 Self {
535 reader,
536 state: ReaderState::default(),
537 }
538 }
539
540 configure_methods!();
541}
542
543/// Getters
544impl<R> Reader<R> {
545 /// Consumes `Reader` returning the underlying reader
546 ///
547 /// Can be used to compute line and column of a parsing error position
548 ///
549 /// # Examples
550 ///
551 /// ```
552 /// # use pretty_assertions::assert_eq;
553 /// use std::{str, io::Cursor};
554 /// use quick_xml::events::Event;
555 /// use quick_xml::reader::Reader;
556 ///
557 /// let xml = r#"<tag1 att1 = "test">
558 /// <tag2><!--Test comment-->Test</tag2>
559 /// <tag3>Test 2</tag3>
560 /// </tag1>"#;
561 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
562 /// let mut buf = Vec::new();
563 ///
564 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
565 /// let end_pos = reader.buffer_position();
566 /// let mut cursor = reader.into_inner();
567 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
568 /// .expect("can't make a string");
569 /// let mut line = 1;
570 /// let mut column = 0;
571 /// for c in s.chars() {
572 /// if c == '\n' {
573 /// line += 1;
574 /// column = 0;
575 /// } else {
576 /// column += 1;
577 /// }
578 /// }
579 /// (line, column)
580 /// }
581 ///
582 /// loop {
583 /// match reader.read_event_into(&mut buf) {
584 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
585 /// b"tag1" | b"tag2" => (),
586 /// tag => {
587 /// assert_eq!(b"tag3", tag);
588 /// assert_eq!((3, 22), into_line_and_column(reader));
589 /// break;
590 /// }
591 /// },
592 /// Ok(Event::Eof) => unreachable!(),
593 /// _ => (),
594 /// }
595 /// buf.clear();
596 /// }
597 /// ```
598 pub fn into_inner(self) -> R {
599 self.reader
600 }
601
602 /// Gets a reference to the underlying reader.
603 pub fn get_ref(&self) -> &R {
604 &self.reader
605 }
606
607 /// Gets a mutable reference to the underlying reader.
608 pub fn get_mut(&mut self) -> &mut R {
609 &mut self.reader
610 }
611
612 /// Gets the current byte position in the input data.
613 ///
614 /// Useful when debugging errors.
615 pub fn buffer_position(&self) -> usize {
616 // when internal state is OpenedTag, we have actually read until '<',
617 // which we don't want to show
618 if let ParseState::OpenedTag = self.state.state {
619 self.state.offset - 1
620 } else {
621 self.state.offset
622 }
623 }
624
625 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
626 ///
627 /// If [`encoding`] feature is enabled, the used encoding may change after
628 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
629 ///
630 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
631 /// defaults to UTF-8.
632 ///
633 /// [`encoding`]: ../index.html#encoding
634 #[inline]
635 pub fn decoder(&self) -> Decoder {
636 self.state.decoder()
637 }
638}
639
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// The body is the expansion of `read_event_impl!`, which dispatches to
    /// [`Self::read_until_open`] and [`Self::read_until_close`] depending on
    /// the current parse state.
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
    }

    /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
    ///
    /// Returns inner `Ok` if the loop should be broken and an event returned.
    /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular.
    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<std::result::Result<Event<'i>, B>>
    where
        R: XmlSource<'i, B>,
    {
        // The last argument is required by the macro matcher but is not used by its expansion
        read_until_open!(self, buf, self.reader, read_event_impl)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
672
673////////////////////////////////////////////////////////////////////////////////////////////////////
674
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait: a generic one that reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]`; instead of copying data it returns
/// data borrowed from `Self`. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;

    /// Read input until `byte` is found or end of input is reached.
    ///
    /// Returns a slice of data read up to `byte`; the `byte` itself is not
    /// included in the result.
    /// If input (`Self`) is exhausted, returns `None`.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let mut position = 0;
    /// let mut input = b"abc*def".as_ref();
    /// //                    ^= 4
    ///
    /// assert_eq!(
    ///     input.read_bytes_until(b'*', (), &mut position).unwrap(),
    ///     Some(b"abc".as_ref())
    /// );
    /// assert_eq!(position, 4); // position after the symbol matched
    /// ```
    ///
    /// # Parameters
    /// - `byte`: Byte to search for
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bytes_until(
        &mut self,
        byte: u8,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<&'r [u8]>>;

    /// Read input until a comment, CDATA section or DOCTYPE declaration is finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the kind of the construct together with a slice of data read up
    /// to its end (`>`); the `>` itself is not included in the result.
    ///
    /// If input (`Self`) is exhausted and nothing was read, returns `None`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<(BangType, &'r [u8])>>;

    /// Read input until an XML element is closed by reaching a `>` symbol.
    /// Returns `Some(buffer)` that contains the data between `<` and `>` or
    /// `None` if end-of-input was reached and nothing was read.
    ///
    /// Derived from `read_until`, but modified to handle XML attributes
    /// using a minimal state machine.
    ///
    /// Attribute values are [defined] as follows:
    /// ```plain
    /// AttValue := '"' (([^<&"]) | Reference)* '"'
    ///           | "'" (([^<&']) | Reference)* "'"
    /// ```
    /// (`Reference` is something like `&quot;`, but we don't care about
    /// escaped characters at this level)
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
    /// [events]: crate::events::Event
    fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by the amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;

    /// Consume and discard one character if it matches the given byte. Return
    /// `true` if it matched.
    ///
    /// # Parameters
    /// - `position`: Will be increased by 1 if byte is matched
    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> Result<Option<u8>>;
}
794
/// Possible elements that start with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// CDATA section: `<![CDATA[...]]>`
    CData,
    /// Comment: `<!--...-->`
    Comment,
    /// Document type declaration: `<!DOCTYPE...>`
    DocType,
}
impl BangType {
    /// Selects the kind of the construct by one byte:
    ///
    /// |Byte      |Result
    /// |----------|------------------------------
    /// |`[`       |[`Self::CData`]
    /// |`-`       |[`Self::Comment`]
    /// |`D` or `d`|[`Self::DocType`]
    /// |_other_   |[`Error::UnexpectedBang`]
    /// |`None`    |[`Error::UnexpectedEof`]
    ///
    /// NOTE(review): `byte` is presumably the byte that follows `<!` in the
    /// input -- confirm against the callers, which are not part of this module.
    #[inline(always)]
    fn new(byte: Option<u8>) -> Result<Self> {
        Ok(match byte {
            Some(b'[') => Self::CData,
            Some(b'-') => Self::Comment,
            Some(b'D') | Some(b'd') => Self::DocType,
            Some(b) => return Err(Error::UnexpectedBang(b)),
            None => return Err(Error::UnexpectedEof("Bang".to_string())),
        })
    }

    /// If the element is finished, returns its content up to the `>` symbol and
    /// the number of bytes of `chunk` to consume (the index of that symbol plus
    /// one), otherwise returns `None`.
    ///
    /// Every `>` in `chunk` is tried in order as a possible end of the element.
    ///
    /// # Parameters
    /// - `buf`: buffer with data consumed on previous iterations
    /// - `chunk`: data read on current iteration and not yet consumed from reader
    #[inline(always)]
    fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
        for i in memchr::memchr_iter(b'>', chunk) {
            match self {
                // Need to read at least 6 symbols (`!---->`) for properly finished comment
                // <!----> - XML comment
                //  012345 - i
                Self::Comment if buf.len() + i > 4 => {
                    if chunk[..i].ends_with(b"--") {
                        // We cannot strip last `--` from the buffer because we need it in case of
                        // check_comments enabled option. XML standard requires that comment
                        // will not end with `--->` sequence because this is a special case of
                        // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
                        return Some((&chunk[..i], i + 1)); // +1 for `>`
                    }
                    // End sequence `-|->` was split at |
                    //        buf --/   \-- chunk
                    if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
                        return Some((&chunk[..i], i + 1)); // +1 for `>`
                    }
                    // End sequence `--|>` was split at |
                    //         buf --/   \-- chunk
                    if i == 0 && buf.ends_with(b"--") {
                        return Some((&[], i + 1)); // +1 for `>`
                    }
                }
                // Not enough data read yet for this `>` to finish a comment
                Self::Comment => {}
                Self::CData => {
                    if chunk[..i].ends_with(b"]]") {
                        return Some((&chunk[..i], i + 1)); // +1 for `>`
                    }
                    // End sequence `]|]>` was split at |
                    //        buf --/   \-- chunk
                    if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
                        return Some((&chunk[..i], i + 1)); // +1 for `>`
                    }
                    // End sequence `]]|>` was split at |
                    //         buf --/   \-- chunk
                    if i == 0 && buf.ends_with(b"]]") {
                        return Some((&[], i + 1)); // +1 for `>`
                    }
                }
                Self::DocType => {
                    let content = &chunk[..i];
                    // The `>` at `i` finishes the declaration only if every `<`
                    // that precedes it in this chunk has a matching `>`
                    let balance = memchr::memchr2_iter(b'<', b'>', content)
                        .map(|p| if content[p] == b'<' { 1i32 } else { -1 })
                        .sum::<i32>();
                    if balance == 0 {
                        return Some((content, i + 1)); // +1 for `>`
                    }
                }
            }
        }
        None
    }

    /// Creates the [`Error::UnexpectedEof`] error that carries the name of
    /// this construct (`CData`, `Comment` or `DOCTYPE`).
    #[inline]
    fn to_err(&self) -> Error {
        let bang_str = match self {
            Self::CData => "CData",
            Self::Comment => "Comment",
            Self::DocType => "DOCTYPE",
        };
        Error::UnexpectedEof(bang_str.to_string())
    }
}
888
/// State machine for the [`XmlSource::read_element`].
///
/// Tracks whether the scanner is currently inside a quoted attribute value,
/// because a `>` found there does not close the element.
#[derive(Clone, Copy)]
enum ReadElementState {
    /// The initial state (inside element, but outside of attribute value)
    Elem,
    /// Inside a single-quoted attribute value
    SingleQ,
    /// Inside a double-quoted attribute value
    DoubleQ,
}
899impl ReadElementState {
900 /// Changes state by analyzing part of input.
901 /// Returns a tuple with part of chunk up to element closing symbol `>`
902 /// and a position after that symbol or `None` if such symbol was not found
903 #[inline(always)]
904 fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
905 for i: usize in memchr::memchr3_iter(needle1:b'>', needle2:b'\'', needle3:b'"', haystack:chunk) {
906 *self = match (*self, chunk[i]) {
907 // only allowed to match `>` while we are in state `Elem`
908 (Self::Elem, b'>') => return Some((&chunk[..i], i + 1)),
909 (Self::Elem, b'\'') => Self::SingleQ,
910 (Self::Elem, b'\"') => Self::DoubleQ,
911
912 // the only end_byte that gets us out if the same character
913 (Self::SingleQ, b'\'') | (Self::DoubleQ, b'"') => Self::Elem,
914
915 // all other bytes: no state change
916 _ => *self,
917 };
918 }
919 None
920 }
921}
922
/// Tells whether `b` is one of the four whitespace bytes recognised here:
/// space, carriage return, line feed or horizontal tab.
#[inline]
pub(crate) const fn is_whitespace(b: u8) -> bool {
    b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
}
928
929////////////////////////////////////////////////////////////////////////////////////////////////////
930
931#[cfg(test)]
932mod test {
933 /// Checks the internal implementation of the various reader methods
934 macro_rules! check {
935 (
936 #[$test:meta]
937 $read_event:ident,
938 $read_until_close:ident,
939 // constructor of the XML source on which internal functions will be called
940 $source:path,
941 // constructor of the buffer to which read data will stored
942 $buf:expr
943 $(, $async:ident, $await:ident)?
944 ) => {
            /// Checks `read_bytes_until` by searching for the `*` byte in short inputs
            /// and verifying both the returned bytes and the updated `position`.
            mod read_bytes_until {
                use super::*;
                // Use Bytes for printing bytes as strings for ASCII range
                use crate::utils::Bytes;
                use pretty_assertions::assert_eq;

                /// Checks that search in the empty buffer returns `None`
                #[$test]
                $($async)? fn empty() {
                    let buf = $buf;
                    let mut position = 0;
                    let mut input = b"".as_ref();
                    //                ^= 0

                    assert_eq!(
                        $source(&mut input)
                            .read_bytes_until(b'*', buf, &mut position)
                            $(.$await)?
                            .unwrap()
                            .map(Bytes),
                        None
                    );
                    assert_eq!(position, 0);
                }

                /// Checks that search in the buffer non-existent value returns entire buffer
                /// as a result and set `position` to `len()`
                #[$test]
                $($async)? fn non_existent() {
                    let buf = $buf;
                    let mut position = 0;
                    let mut input = b"abcdef".as_ref();
                    //                      ^= 6

                    assert_eq!(
                        $source(&mut input)
                            .read_bytes_until(b'*', buf, &mut position)
                            $(.$await)?
                            .unwrap()
                            .map(Bytes),
                        Some(Bytes(b"abcdef"))
                    );
                    assert_eq!(position, 6);
                }

                /// Checks that search in the buffer an element that is located in the front of
                /// buffer returns empty slice as a result and set `position` to one symbol
                /// after match (`1`)
                #[$test]
                $($async)? fn at_the_start() {
                    let buf = $buf;
                    let mut position = 0;
                    let mut input = b"*abcdef".as_ref();
                    //                 ^= 1

                    assert_eq!(
                        $source(&mut input)
                            .read_bytes_until(b'*', buf, &mut position)
                            $(.$await)?
                            .unwrap()
                            .map(Bytes),
                        Some(Bytes(b""))
                    );
                    assert_eq!(position, 1); // position after the symbol matched
                }

                /// Checks that search in the buffer an element that is located in the middle of
                /// buffer returns slice before that symbol as a result and set `position` to one
                /// symbol after match
                #[$test]
                $($async)? fn inside() {
                    let buf = $buf;
                    let mut position = 0;
                    let mut input = b"abc*def".as_ref();
                    //                    ^= 4

                    assert_eq!(
                        $source(&mut input)
                            .read_bytes_until(b'*', buf, &mut position)
                            $(.$await)?
                            .unwrap()
                            .map(Bytes),
                        Some(Bytes(b"abc"))
                    );
                    assert_eq!(position, 4); // position after the symbol matched
                }

                /// Checks that search in the buffer an element that is located in the end of
                /// buffer returns slice before that symbol as a result and set `position` to one
                /// symbol after match (`len()`)
                #[$test]
                $($async)? fn in_the_end() {
                    let buf = $buf;
                    let mut position = 0;
                    let mut input = b"abcdef*".as_ref();
                    //                       ^= 7

                    assert_eq!(
                        $source(&mut input)
                            .read_bytes_until(b'*', buf, &mut position)
                            $(.$await)?
                            .unwrap()
                            .map(Bytes),
                        Some(Bytes(b"abcdef"))
                    );
                    assert_eq!(position, 7); // position after the symbol matched
                }
            }
1053
1054 mod read_bang_element {
1055 use super::*;
1056
1057 /// Checks that reading CDATA content works correctly
1058 mod cdata {
1059 use super::*;
1060 use crate::errors::Error;
1061 use crate::reader::BangType;
1062 use crate::utils::Bytes;
1063 use pretty_assertions::assert_eq;
1064
1065 /// Checks that if input begins like CDATA element, but CDATA start sequence
1066 /// is not finished, parsing ends with an error
1067 #[$test]
1068 #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1069 $($async)? fn not_properly_start() {
1070 let buf = $buf;
1071 let mut position = 0;
1072 let mut input = b"![]]>other content".as_ref();
1073 // ^= 0
1074
1075 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1076 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1077 x => assert!(
1078 false,
1079 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1080 x
1081 ),
1082 }
1083 assert_eq!(position, 0);
1084 }
1085
1086 /// Checks that if CDATA startup sequence was matched, but an end sequence
1087 /// is not found, parsing ends with an error
1088 #[$test]
1089 $($async)? fn not_closed() {
1090 let buf = $buf;
1091 let mut position = 0;
1092 let mut input = b"![CDATA[other content".as_ref();
1093 // ^= 0
1094
1095 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1096 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1097 x => assert!(
1098 false,
1099 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1100 x
1101 ),
1102 }
1103 assert_eq!(position, 0);
1104 }
1105
1106 /// Checks that CDATA element without content inside parsed successfully
1107 #[$test]
1108 $($async)? fn empty() {
1109 let buf = $buf;
1110 let mut position = 0;
1111 let mut input = b"![CDATA[]]>other content".as_ref();
1112 // ^= 11
1113
1114 assert_eq!(
1115 $source(&mut input)
1116 .read_bang_element(buf, &mut position)
1117 $(.$await)?
1118 .unwrap()
1119 .map(|(ty, data)| (ty, Bytes(data))),
1120 Some((BangType::CData, Bytes(b"![CDATA[]]")))
1121 );
1122 assert_eq!(position, 11);
1123 }
1124
1125 /// Checks that CDATA element with content parsed successfully.
1126 /// Additionally checks that sequences inside CDATA that may look like
1127 /// a CDATA end sequence do not interrupt CDATA parsing
1128 #[$test]
1129 $($async)? fn with_content() {
1130 let buf = $buf;
1131 let mut position = 0;
1132 let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1133 // ^= 28
1134
1135 assert_eq!(
1136 $source(&mut input)
1137 .read_bang_element(buf, &mut position)
1138 $(.$await)?
1139 .unwrap()
1140 .map(|(ty, data)| (ty, Bytes(data))),
1141 Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
1142 );
1143 assert_eq!(position, 28);
1144 }
1145 }
1146
1147 /// Checks that reading XML comments works correctly. According to the [specification],
1148 /// comment data can contain any sequence except `--`:
1149 ///
1150 /// ```peg
1151 /// comment = '<--' (!'--' char)* '-->';
1152 /// char = [#x1-#x2C]
1153 /// / [#x2E-#xD7FF]
1154 /// / [#xE000-#xFFFD]
1155 /// / [#x10000-#x10FFFF]
1156 /// ```
1157 ///
1158 /// The presence of this limitation, however, is simply a poorly designed specification
1159 /// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1160 /// presence of these sequences by default. This tests allow such content.
1161 ///
1162 /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1163 mod comment {
1164 use super::*;
1165 use crate::errors::Error;
1166 use crate::reader::BangType;
1167 use crate::utils::Bytes;
1168 use pretty_assertions::assert_eq;
1169
1170 #[$test]
1171 #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1172 $($async)? fn not_properly_start() {
1173 let buf = $buf;
1174 let mut position = 0;
1175 let mut input = b"!- -->other content".as_ref();
1176 // ^= 0
1177
1178 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1179 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1180 x => assert!(
1181 false,
1182 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1183 x
1184 ),
1185 }
1186 assert_eq!(position, 0);
1187 }
1188
1189 #[$test]
1190 $($async)? fn not_properly_end() {
1191 let buf = $buf;
1192 let mut position = 0;
1193 let mut input = b"!->other content".as_ref();
1194 // ^= 0
1195
1196 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1197 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1198 x => assert!(
1199 false,
1200 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1201 x
1202 ),
1203 }
1204 assert_eq!(position, 0);
1205 }
1206
1207 #[$test]
1208 $($async)? fn not_closed1() {
1209 let buf = $buf;
1210 let mut position = 0;
1211 let mut input = b"!--other content".as_ref();
1212 // ^= 0
1213
1214 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1215 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1216 x => assert!(
1217 false,
1218 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1219 x
1220 ),
1221 }
1222 assert_eq!(position, 0);
1223 }
1224
1225 #[$test]
1226 $($async)? fn not_closed2() {
1227 let buf = $buf;
1228 let mut position = 0;
1229 let mut input = b"!-->other content".as_ref();
1230 // ^= 0
1231
1232 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1233 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1234 x => assert!(
1235 false,
1236 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1237 x
1238 ),
1239 }
1240 assert_eq!(position, 0);
1241 }
1242
1243 #[$test]
1244 $($async)? fn not_closed3() {
1245 let buf = $buf;
1246 let mut position = 0;
1247 let mut input = b"!--->other content".as_ref();
1248 // ^= 0
1249
1250 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1251 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1252 x => assert!(
1253 false,
1254 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1255 x
1256 ),
1257 }
1258 assert_eq!(position, 0);
1259 }
1260
1261 #[$test]
1262 $($async)? fn empty() {
1263 let buf = $buf;
1264 let mut position = 0;
1265 let mut input = b"!---->other content".as_ref();
1266 // ^= 6
1267
1268 assert_eq!(
1269 $source(&mut input)
1270 .read_bang_element(buf, &mut position)
1271 $(.$await)?
1272 .unwrap()
1273 .map(|(ty, data)| (ty, Bytes(data))),
1274 Some((BangType::Comment, Bytes(b"!----")))
1275 );
1276 assert_eq!(position, 6);
1277 }
1278
1279 #[$test]
1280 $($async)? fn with_content() {
1281 let buf = $buf;
1282 let mut position = 0;
1283 let mut input = b"!--->comment<--->other content".as_ref();
1284 // ^= 17
1285
1286 assert_eq!(
1287 $source(&mut input)
1288 .read_bang_element(buf, &mut position)
1289 $(.$await)?
1290 .unwrap()
1291 .map(|(ty, data)| (ty, Bytes(data))),
1292 Some((BangType::Comment, Bytes(b"!--->comment<---")))
1293 );
1294 assert_eq!(position, 17);
1295 }
1296 }
1297
1298 /// Checks that reading DOCTYPE definition works correctly
1299 mod doctype {
1300 use super::*;
1301
1302 mod uppercase {
1303 use super::*;
1304 use crate::errors::Error;
1305 use crate::reader::BangType;
1306 use crate::utils::Bytes;
1307 use pretty_assertions::assert_eq;
1308
1309 #[$test]
1310 $($async)? fn not_properly_start() {
1311 let buf = $buf;
1312 let mut position = 0;
1313 let mut input = b"!D other content".as_ref();
1314 // ^= 0
1315
1316 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1317 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1318 x => assert!(
1319 false,
1320 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1321 x
1322 ),
1323 }
1324 assert_eq!(position, 0);
1325 }
1326
1327 #[$test]
1328 $($async)? fn without_space() {
1329 let buf = $buf;
1330 let mut position = 0;
1331 let mut input = b"!DOCTYPEother content".as_ref();
1332 // ^= 0
1333
1334 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1335 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1336 x => assert!(
1337 false,
1338 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1339 x
1340 ),
1341 }
1342 assert_eq!(position, 0);
1343 }
1344
1345 #[$test]
1346 $($async)? fn empty() {
1347 let buf = $buf;
1348 let mut position = 0;
1349 let mut input = b"!DOCTYPE>other content".as_ref();
1350 // ^= 9
1351
1352 assert_eq!(
1353 $source(&mut input)
1354 .read_bang_element(buf, &mut position)
1355 $(.$await)?
1356 .unwrap()
1357 .map(|(ty, data)| (ty, Bytes(data))),
1358 Some((BangType::DocType, Bytes(b"!DOCTYPE")))
1359 );
1360 assert_eq!(position, 9);
1361 }
1362
1363 #[$test]
1364 $($async)? fn not_closed() {
1365 let buf = $buf;
1366 let mut position = 0;
1367 let mut input = b"!DOCTYPE other content".as_ref();
1368 // ^= 0
1369
1370 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1371 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1372 x => assert!(
1373 false,
1374 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1375 x
1376 ),
1377 }
1378 assert_eq!(position, 0);
1379 }
1380 }
1381
1382 mod lowercase {
1383 use super::*;
1384 use crate::errors::Error;
1385 use crate::reader::BangType;
1386 use crate::utils::Bytes;
1387 use pretty_assertions::assert_eq;
1388
1389 #[$test]
1390 $($async)? fn not_properly_start() {
1391 let buf = $buf;
1392 let mut position = 0;
1393 let mut input = b"!d other content".as_ref();
1394 // ^= 0
1395
1396 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1397 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1398 x => assert!(
1399 false,
1400 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1401 x
1402 ),
1403 }
1404 assert_eq!(position, 0);
1405 }
1406
1407 #[$test]
1408 $($async)? fn without_space() {
1409 let buf = $buf;
1410 let mut position = 0;
1411 let mut input = b"!doctypeother content".as_ref();
1412 // ^= 0
1413
1414 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1415 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1416 x => assert!(
1417 false,
1418 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1419 x
1420 ),
1421 }
1422 assert_eq!(position, 0);
1423 }
1424
1425 #[$test]
1426 $($async)? fn empty() {
1427 let buf = $buf;
1428 let mut position = 0;
1429 let mut input = b"!doctype>other content".as_ref();
1430 // ^= 9
1431
1432 assert_eq!(
1433 $source(&mut input)
1434 .read_bang_element(buf, &mut position)
1435 $(.$await)?
1436 .unwrap()
1437 .map(|(ty, data)| (ty, Bytes(data))),
1438 Some((BangType::DocType, Bytes(b"!doctype")))
1439 );
1440 assert_eq!(position, 9);
1441 }
1442
1443 #[$test]
1444 $($async)? fn not_closed() {
1445 let buf = $buf;
1446 let mut position = 0;
1447 let mut input = b"!doctype other content".as_ref();
1448 // ^= 0
1449
1450 match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1451 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1452 x => assert!(
1453 false,
1454 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1455 x
1456 ),
1457 }
1458 assert_eq!(position, 0);
1459 }
1460 }
1461 }
1462 }
1463
1464 mod read_element {
1465 use super::*;
1466 use crate::utils::Bytes;
1467 use pretty_assertions::assert_eq;
1468
1469 /// Checks that nothing was read from empty buffer
1470 #[$test]
1471 $($async)? fn empty() {
1472 let buf = $buf;
1473 let mut position = 0;
1474 let mut input = b"".as_ref();
1475 // ^= 0
1476
1477 assert_eq!(
1478 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1479 None
1480 );
1481 assert_eq!(position, 0);
1482 }
1483
1484 mod open {
1485 use super::*;
1486 use crate::utils::Bytes;
1487 use pretty_assertions::assert_eq;
1488
1489 #[$test]
1490 $($async)? fn empty_tag() {
1491 let buf = $buf;
1492 let mut position = 0;
1493 let mut input = b">".as_ref();
1494 // ^= 1
1495
1496 assert_eq!(
1497 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1498 Some(Bytes(b""))
1499 );
1500 assert_eq!(position, 1);
1501 }
1502
1503 #[$test]
1504 $($async)? fn normal() {
1505 let buf = $buf;
1506 let mut position = 0;
1507 let mut input = b"tag>".as_ref();
1508 // ^= 4
1509
1510 assert_eq!(
1511 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1512 Some(Bytes(b"tag"))
1513 );
1514 assert_eq!(position, 4);
1515 }
1516
1517 #[$test]
1518 $($async)? fn empty_ns_empty_tag() {
1519 let buf = $buf;
1520 let mut position = 0;
1521 let mut input = b":>".as_ref();
1522 // ^= 2
1523
1524 assert_eq!(
1525 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1526 Some(Bytes(b":"))
1527 );
1528 assert_eq!(position, 2);
1529 }
1530
1531 #[$test]
1532 $($async)? fn empty_ns() {
1533 let buf = $buf;
1534 let mut position = 0;
1535 let mut input = b":tag>".as_ref();
1536 // ^= 5
1537
1538 assert_eq!(
1539 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1540 Some(Bytes(b":tag"))
1541 );
1542 assert_eq!(position, 5);
1543 }
1544
1545 #[$test]
1546 $($async)? fn with_attributes() {
1547 let buf = $buf;
1548 let mut position = 0;
1549 let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1550 // ^= 38
1551
1552 assert_eq!(
1553 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1554 Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#))
1555 );
1556 assert_eq!(position, 38);
1557 }
1558 }
1559
1560 mod self_closed {
1561 use super::*;
1562 use crate::utils::Bytes;
1563 use pretty_assertions::assert_eq;
1564
1565 #[$test]
1566 $($async)? fn empty_tag() {
1567 let buf = $buf;
1568 let mut position = 0;
1569 let mut input = b"/>".as_ref();
1570 // ^= 2
1571
1572 assert_eq!(
1573 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1574 Some(Bytes(b"/"))
1575 );
1576 assert_eq!(position, 2);
1577 }
1578
1579 #[$test]
1580 $($async)? fn normal() {
1581 let buf = $buf;
1582 let mut position = 0;
1583 let mut input = b"tag/>".as_ref();
1584 // ^= 5
1585
1586 assert_eq!(
1587 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1588 Some(Bytes(b"tag/"))
1589 );
1590 assert_eq!(position, 5);
1591 }
1592
1593 #[$test]
1594 $($async)? fn empty_ns_empty_tag() {
1595 let buf = $buf;
1596 let mut position = 0;
1597 let mut input = b":/>".as_ref();
1598 // ^= 3
1599
1600 assert_eq!(
1601 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1602 Some(Bytes(b":/"))
1603 );
1604 assert_eq!(position, 3);
1605 }
1606
1607 #[$test]
1608 $($async)? fn empty_ns() {
1609 let buf = $buf;
1610 let mut position = 0;
1611 let mut input = b":tag/>".as_ref();
1612 // ^= 6
1613
1614 assert_eq!(
1615 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1616 Some(Bytes(b":tag/"))
1617 );
1618 assert_eq!(position, 6);
1619 }
1620
1621 #[$test]
1622 $($async)? fn with_attributes() {
1623 let buf = $buf;
1624 let mut position = 0;
1625 let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1626 // ^= 41
1627
1628 assert_eq!(
1629 $source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1630 Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#))
1631 );
1632 assert_eq!(position, 41);
1633 }
1634 }
1635 }
1636
1637 mod issue_344 {
1638 use crate::errors::Error;
1639 use crate::reader::Reader;
1640
1641 #[$test]
1642 $($async)? fn cdata() {
1643 let mut reader = Reader::from_str("![]]>");
1644
1645 match reader.$read_until_close($buf) $(.$await)? {
1646 Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1647 x => assert!(
1648 false,
1649 r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1650 x
1651 ),
1652 }
1653 }
1654
1655 #[$test]
1656 $($async)? fn comment() {
1657 let mut reader = Reader::from_str("!- -->");
1658
1659 match reader.$read_until_close($buf) $(.$await)? {
1660 Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1661 x => assert!(
1662 false,
1663 r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1664 x
1665 ),
1666 }
1667 }
1668
1669 #[$test]
1670 $($async)? fn doctype_uppercase() {
1671 let mut reader = Reader::from_str("!D>");
1672
1673 match reader.$read_until_close($buf) $(.$await)? {
1674 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1675 x => assert!(
1676 false,
1677 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1678 x
1679 ),
1680 }
1681 }
1682
1683 #[$test]
1684 $($async)? fn doctype_lowercase() {
1685 let mut reader = Reader::from_str("!d>");
1686
1687 match reader.$read_until_close($buf) $(.$await)? {
1688 Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1689 x => assert!(
1690 false,
1691 r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1692 x
1693 ),
1694 }
1695 }
1696 }
1697
            /// Ensures, that no empty `Text` events are generated
            mod $read_event {
                use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
                use crate::reader::Reader;
                use pretty_assertions::assert_eq;

                /// When `encoding` feature is enabled, encoding should be detected
                /// from BOM (UTF-8) and BOM should be stripped.
                ///
                /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
                /// character should be stripped for consistency
                #[$test]
                $($async)? fn bom_from_reader() {
                    let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());

                    // Only the first BOM is stripped; the second one is ordinary text
                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Text(BytesText::from_escaped("\u{feff}"))
                    );

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// When parsing from &str, encoding is fixed (UTF-8), so
                ///   - when `encoding` feature is disabled, the behavior the
                ///     same as in `bom_from_reader` test
                ///   - when `encoding` feature is enabled, the behavior should
                ///     stay consistent, so the first BOM character is stripped
                #[$test]
                $($async)? fn bom_from_str() {
                    let mut reader = Reader::from_str("\u{feff}\u{feff}");

                    // Only the first BOM is stripped; the second one is ordinary text
                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Text(BytesText::from_escaped("\u{feff}"))
                    );

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// `<?xml ?>` is reported as a single `Decl` event
                #[$test]
                $($async)? fn declaration() {
                    let mut reader = Reader::from_str("<?xml ?>");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
                    );
                }

                /// `<!DOCTYPE x>` is reported as a `DocType` event with content `x`
                #[$test]
                $($async)? fn doctype() {
                    let mut reader = Reader::from_str("<!DOCTYPE x>");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::DocType(BytesText::from_escaped("x"))
                    );
                }

                /// `<?xml-stylesheet?>` is reported as a `PI` event, not as a declaration
                #[$test]
                $($async)? fn processing_instruction() {
                    let mut reader = Reader::from_str("<?xml-stylesheet?>");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::PI(BytesText::from_escaped("xml-stylesheet"))
                    );
                }

                /// `<tag>` is reported as a `Start` event
                #[$test]
                $($async)? fn start() {
                    let mut reader = Reader::from_str("<tag>");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Start(BytesStart::new("tag"))
                    );
                }

                /// `</tag>` is reported as an `End` event
                #[$test]
                $($async)? fn end() {
                    let mut reader = Reader::from_str("</tag>");
                    // Because we expect invalid XML, do not check that
                    // the end name paired with the start name
                    reader.check_end_names(false);

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::End(BytesEnd::new("tag"))
                    );
                }

                /// `<tag/>` is reported as an `Empty` event
                #[$test]
                $($async)? fn empty() {
                    let mut reader = Reader::from_str("<tag/>");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Empty(BytesStart::new("tag"))
                    );
                }

                /// Bare character data is reported as a `Text` event
                #[$test]
                $($async)? fn text() {
                    let mut reader = Reader::from_str("text");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Text(BytesText::from_escaped("text"))
                    );
                }

                /// An empty CDATA section is reported as a `CData` event with empty content
                #[$test]
                $($async)? fn cdata() {
                    let mut reader = Reader::from_str("<![CDATA[]]>");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::CData(BytesCData::new(""))
                    );
                }

                /// An empty comment is reported as a `Comment` event with empty content
                #[$test]
                $($async)? fn comment() {
                    let mut reader = Reader::from_str("<!---->");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Comment(BytesText::from_escaped(""))
                    );
                }

                /// Empty input is reported as `Eof` straight away
                #[$test]
                $($async)? fn eof() {
                    let mut reader = Reader::from_str("");

                    assert_eq!(
                        reader.$read_event($buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }
            }
1847 };
1848 }
1849
    /// Tests for https://github.com/tafia/quick-xml/issues/469
    ///
    /// Every test wraps the input in a `$BufReader` whose capacity is chosen so
    /// that the closing sequence of the markup (`?>`, `/>`, `]]>` or `-->`) does
    /// not fit into it completely, and then checks that the event is still read
    /// in full and is followed by `Eof`.
    macro_rules! small_buffers {
        (
            #[$test:meta]
            $read_event:ident: $BufReader:ty
            $(, $async:ident, $await:ident)?
        ) => {
            mod small_buffers {
                use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
                use crate::reader::Reader;
                use pretty_assertions::assert_eq;

                /// XML declaration: the capacity ends right after the `?` of `?>`
                #[$test]
                $($async)? fn decl() {
                    let xml = "<?xml ?>";
                    //         ^^^^^^^ data that fit into buffer
                    let size = xml.match_indices("?>").next().unwrap().0 + 1;
                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                    let mut reader = Reader::from_reader(br);
                    let mut buf = Vec::new();

                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
                    );
                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// Processing instruction: the capacity ends right after the `?` of `?>`
                #[$test]
                $($async)? fn pi() {
                    let xml = "<?pi?>";
                    //         ^^^^^ data that fit into buffer
                    let size = xml.match_indices("?>").next().unwrap().0 + 1;
                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                    let mut reader = Reader::from_reader(br);
                    let mut buf = Vec::new();

                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::PI(BytesText::new("pi"))
                    );
                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// Self-closed element: the capacity ends right after the `/` of `/>`
                #[$test]
                $($async)? fn empty() {
                    let xml = "<empty/>";
                    //         ^^^^^^^ data that fit into buffer
                    let size = xml.match_indices("/>").next().unwrap().0 + 1;
                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                    let mut reader = Reader::from_reader(br);
                    let mut buf = Vec::new();

                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Empty(BytesStart::new("empty"))
                    );
                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// CDATA: the capacity ends after the first `]` of `]]>`
                #[$test]
                $($async)? fn cdata1() {
                    let xml = "<![CDATA[cdata]]>";
                    //         ^^^^^^^^^^^^^^^ data that fit into buffer
                    let size = xml.match_indices("]]>").next().unwrap().0 + 1;
                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                    let mut reader = Reader::from_reader(br);
                    let mut buf = Vec::new();

                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::CData(BytesCData::new("cdata"))
                    );
                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// CDATA: the capacity ends after the second `]` of `]]>`
                #[$test]
                $($async)? fn cdata2() {
                    let xml = "<![CDATA[cdata]]>";
                    //         ^^^^^^^^^^^^^^^^ data that fit into buffer
                    let size = xml.match_indices("]]>").next().unwrap().0 + 2;
                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                    let mut reader = Reader::from_reader(br);
                    let mut buf = Vec::new();

                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::CData(BytesCData::new("cdata"))
                    );
                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// Comment: the capacity ends after the first `-` of `-->`
                #[$test]
                $($async)? fn comment1() {
                    let xml = "<!--comment-->";
                    //         ^^^^^^^^^^^^ data that fit into buffer
                    let size = xml.match_indices("-->").next().unwrap().0 + 1;
                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                    let mut reader = Reader::from_reader(br);
                    let mut buf = Vec::new();

                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Comment(BytesText::new("comment"))
                    );
                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }

                /// Comment: the capacity ends after the second `-` of `-->`
                #[$test]
                $($async)? fn comment2() {
                    let xml = "<!--comment-->";
                    //         ^^^^^^^^^^^^^ data that fit into buffer
                    let size = xml.match_indices("-->").next().unwrap().0 + 2;
                    let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                    let mut reader = Reader::from_reader(br);
                    let mut buf = Vec::new();

                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Comment(BytesText::new("comment"))
                    );
                    assert_eq!(
                        reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                        Event::Eof
                    );
                }
            }
        };
    }
1997
1998 // Export macros for the child modules:
1999 // - buffered_reader
2000 // - slice_reader
2001 pub(super) use check;
2002 pub(super) use small_buffers;
2003}
2004