mod.rs source code [crates/quick-xml-0.30.0/src/reader/mod.rs]

1	//! Contains high-level interface for a pull-based XML parser.
2
3	#[cfg(feature = "encoding")]
4	use encoding_rs::Encoding;
5	use std::ops::Range;
6
7	use crate::encoding::Decoder;
8	use crate::errors::{Error, Result};
9	use crate::events::Event;
10	use crate::reader::parser::Parser;
11
12	use memchr;
13
macro_rules! configure_methods {
    ($($inner:ident)?) => {
        /// Controls whether a self-closing tag is reported as a pair of events.
        ///
        /// With `true`, every self-closing tag such as `<tag/>` is delivered as a
        /// [`Start`] event immediately followed by an [`End`] event instead of a
        /// single [`Empty`] event. With `false` (the default) such a tag is
        /// delivered as one [`Empty`] event.
        ///
        /// Note that enabling this option leads to an additional allocation that is
        /// needed to store the tag name for the [`End`] event. If
        /// [`check_end_names`] is enabled as well, only one additional allocation is
        /// performed, and it serves both options.
        ///
        /// (`false` by default)
        ///
        /// [`Empty`]: Event::Empty
        /// [`Start`]: Event::Start
        /// [`End`]: Event::End
        /// [`check_end_names`]: Self::check_end_names
        pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self {
            let parser = &mut self $(.$inner)? .parser;
            parser.expand_empty_elements = val;
            self
        }

        /// Controls whether whitespace around character data is stripped.
        ///
        /// With `true`, every [`Text`] event is trimmed on both sides. An event
        /// that becomes empty after trimming is not emitted at all.
        ///
        /// Changing this option automatically changes the [`trim_text_end`] option.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event is trimmed, which is
        /// incorrect when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the events that need it.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`trim_text_end`]: Self::trim_text_end
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text(&mut self, val: bool) -> &mut Self {
            let parser = &mut self $(.$inner)? .parser;
            // Both sides are switched together; `trim_text_end` can refine the
            // trailing side afterwards.
            parser.trim_text_start = val;
            parser.trim_text_end = val;
            self
        }

        /// Controls whether whitespace after character data is stripped.
        ///
        /// With `true`, trailing whitespace is removed from [`Text`] events. An
        /// event that becomes empty after trimming is not emitted at all.
        ///
        /// (`false` by default).
        ///
        /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
        ///
        /// WARNING: With this option every text event is trimmed, which is
        /// incorrect when text events are delimited by comments, processing
        /// instructions or CDATA sections. To trim data correctly, manually apply
        /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
        /// only to the events that need it.
        /// </div>
        ///
        /// [`Text`]: Event::Text
        /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
        /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
        pub fn trim_text_end(&mut self, val: bool) -> &mut Self {
            let parser = &mut self $(.$inner)? .parser;
            parser.trim_text_end = val;
            self
        }

        /// Controls whether trailing whitespace after the markup name is removed
        /// in closing tags such as `</a >`.
        ///
        /// With `true`, the emitted [`End`] event has the whitespace that follows
        /// the markup name stripped.
        ///
        /// Note that when this is `false` while `check_end_names` is `true`, the
        /// comparison of markup names fails erroneously for a closing tag that
        /// contains trailing whitespace.
        ///
        /// (`true` by default)
        ///
        /// [`End`]: Event::End
        pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self {
            let parser = &mut self $(.$inner)? .parser;
            parser.trim_markup_names_in_closing_tags = val;
            self
        }

        /// Controls whether mismatched closing tag names are detected.
        ///
        /// Start and end tags [should match literally][spec]: they cannot use
        /// different prefixes even if both prefixes resolve to the same namespace.
        /// The XML
        ///
        /// ```xml
        /// <outer xmlns="namespace" xmlns:p="namespace">
        /// </p:outer>
        /// ```
        ///
        /// is not valid, even though semantically the start tag is the same as the
        /// end tag. The reason is that namespaces are an extension of the original
        /// XML specification (without namespaces) and it should stay
        /// backward-compatible.
        ///
        /// With `false`, a closing tag is not compared with the corresponding
        /// opening tag. For example, `<mytag></different_tag>` is permitted.
        ///
        /// If the XML is known to be sane (already processed, etc.) disabling the
        /// check saves extra time.
        ///
        /// Note that the emitted [`End`] event is not modified when the check is
        /// disabled, i.e. it contains the data of the mismatched end tag.
        ///
        /// Note that enabling this option leads to an additional allocation that is
        /// needed to store the tag name for an [`End`] event. If
        /// [`expand_empty_elements`] is enabled as well, only one additional
        /// allocation is performed, and it serves both options.
        ///
        /// (`true` by default)
        ///
        /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
        /// [`End`]: Event::End
        /// [`expand_empty_elements`]: Self::expand_empty_elements
        pub fn check_end_names(&mut self, val: bool) -> &mut Self {
            let parser = &mut self $(.$inner)? .parser;
            parser.check_end_names = val;
            self
        }

        /// Controls whether comments are validated.
        ///
        /// With `true`, every [`Comment`] event is checked for not containing `--`,
        /// which is not allowed in XML comments. Most of the time comments are not
        /// wanted at all and their correctness does not matter, so the default is
        /// `false` to improve performance.
        ///
        /// (`false` by default)
        ///
        /// [`Comment`]: Event::Comment
        pub fn check_comments(&mut self, val: bool) -> &mut Self {
            let parser = &mut self $(.$inner)? .parser;
            parser.check_comments = val;
            self
        }
    };
}
160
/// Body of the `read_event_impl` methods, shared by the sync and async readers.
///
/// Drives the parser state machine until one event (or an error) is available
/// and evaluates to that result. The expansion uses `?`, so it must be placed in
/// a function whose return type accepts the reader's error.
///
/// # Parameters
/// - `$self`: the reader object; `$self.parser` holds the state and the options
/// - `$buf`: the buffer passed to the helper methods. It is re-assigned when the
///   "open" helper hands it back, so it must be a mutable binding
/// - `$reader`: expression that yields the data source
/// - `$read_until_open`: name of the method that reads up to the next `<`
/// - `$read_until_close`: name of the method that reads up to the closing `>`
/// - `$await`: optional `await` keyword for the async flavour
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_open:ident,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            match $self.parser.state {
                ParseState::Init => { // Go to OpenedTag state
                    // If encoding set explicitly, we not need to detect it. For example,
                    // explicit UTF-8 set automatically if Reader was created using `from_str`.
                    // But we still need to remove BOM for consistency with no encoding
                    // feature enabled path
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.parser.encoding.can_be_refined() {
                            $self.parser.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event was produced: take the buffer back and make
                        // another turn of the loop in the new state
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                ParseState::ClosedTag => { // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                // Go to ClosedTag state in next two arms
                ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?,
                ParseState::Empty => break $self.parser.close_expanded_empty(),
                ParseState::Exit => break Ok(Event::Eof),
            };
        };
        // An error or the end of input is final: every later call returns `Eof`
        match event {
            Err(_) | Ok(Event::Eof) => $self.parser.state = ParseState::Exit,
            _ => {}
        }
        event
    }};
}
214
/// Consumes input up to and including the next `<`.
///
/// The parser is switched to the `OpenedTag` state first. Leading whitespace is
/// skipped when [`Parser::trim_text_start`] is `true`. If the current byte is
/// then already `<`, it is skipped and no event is produced: the buffer is given
/// back as `Ok(Err($buf))` so that the caller can continue with the markup.
/// Otherwise all bytes before the next `<` are read and passed to
/// `emit_text`, and the outcome is wrapped as `Ok(Ok(event))`. End of input
/// produces `Ok(Ok(Event::Eof))`.
///
/// The expansion contains `return` and `?`, so it has to be the body of a
/// function that returns `Result<Result<Event, B>>`.
///
/// This code is executed in two cases:
/// - at the start of parsing, right after the BOM (if any) was skipped
/// - after parsing `</tag>` or `<tag>`
macro_rules! read_until_open {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_event:ident
        $(, $await:ident)?
    ) => {{
        $self.parser.state = ParseState::OpenedTag;

        if $self.parser.trim_text_start {
            $reader.skip_whitespace(&mut $self.parser.offset) $(.$await)? ?;
        }

        // Standing right at `<` means there is no text in between, so an empty
        // Text event must not be produced
        let at_markup = $reader.skip_one(b'<', &mut $self.parser.offset) $(.$await)? ?;
        if at_markup {
            // Hand $buf over to the next iteration of the parsing loop
            return Ok(Err($buf));
        }

        let text = $reader
            .read_bytes_until(b'<', $buf, &mut $self.parser.offset)
            $(.$await)?;
        match text {
            Err(e) => Err(e),
            Ok(None) => Ok(Ok(Event::Eof)),
            // Everything before `<` becomes the content of a Text event
            Ok(Some(content)) => $self.parser.emit_text(content).map(Ok),
        }
    }};
}
254
/// Consumes input up to and including the `>` that finishes the current markup.
///
/// Must be used right after a `<` was seen and skipped. The next (not yet
/// consumed) byte selects what is read and which [`Event`] is produced:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// If the input ends before anything could be read, the result is
/// `Ok(Event::Eof)`.
///
/// The parser is switched to the `ClosedTag` state before anything is read.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.parser.state = ParseState::ClosedTag;

        match $reader.peek_one() $(.$await)? {
            Err(e) => Err(e),
            // Input ended right after `<`
            Ok(None) => Ok(Event::Eof),
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => {
                let bang = $reader
                    .read_bang_element($buf, &mut $self.parser.offset)
                    $(.$await)?;
                match bang {
                    Err(e) => Err(e),
                    Ok(None) => Ok(Event::Eof),
                    Ok(Some((kind, content))) => $self.parser.emit_bang(kind, content),
                }
            }
            // `</` - closing tag
            Ok(Some(b'/')) => {
                let closing = $reader
                    .read_bytes_until(b'>', $buf, &mut $self.parser.offset)
                    $(.$await)?;
                match closing {
                    Err(e) => Err(e),
                    Ok(None) => Ok(Event::Eof),
                    Ok(Some(content)) => $self.parser.emit_end(content),
                }
            }
            // `<?` - processing instruction
            Ok(Some(b'?')) => {
                let instruction = $reader
                    .read_bytes_until(b'>', $buf, &mut $self.parser.offset)
                    $(.$await)?;
                match instruction {
                    Err(e) => Err(e),
                    Ok(None) => Ok(Event::Eof),
                    Ok(Some(content)) => $self.parser.emit_question_mark(content),
                }
            }
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => {
                let element = $reader
                    .read_element($buf, &mut $self.parser.offset)
                    $(.$await)?;
                match element {
                    Err(e) => Err(e),
                    Ok(None) => Ok(Event::Eof),
                    Ok(Some(content)) => $self.parser.emit_start(content),
                }
            }
        }
    }};
}
325
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the end tag that matches `$end` is found and evaluates to
/// the range `start..end`, where `start` is the reader position at the moment of
/// expansion and `end` is the position just before the matching end tag was read.
///
/// Nested elements with the same name are tracked with a depth counter: each
/// `Start` event named `$end` increments it and each `End` event named `$end`
/// decrements it; the loop stops on an `End` event seen at depth zero.
///
/// The expansion uses `return` for failures, so it has to be placed in a function
/// that returns `Result`:
/// - an error from `$read_event` is returned as is
/// - reaching `Eof` before the end tag returns [`Error::UnexpectedEof`]
///
/// # Parameters
/// - `$self`: the reader
/// - `$end`: name of the element whose end tag is searched
/// - `$buf`: buffer expression passed to every `$read_event` call
/// - `$read_event`: name of the method that reads one event
/// - `$clear`: block executed before each event is read
/// - `$await`: optional `await` keyword for the async flavour
macro_rules! read_to_end {
    (
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        let start = $self.buffer_position();
        let mut depth = 0;
        loop {
            $clear
            // Position before the next event; becomes the end of the span when
            // that event turns out to be the matching end tag
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => return Err(e),

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    let name = $self.decoder().decode($end.as_ref());
                    return Err(Error::UnexpectedEof(format!("</{:?}>", name)));
                }
                _ => (),
            }
        }
    }};
}
359
360	#[cfg(feature = "async-tokio")]
361	mod async_tokio;
362	mod buffered_reader;
363	mod ns_reader;
364	mod parser;
365	mod slice_reader;
366
367	pub use ns_reader::NsReader;
368
/// Range of input in bytes that corresponds to some piece of XML.
///
/// This is a plain [`Range<usize>`], i.e. the start offset is included and the
/// end offset is excluded.
pub type Span = Range<usize>;
371
372	////////////////////////////////////////////////////////////////////////////////////////////////////
373
/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Reader::expand_empty_elements()`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init      -- "(no event)"\n                                       --> OpenedTag
///     OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
///     ClosedTag -- "#lt;false#gt;\n(no event)"\nText                    --> OpenedTag
///   end
///   ClosedTag -- "#lt;true#gt;"\nStart --> Empty
///   Empty     -- End                   --> ClosedTag
///   _ -. Eof .-> Exit
/// ```
#[derive(Clone)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Reading from this
    /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
    /// state is always `OpenedTag`. The reader never returns to this state. The
    /// transition to `OpenedTag` emits a `Text` event if the first symbol is not
    /// `<`, otherwise no event is emitted.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `ClosedTag` state.
    OpenedTag,
    /// State in which the reader searches for the `<` symbol of a markup. All
    /// bytes before that symbol are returned in the [`Event::Text`] event. After
    /// that the reader moves to the `OpenedTag` state.
    ClosedTag,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in a `ClosedTag` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `ClosedTag` state.
    ///
    /// [`expand_empty_elements`]: Parser::expand_empty_elements
    Empty,
    /// Reader enters this state when an `Eof` event was generated or an error
    /// occurred. This is the last state, the reader stays in it forever.
    Exit,
}
418
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It can be changed
    /// neither by BOM, nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        match *self {
            Self::Implicit(e)
            | Self::Explicit(e)
            | Self::BomDetected(e)
            | Self::XmlDetected(e) => e,
        }
    }
    /// Returns `true` if a more reliable source of information is still allowed
    /// to replace this encoding: that is the case for the `Implicit` and
    /// `BomDetected` variants only.
    #[inline]
    fn can_be_refined(&self) -> bool {
        match self {
            Self::Implicit(_) | Self::BomDetected(_) => true,
            Self::Explicit(_) | Self::XmlDetected(_) => false,
        }
    }
}
465
466	////////////////////////////////////////////////////////////////////////////////////////////////////
467
/// A low level encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // when the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Clone)]
pub struct Reader<R> {
    /// Source of data for parse
    reader: R,
    /// Configuration and current parse state
    parser: Parser,
}
529
530	/// Builder methods
531	impl<R> Reader<R> {
532	/// Creates a `Reader` that reads from a given reader.
533	pub fn from_reader(reader: R) -> Self {
534	Self {
535	reader,
536	parser: Parser::default(),
537	}
538	}
539
540	configure_methods!();
541	}
542
543	/// Getters
544	impl<R> Reader<R> {
545	/// Consumes `Reader` returning the underlying reader
546	///
547	/// Can be used to compute line and column of a parsing error position
548	///
549	/// # Examples
550	///
551	/// ```
552	/// # use pretty_assertions::assert_eq;
553	/// use std::{str, io::Cursor};
554	/// use quick_xml::events::Event;
555	/// use quick_xml::reader::Reader;
556	///
557	/// let xml = r#"<tag1 att1 = "test">
558	/// <tag2><!--Test comment-->Test</tag2>
559	/// <tag3>Test 2</tag3>
560	/// </tag1>"#;
561	/// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
562	/// let mut buf = Vec::new();
563	///
564	/// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
565	/// let end_pos = reader.buffer_position();
566	/// let mut cursor = reader.into_inner();
567	/// let s = String::from_utf8(cursor.into_inner()[`0`..end_pos].to_owned())
568	/// .expect("can't make a string");
569	/// let mut line = `1`;
570	/// let mut column = `0`;
571	/// for c in s.chars() {
572	/// if c == '`\n`' {
573	/// line += `1`;
574	/// column = `0`;
575	/// } else {
576	/// column += `1`;
577	/// }
578	/// }
579	/// (line, column)
580	/// }
581	///
582	/// loop {
583	/// match reader.read_event_into(&mut buf) {
584	/// Ok(Event::Start(ref e)) => match e.name().as_ref() {
585	/// b"tag1" \| b"tag2" => (),
586	/// tag => {
587	/// assert_eq!(b"tag3", tag);
588	/// assert_eq!((`3`, `22`), into_line_and_column(reader));
589	/// break;
590	/// }
591	/// },
592	/// Ok(Event::Eof) => unreachable!(),
593	/// _ => (),
594	/// }
595	/// buf.clear();
596	/// }
597	/// ```
598	pub fn into_inner(self) -> R {
599	self.reader
600	}
601
602	/// Gets a reference to the underlying reader.
603	pub fn get_ref(&self) -> &R {
604	&self.reader
605	}
606
607	/// Gets a mutable reference to the underlying reader.
608	pub fn get_mut(&mut self) -> &mut R {
609	&mut self.reader
610	}
611
612	/// Gets the current byte position in the input data.
613	///
614	/// Useful when debugging errors.
615	pub fn buffer_position(&self) -> usize {
616	// when internal state is OpenedTag, we have actually read until '<',
617	// which we don't want to show
618	if let ParseState::OpenedTag = self.parser.state {
619	self.parser.offset - `1`
620	} else {
621	self.parser.offset
622	}
623	}
624
625	/// Get the decoder, used to decode bytes, read by this reader, to the strings.
626	///
627	/// If `encoding` feature is enabled, the used encoding may change after
628	/// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
629	///
630	/// If `encoding` feature is enabled and no encoding is specified in declaration,
631	/// defaults to UTF-8.
632	#[inline]
633	pub fn decoder(&self) -> Decoder {
634	self.parser.decoder()
635	}
636	}
637
/// Private sync reading methods
///
/// Each method is a thin wrapper that expands the macro of the same name with
/// `self.reader` as the data source and without the `await` argument.
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// `buf` is declared `mut` because the expanded loop re-assigns it whenever
    /// `read_until_open` hands the buffer back without producing an event.
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
    }

    /// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
    ///
    /// Returns inner `Ok` if the loop should be broken and an event returned.
    /// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular.
    ///
    /// The inner `Err` is not a failure: it means that the reader was already at
    /// a `<` symbol, so no event was produced and parsing continues with the markup.
    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<std::result::Result<Event<'i>, B>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_open!(self, buf, self.reader, read_event_impl)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    ///
    /// Moves the reader to the `ClosedTag` state and returns the event that
    /// corresponds to the markup that was read, or `Eof` if the input has ended.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
670
671	////////////////////////////////////////////////////////////////////////////////////////////////////
672
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait: the generic one reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]` and, instead of copying data, returns
/// data borrowed from `Self`. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;

    /// Read input until `byte` is found or end of input is reached.
    ///
    /// Returns a slice of data read up to `byte`, which is not included in the result.
    /// If input (`Self`) is exhausted, returns `None`.
    ///
    /// # Example
    ///
    /// ```ignore
    /// let mut position = 0;
    /// let mut input = b"abc*def".as_ref();
    /// //                    ^= 4
    ///
    /// assert_eq!(
    ///     input.read_bytes_until(b'*', (), &mut position).unwrap(),
    ///     Some(b"abc".as_ref())
    /// );
    /// assert_eq!(position, 4); // position after the symbol matched
    /// ```
    ///
    /// # Parameters
    /// - `byte`: Byte for search
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bytes_until(
        &mut self,
        byte: u8,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<&'r [u8]>>;

    /// Read input until comment, CDATA or DOCTYPE declaration is finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the kind of the element together with a slice of data read up to
    /// the end of the comment, CDATA or DOCTYPE declaration (`>`), which is not
    /// included in the result.
    ///
    /// If input (`Self`) is exhausted and nothing was read, returns `None`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut usize,
    ) -> Result<Option<(BangType, &'r [u8])>>;

    /// Read input until XML element is closed by approaching a `>` symbol.
    /// Returns `Some(buffer)` that contains the data between `<` and `>` or
    /// `None` if end-of-input was reached and nothing was read.
    ///
    /// Derived from `read_until`, but modified to handle XML attributes
    /// using a minimal state machine.
    ///
    /// Attribute values are [defined] as follows:
    /// ```plain
    /// AttValue := '"' (([^<&"]) | Reference)* '"'
    ///           | "'" (([^<&']) | Reference)* "'"
    /// ```
    /// (`Reference` is something like `&quot;`, but we don't care about
    /// escaped characters at this level)
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue
    /// [events]: crate::events::Event
    fn read_element(&mut self, buf: B, position: &mut usize) -> Result<Option<&'r [u8]>>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;

    /// Consume and discard one character if it matches the given byte. Return
    /// `true` if it matched.
    ///
    /// # Parameters
    /// - `position`: Will be increased by 1 if byte is matched
    fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> Result<Option<u8>>;
}
792
793	/// Possible elements started with `<!`
794	#[derive(Debug, PartialEq)]
795	enum BangType {
796	/// <![CDATA[...]]>
797	CData,
798	/// <!--...-->
799	Comment,
800	/// <!DOCTYPE...>
801	DocType,
802	}
803	impl BangType {
804	#[inline(always)]
805	fn new(byte: Option<u8>) -> Result<Self> {
806	Ok(match byte {
807	Some(b'[') => Self::CData,
808	Some(b'-') => Self::Comment,
809	Some(b'D') \| Some(b'd') => Self::DocType,
810	Some(b) => return Err(Error::UnexpectedBang(b)),
811	None => return Err(Error::UnexpectedEof("Bang".to_string())),
812	})
813	}
814
815	/// If element is finished, returns its content up to `>` symbol and
816	/// an index of this symbol, otherwise returns `None`
817	///
818	/// # Parameters
819	/// - `buf`: buffer with data consumed on previous iterations
820	/// - `chunk`: data read on current iteration and not yet consumed from reader
821	#[inline(always)]
822	fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
823	for i in memchr::memchr_iter(b'>', chunk) {
824	match self {
825	// Need to read at least 6 symbols (`!---->`) for properly finished comment
826	// <!----> - XML comment
827	// 012345 - i
828	Self::Comment if buf.len() + i > `4` => {
829	if chunk[..i].ends_with(b"--") {
830	// We cannot strip last `--` from the buffer because we need it in case of
831	// check_comments enabled option. XML standard requires that comment
832	// will not end with `--->` sequence because this is a special case of
833	// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
834	return Some((&chunk[..i], i + `1`)); // +1 for `>`
835	}
836	// End sequence `-\|->` was splitted at \|
837	// buf --/ \-- chunk
838	if i == `1` && buf.ends_with(b"-") && chunk[`0`] == b'-' {
839	return Some((&chunk[..i], i + `1`)); // +1 for `>`
840	}
841	// End sequence `--\|>` was splitted at \|
842	// buf --/ \-- chunk
843	if i == `0` && buf.ends_with(b"--") {
844	return Some((&[], i + `1`)); // +1 for `>`
845	}
846	}
847	Self::Comment => {}
848	Self::CData => {
849	if chunk[..i].ends_with(b"]]") {
850	return Some((&chunk[..i], i + `1`)); // +1 for `>`
851	}
852	// End sequence `]\|]>` was splitted at \|
853	// buf --/ \-- chunk
854	if i == `1` && buf.ends_with(b"]") && chunk[`0`] == b']' {
855	return Some((&chunk[..i], i + `1`)); // +1 for `>`
856	}
857	// End sequence `]]\|>` was splitted at \|
858	// buf --/ \-- chunk
859	if i == `0` && buf.ends_with(b"]]") {
860	return Some((&[], i + `1`)); // +1 for `>`
861	}
862	}
863	Self::DocType => {
864	let content = &chunk[..i];
865	let balance = memchr::memchr2_iter(b'<', b'>', content)
866	.map(\|p\| if content[p] == b'<' { `1i32` } else { `-1` })
867	.sum::<i32>();
868	if balance == `0` {
869	return Some((content, i + `1`)); // +1 for `>`
870	}
871	}
872	}
873	}
874	None
875	}
876	#[inline]
877	fn to_err(&self) -> Error {
878	let bang_str = match self {
879	Self::CData => "CData",
880	Self::Comment => "Comment",
881	Self::DocType => "DOCTYPE",
882	};
883	Error::UnexpectedEof(bang_str.to_string())
884	}
885	}
886
887	/// State machine for the [`XmlSource::read_element`]
888	#[derive(Clone, Copy)]
889	enum ReadElementState {
890	/// The initial state (inside element, but outside of attribute value)
891	Elem,
892	/// Inside a single-quoted attribute value
893	SingleQ,
894	/// Inside a double-quoted attribute value
895	DoubleQ,
896	}
897	impl ReadElementState {
898	/// Changes state by analyzing part of input.
899	/// Returns a tuple with part of chunk up to element closing symbol `>`
900	/// and a position after that symbol or `None` if such symbol was not found
901	#[inline(always)]
902	fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
903	for i: usize in memchr::memchr3_iter(needle1:b'>', needle2:b'`\'`', needle3:b'"', haystack:chunk) {
904	self = match* (*self, chunk[i]) {
905	// only allowed to match `>` while we are in state `Elem`
906	(Self::Elem, b'>') => return Some((&chunk[..i], i + `1`)),
907	(Self::Elem, b'`\'`') => Self::SingleQ,
908	(Self::Elem, b'`\"`') => Self::DoubleQ,
909
910	// the only end_byte that gets us out if the same character
911	(Self::SingleQ, b'`\'`') \| (Self::DoubleQ, b'"') => Self::Elem,
912
913	// all other bytes: no state change
914	_ => *self,
915	};
916	}
917	None
918	}
919	}
920
/// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab)
///
/// Only these four ASCII bytes are recognized: space (`0x20`), `\r` (`0x0D`),
/// `\n` (`0x0A`) and `\t` (`0x09`). Other bytes, including vertical tab and
/// form feed, are not considered whitespace.
#[inline]
pub(crate) const fn is_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\r' | b'\n' | b'\t')
}
926
927	////////////////////////////////////////////////////////////////////////////////////////////////////
928
929	#[cfg(test)]
930	mod test {
931	/// Checks the internal implementation of the various reader methods
932	macro_rules! check {
933	(
934	#[$test:meta]
935	$read_event:ident,
936	$read_until_close:ident,
937	// constructor of the XML source on which internal functions will be called
938	$source:path,
939	// constructor of the buffer to which read data will stored
940	$buf:expr
941	$(, $async:ident, $await:ident)?
942	) => {
943	mod read_bytes_until {
944	use super::*;
945	// Use Bytes for printing bytes as strings for ASCII range
946	use crate::utils::Bytes;
947	use pretty_assertions::assert_eq;
948
949	/// Checks that search in the empty buffer returns `None`
950	#[$test]
951	$($async)? fn empty() {
952	let buf = $buf;
953	let mut position = `0`;
954	let mut input = b"".as_ref();
955	// ^= 0
956
957	assert_eq!(
958	$source(&mut input)
959	.read_bytes_until(b'', buf, &mut* position)
960	$(.$await)?
961	.unwrap()
962	.map(Bytes),
963	None
964	);
965	assert_eq!(position, `0`);
966	}
967
968	/// Checks that search in the buffer non-existent value returns entire buffer
969	/// as a result and set `position` to `len()`
970	#[$test]
971	$($async)? fn non_existent() {
972	let buf = $buf;
973	let mut position = `0`;
974	let mut input = b"abcdef".as_ref();
975	// ^= 6
976
977	assert_eq!(
978	$source(&mut input)
979	.read_bytes_until(b'', buf, &mut* position)
980	$(.$await)?
981	.unwrap()
982	.map(Bytes),
983	Some(Bytes(b"abcdef"))
984	);
985	assert_eq!(position, `6`);
986	}
987
988	/// Checks that search in the buffer an element that is located in the front of
989	/// buffer returns empty slice as a result and set `position` to one symbol
990	/// after match (`1`)
991	#[$test]
992	$($async)? fn at_the_start() {
993	let buf = $buf;
994	let mut position = `0`;
995	let mut input = b"*abcdef".as_ref();
996	// ^= 1
997
998	assert_eq!(
999	$source(&mut input)
1000	.read_bytes_until(b'', buf, &mut* position)
1001	$(.$await)?
1002	.unwrap()
1003	.map(Bytes),
1004	Some(Bytes(b""))
1005	);
1006	assert_eq!(position, `1`); // position after the symbol matched
1007	}
1008
1009	/// Checks that search in the buffer an element that is located in the middle of
1010	/// buffer returns slice before that symbol as a result and set `position` to one
1011	/// symbol after match
1012	#[$test]
1013	$($async)? fn inside() {
1014	let buf = $buf;
1015	let mut position = `0`;
1016	let mut input = b"abc*def".as_ref();
1017	// ^= 4
1018
1019	assert_eq!(
1020	$source(&mut input)
1021	.read_bytes_until(b'', buf, &mut* position)
1022	$(.$await)?
1023	.unwrap()
1024	.map(Bytes),
1025	Some(Bytes(b"abc"))
1026	);
1027	assert_eq!(position, `4`); // position after the symbol matched
1028	}
1029
1030	/// Checks that search in the buffer an element that is located in the end of
1031	/// buffer returns slice before that symbol as a result and set `position` to one
1032	/// symbol after match (`len()`)
1033	#[$test]
1034	$($async)? fn in_the_end() {
1035	let buf = $buf;
1036	let mut position = `0`;
1037	let mut input = b"abcdef*".as_ref();
1038	// ^= 7
1039
1040	assert_eq!(
1041	$source(&mut input)
1042	.read_bytes_until(b'', buf, &mut* position)
1043	$(.$await)?
1044	.unwrap()
1045	.map(Bytes),
1046	Some(Bytes(b"abcdef"))
1047	);
1048	assert_eq!(position, `7`); // position after the symbol matched
1049	}
1050	}
1051
1052	mod read_bang_element {
1053	use super::*;
1054
1055	/// Checks that reading CDATA content works correctly
1056	mod cdata {
1057	use super::*;
1058	use crate::errors::Error;
1059	use crate::reader::BangType;
1060	use crate::utils::Bytes;
1061	use pretty_assertions::assert_eq;
1062
1063	/// Checks that if input begins like CDATA element, but CDATA start sequence
1064	/// is not finished, parsing ends with an error
1065	#[$test]
1066	#[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1067	$($async)? fn not_properly_start() {
1068	let buf = $buf;
1069	let mut position = `0`;
1070	let mut input = b"![]]>other content".as_ref();
1071	// ^= 0
1072
1073	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1074	Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1075	x => assert!(
1076	`false`,
1077	r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1078	x
1079	),
1080	}
1081	assert_eq!(position, `0`);
1082	}
1083
1084	/// Checks that if CDATA startup sequence was matched, but an end sequence
1085	/// is not found, parsing ends with an error
1086	#[$test]
1087	$($async)? fn not_closed() {
1088	let buf = $buf;
1089	let mut position = `0`;
1090	let mut input = b"![CDATA[other content".as_ref();
1091	// ^= 0
1092
1093	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1094	Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1095	x => assert!(
1096	`false`,
1097	r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1098	x
1099	),
1100	}
1101	assert_eq!(position, `0`);
1102	}
1103
1104	/// Checks that CDATA element without content inside parsed successfully
1105	#[$test]
1106	$($async)? fn empty() {
1107	let buf = $buf;
1108	let mut position = `0`;
1109	let mut input = b"![CDATA[]]>other content".as_ref();
1110	// ^= 11
1111
1112	assert_eq!(
1113	$source(&mut input)
1114	.read_bang_element(buf, &mut position)
1115	$(.$await)?
1116	.unwrap()
1117	.map(\|(ty, data)\| (ty, Bytes(data))),
1118	Some((BangType::CData, Bytes(b"![CDATA[]]")))
1119	);
1120	assert_eq!(position, `11`);
1121	}
1122
1123	/// Checks that CDATA element with content parsed successfully.
1124	/// Additionally checks that sequences inside CDATA that may look like
1125	/// a CDATA end sequence do not interrupt CDATA parsing
1126	#[$test]
1127	$($async)? fn with_content() {
1128	let buf = $buf;
1129	let mut position = `0`;
1130	let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1131	// ^= 28
1132
1133	assert_eq!(
1134	$source(&mut input)
1135	.read_bang_element(buf, &mut position)
1136	$(.$await)?
1137	.unwrap()
1138	.map(\|(ty, data)\| (ty, Bytes(data))),
1139	Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]")))
1140	);
1141	assert_eq!(position, `28`);
1142	}
1143	}
1144
1145	/// Checks that reading XML comments works correctly. According to the [specification],
1146	/// comment data can contain any sequence except `--`:
1147	///
1148	/// ```peg
1149	/// comment = '<--' (!'--' char) '-->';*
1150	/// char = [#x1-#x2C]
1151	/// / [#x2E-#xD7FF]
1152	/// / [#xE000-#xFFFD]
1153	/// / [#x10000-#x10FFFF]
1154	/// ```
1155	///
1156	/// The presence of this limitation, however, is simply a poorly designed specification
1157	/// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1158	/// presence of these sequences by default. This tests allow such content.
1159	///
1160	/// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1161	mod comment {
1162	use super::*;
1163	use crate::errors::Error;
1164	use crate::reader::BangType;
1165	use crate::utils::Bytes;
1166	use pretty_assertions::assert_eq;
1167
1168	#[$test]
1169	#[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1170	$($async)? fn not_properly_start() {
1171	let buf = $buf;
1172	let mut position = `0`;
1173	let mut input = b"!- -->other content".as_ref();
1174	// ^= 0
1175
1176	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1177	Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1178	x => assert!(
1179	`false`,
1180	r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1181	x
1182	),
1183	}
1184	assert_eq!(position, `0`);
1185	}
1186
1187	#[$test]
1188	$($async)? fn not_properly_end() {
1189	let buf = $buf;
1190	let mut position = `0`;
1191	let mut input = b"!->other content".as_ref();
1192	// ^= 0
1193
1194	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1195	Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1196	x => assert!(
1197	`false`,
1198	r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1199	x
1200	),
1201	}
1202	assert_eq!(position, `0`);
1203	}
1204
1205	#[$test]
1206	$($async)? fn not_closed1() {
1207	let buf = $buf;
1208	let mut position = `0`;
1209	let mut input = b"!--other content".as_ref();
1210	// ^= 0
1211
1212	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1213	Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1214	x => assert!(
1215	`false`,
1216	r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1217	x
1218	),
1219	}
1220	assert_eq!(position, `0`);
1221	}
1222
1223	#[$test]
1224	$($async)? fn not_closed2() {
1225	let buf = $buf;
1226	let mut position = `0`;
1227	let mut input = b"!-->other content".as_ref();
1228	// ^= 0
1229
1230	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1231	Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1232	x => assert!(
1233	`false`,
1234	r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1235	x
1236	),
1237	}
1238	assert_eq!(position, `0`);
1239	}
1240
1241	#[$test]
1242	$($async)? fn not_closed3() {
1243	let buf = $buf;
1244	let mut position = `0`;
1245	let mut input = b"!--->other content".as_ref();
1246	// ^= 0
1247
1248	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1249	Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1250	x => assert!(
1251	`false`,
1252	r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1253	x
1254	),
1255	}
1256	assert_eq!(position, `0`);
1257	}
1258
1259	#[$test]
1260	$($async)? fn empty() {
1261	let buf = $buf;
1262	let mut position = `0`;
1263	let mut input = b"!---->other content".as_ref();
1264	// ^= 6
1265
1266	assert_eq!(
1267	$source(&mut input)
1268	.read_bang_element(buf, &mut position)
1269	$(.$await)?
1270	.unwrap()
1271	.map(\|(ty, data)\| (ty, Bytes(data))),
1272	Some((BangType::Comment, Bytes(b"!----")))
1273	);
1274	assert_eq!(position, `6`);
1275	}
1276
1277	#[$test]
1278	$($async)? fn with_content() {
1279	let buf = $buf;
1280	let mut position = `0`;
1281	let mut input = b"!--->comment<--->other content".as_ref();
1282	// ^= 17
1283
1284	assert_eq!(
1285	$source(&mut input)
1286	.read_bang_element(buf, &mut position)
1287	$(.$await)?
1288	.unwrap()
1289	.map(\|(ty, data)\| (ty, Bytes(data))),
1290	Some((BangType::Comment, Bytes(b"!--->comment<---")))
1291	);
1292	assert_eq!(position, `17`);
1293	}
1294	}
1295
1296	/// Checks that reading DOCTYPE definition works correctly
1297	mod doctype {
1298	use super::*;
1299
1300	mod uppercase {
1301	use super::*;
1302	use crate::errors::Error;
1303	use crate::reader::BangType;
1304	use crate::utils::Bytes;
1305	use pretty_assertions::assert_eq;
1306
1307	#[$test]
1308	$($async)? fn not_properly_start() {
1309	let buf = $buf;
1310	let mut position = `0`;
1311	let mut input = b"!D other content".as_ref();
1312	// ^= 0
1313
1314	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1315	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1316	x => assert!(
1317	`false`,
1318	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1319	x
1320	),
1321	}
1322	assert_eq!(position, `0`);
1323	}
1324
1325	#[$test]
1326	$($async)? fn without_space() {
1327	let buf = $buf;
1328	let mut position = `0`;
1329	let mut input = b"!DOCTYPEother content".as_ref();
1330	// ^= 0
1331
1332	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1333	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1334	x => assert!(
1335	`false`,
1336	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1337	x
1338	),
1339	}
1340	assert_eq!(position, `0`);
1341	}
1342
1343	#[$test]
1344	$($async)? fn empty() {
1345	let buf = $buf;
1346	let mut position = `0`;
1347	let mut input = b"!DOCTYPE>other content".as_ref();
1348	// ^= 9
1349
1350	assert_eq!(
1351	$source(&mut input)
1352	.read_bang_element(buf, &mut position)
1353	$(.$await)?
1354	.unwrap()
1355	.map(\|(ty, data)\| (ty, Bytes(data))),
1356	Some((BangType::DocType, Bytes(b"!DOCTYPE")))
1357	);
1358	assert_eq!(position, `9`);
1359	}
1360
1361	#[$test]
1362	$($async)? fn not_closed() {
1363	let buf = $buf;
1364	let mut position = `0`;
1365	let mut input = b"!DOCTYPE other content".as_ref();
1366	// ^= 0
1367
1368	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1369	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1370	x => assert!(
1371	`false`,
1372	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1373	x
1374	),
1375	}
1376	assert_eq!(position, `0`);
1377	}
1378	}
1379
1380	mod lowercase {
1381	use super::*;
1382	use crate::errors::Error;
1383	use crate::reader::BangType;
1384	use crate::utils::Bytes;
1385	use pretty_assertions::assert_eq;
1386
1387	#[$test]
1388	$($async)? fn not_properly_start() {
1389	let buf = $buf;
1390	let mut position = `0`;
1391	let mut input = b"!d other content".as_ref();
1392	// ^= 0
1393
1394	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1395	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1396	x => assert!(
1397	`false`,
1398	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1399	x
1400	),
1401	}
1402	assert_eq!(position, `0`);
1403	}
1404
1405	#[$test]
1406	$($async)? fn without_space() {
1407	let buf = $buf;
1408	let mut position = `0`;
1409	let mut input = b"!doctypeother content".as_ref();
1410	// ^= 0
1411
1412	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1413	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1414	x => assert!(
1415	`false`,
1416	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1417	x
1418	),
1419	}
1420	assert_eq!(position, `0`);
1421	}
1422
1423	#[$test]
1424	$($async)? fn empty() {
1425	let buf = $buf;
1426	let mut position = `0`;
1427	let mut input = b"!doctype>other content".as_ref();
1428	// ^= 9
1429
1430	assert_eq!(
1431	$source(&mut input)
1432	.read_bang_element(buf, &mut position)
1433	$(.$await)?
1434	.unwrap()
1435	.map(\|(ty, data)\| (ty, Bytes(data))),
1436	Some((BangType::DocType, Bytes(b"!doctype")))
1437	);
1438	assert_eq!(position, `9`);
1439	}
1440
1441	#[$test]
1442	$($async)? fn not_closed() {
1443	let buf = $buf;
1444	let mut position = `0`;
1445	let mut input = b"!doctype other content".as_ref();
1446	// ^= 0
1447
1448	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1449	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1450	x => assert!(
1451	`false`,
1452	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1453	x
1454	),
1455	}
1456	assert_eq!(position, `0`);
1457	}
1458	}
1459	}
1460	}
1461
1462	mod read_element {
1463	use super::*;
1464	use crate::utils::Bytes;
1465	use pretty_assertions::assert_eq;
1466
1467	/// Checks that nothing was read from empty buffer
1468	#[$test]
1469	$($async)? fn empty() {
1470	let buf = $buf;
1471	let mut position = `0`;
1472	let mut input = b"".as_ref();
1473	// ^= 0
1474
1475	assert_eq!(
1476	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1477	None
1478	);
1479	assert_eq!(position, `0`);
1480	}
1481
1482	mod open {
1483	use super::*;
1484	use crate::utils::Bytes;
1485	use pretty_assertions::assert_eq;
1486
1487	#[$test]
1488	$($async)? fn empty_tag() {
1489	let buf = $buf;
1490	let mut position = `0`;
1491	let mut input = b">".as_ref();
1492	// ^= 1
1493
1494	assert_eq!(
1495	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1496	Some(Bytes(b""))
1497	);
1498	assert_eq!(position, `1`);
1499	}
1500
1501	#[$test]
1502	$($async)? fn normal() {
1503	let buf = $buf;
1504	let mut position = `0`;
1505	let mut input = b"tag>".as_ref();
1506	// ^= 4
1507
1508	assert_eq!(
1509	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1510	Some(Bytes(b"tag"))
1511	);
1512	assert_eq!(position, `4`);
1513	}
1514
1515	#[$test]
1516	$($async)? fn empty_ns_empty_tag() {
1517	let buf = $buf;
1518	let mut position = `0`;
1519	let mut input = b":>".as_ref();
1520	// ^= 2
1521
1522	assert_eq!(
1523	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1524	Some(Bytes(b":"))
1525	);
1526	assert_eq!(position, `2`);
1527	}
1528
1529	#[$test]
1530	$($async)? fn empty_ns() {
1531	let buf = $buf;
1532	let mut position = `0`;
1533	let mut input = b":tag>".as_ref();
1534	// ^= 5
1535
1536	assert_eq!(
1537	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1538	Some(Bytes(b":tag"))
1539	);
1540	assert_eq!(position, `5`);
1541	}
1542
1543	#[$test]
1544	$($async)? fn with_attributes() {
1545	let buf = $buf;
1546	let mut position = `0`;
1547	let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1548	// ^= 38
1549
1550	assert_eq!(
1551	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1552	Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#))
1553	);
1554	assert_eq!(position, `38`);
1555	}
1556	}
1557
1558	mod self_closed {
1559	use super::*;
1560	use crate::utils::Bytes;
1561	use pretty_assertions::assert_eq;
1562
1563	#[$test]
1564	$($async)? fn empty_tag() {
1565	let buf = $buf;
1566	let mut position = `0`;
1567	let mut input = b"/>".as_ref();
1568	// ^= 2
1569
1570	assert_eq!(
1571	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1572	Some(Bytes(b"/"))
1573	);
1574	assert_eq!(position, `2`);
1575	}
1576
1577	#[$test]
1578	$($async)? fn normal() {
1579	let buf = $buf;
1580	let mut position = `0`;
1581	let mut input = b"tag/>".as_ref();
1582	// ^= 5
1583
1584	assert_eq!(
1585	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1586	Some(Bytes(b"tag/"))
1587	);
1588	assert_eq!(position, `5`);
1589	}
1590
1591	#[$test]
1592	$($async)? fn empty_ns_empty_tag() {
1593	let buf = $buf;
1594	let mut position = `0`;
1595	let mut input = b":/>".as_ref();
1596	// ^= 3
1597
1598	assert_eq!(
1599	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1600	Some(Bytes(b":/"))
1601	);
1602	assert_eq!(position, `3`);
1603	}
1604
1605	#[$test]
1606	$($async)? fn empty_ns() {
1607	let buf = $buf;
1608	let mut position = `0`;
1609	let mut input = b":tag/>".as_ref();
1610	// ^= 6
1611
1612	assert_eq!(
1613	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1614	Some(Bytes(b":tag/"))
1615	);
1616	assert_eq!(position, `6`);
1617	}
1618
1619	#[$test]
1620	$($async)? fn with_attributes() {
1621	let buf = $buf;
1622	let mut position = `0`;
1623	let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1624	// ^= 41
1625
1626	assert_eq!(
1627	$source(&mut input).read_element(buf, &mut position) $(.$await)? .unwrap().map(Bytes),
1628	Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#))
1629	);
1630	assert_eq!(position, `41`);
1631	}
1632	}
1633	}
1634
1635	mod issue_344 {
1636	use crate::errors::Error;
1637	use crate::reader::Reader;
1638
1639	#[$test]
1640	$($async)? fn cdata() {
1641	let mut reader = Reader::from_str("![]]>");
1642
1643	match reader.$read_until_close($buf) $(.$await)? {
1644	Err(Error::UnexpectedEof(s)) if s == "CData" => {}
1645	x => assert!(
1646	`false`,
1647	r#"Expected `UnexpectedEof("CData")`, but result is: {:?}"#,
1648	x
1649	),
1650	}
1651	}
1652
1653	#[$test]
1654	$($async)? fn comment() {
1655	let mut reader = Reader::from_str("!- -->");
1656
1657	match reader.$read_until_close($buf) $(.$await)? {
1658	Err(Error::UnexpectedEof(s)) if s == "Comment" => {}
1659	x => assert!(
1660	`false`,
1661	r#"Expected `UnexpectedEof("Comment")`, but result is: {:?}"#,
1662	x
1663	),
1664	}
1665	}
1666
1667	#[$test]
1668	$($async)? fn doctype_uppercase() {
1669	let mut reader = Reader::from_str("!D>");
1670
1671	match reader.$read_until_close($buf) $(.$await)? {
1672	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1673	x => assert!(
1674	`false`,
1675	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1676	x
1677	),
1678	}
1679	}
1680
1681	#[$test]
1682	$($async)? fn doctype_lowercase() {
1683	let mut reader = Reader::from_str("!d>");
1684
1685	match reader.$read_until_close($buf) $(.$await)? {
1686	Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {}
1687	x => assert!(
1688	`false`,
1689	r#"Expected `UnexpectedEof("DOCTYPE")`, but result is: {:?}"#,
1690	x
1691	),
1692	}
1693	}
1694	}
1695
1696	/// Ensures, that no empty `Text` events are generated
1697	mod $read_event {
1698	use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
1699	use crate::reader::Reader;
1700	use pretty_assertions::assert_eq;
1701
1702	/// When `encoding` feature is enabled, encoding should be detected
1703	/// from BOM (UTF-8) and BOM should be stripped.
1704	///
1705	/// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1706	/// character should be stripped for consistency
1707	#[$test]
1708	$($async)? fn bom_from_reader() {
1709	let mut reader = Reader::from_reader("`\u{feff}\u{feff}`".as_bytes());
1710
1711	assert_eq!(
1712	reader.$read_event($buf) $(.$await)? .unwrap(),
1713	Event::Text(BytesText::from_escaped("`\u{feff}`"))
1714	);
1715
1716	assert_eq!(
1717	reader.$read_event($buf) $(.$await)? .unwrap(),
1718	Event::Eof
1719	);
1720	}
1721
1722	/// When parsing from &str, encoding is fixed (UTF-8), so
1723	/// - when `encoding` feature is disabled, the behavior the
1724	/// same as in `bom_from_reader` text
1725	/// - when `encoding` feature is enabled, the behavior should
1726	/// stay consistent, so the first BOM character is stripped
1727	#[$test]
1728	$($async)? fn bom_from_str() {
1729	let mut reader = Reader::from_str("`\u{feff}\u{feff}`");
1730
1731	assert_eq!(
1732	reader.$read_event($buf) $(.$await)? .unwrap(),
1733	Event::Text(BytesText::from_escaped("`\u{feff}`"))
1734	);
1735
1736	assert_eq!(
1737	reader.$read_event($buf) $(.$await)? .unwrap(),
1738	Event::Eof
1739	);
1740	}
1741
1742	#[$test]
1743	$($async)? fn declaration() {
1744	let mut reader = Reader::from_str("<?xml ?>");
1745
1746	assert_eq!(
1747	reader.$read_event($buf) $(.$await)? .unwrap(),
1748	Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", `3`)))
1749	);
1750	}
1751
1752	#[$test]
1753	$($async)? fn doctype() {
1754	let mut reader = Reader::from_str("<!DOCTYPE x>");
1755
1756	assert_eq!(
1757	reader.$read_event($buf) $(.$await)? .unwrap(),
1758	Event::DocType(BytesText::from_escaped("x"))
1759	);
1760	}
1761
1762	#[$test]
1763	$($async)? fn processing_instruction() {
1764	let mut reader = Reader::from_str("<?xml-stylesheet?>");
1765
1766	assert_eq!(
1767	reader.$read_event($buf) $(.$await)? .unwrap(),
1768	Event::PI(BytesText::from_escaped("xml-stylesheet"))
1769	);
1770	}
1771
1772	#[$test]
1773	$($async)? fn start() {
1774	let mut reader = Reader::from_str("<tag>");
1775
1776	assert_eq!(
1777	reader.$read_event($buf) $(.$await)? .unwrap(),
1778	Event::Start(BytesStart::new("tag"))
1779	);
1780	}
1781
1782	#[$test]
1783	$($async)? fn end() {
1784	let mut reader = Reader::from_str("</tag>");
1785	// Because we expect invalid XML, do not check that
1786	// the end name paired with the start name
1787	reader.check_end_names(`false`);
1788
1789	assert_eq!(
1790	reader.$read_event($buf) $(.$await)? .unwrap(),
1791	Event::End(BytesEnd::new("tag"))
1792	);
1793	}
1794
1795	#[$test]
1796	$($async)? fn empty() {
1797	let mut reader = Reader::from_str("<tag/>");
1798
1799	assert_eq!(
1800	reader.$read_event($buf) $(.$await)? .unwrap(),
1801	Event::Empty(BytesStart::new("tag"))
1802	);
1803	}
1804
1805	#[$test]
1806	$($async)? fn text() {
1807	let mut reader = Reader::from_str("text");
1808
1809	assert_eq!(
1810	reader.$read_event($buf) $(.$await)? .unwrap(),
1811	Event::Text(BytesText::from_escaped("text"))
1812	);
1813	}
1814
1815	#[$test]
1816	$($async)? fn cdata() {
1817	let mut reader = Reader::from_str("<![CDATA[]]>");
1818
1819	assert_eq!(
1820	reader.$read_event($buf) $(.$await)? .unwrap(),
1821	Event::CData(BytesCData::new(""))
1822	);
1823	}
1824
1825	#[$test]
1826	$($async)? fn comment() {
1827	let mut reader = Reader::from_str("<!---->");
1828
1829	assert_eq!(
1830	reader.$read_event($buf) $(.$await)? .unwrap(),
1831	Event::Comment(BytesText::from_escaped(""))
1832	);
1833	}
1834
1835	#[$test]
1836	$($async)? fn eof() {
1837	let mut reader = Reader::from_str("");
1838
1839	assert_eq!(
1840	reader.$read_event($buf) $(.$await)? .unwrap(),
1841	Event::Eof
1842	);
1843	}
1844	}
1845	};
1846	}
1847
/// Tests for <https://github.com/tafia/quick-xml/issues/469>
///
/// Every test wraps the input into a buffered reader whose capacity is chosen
/// so that the closing sequence of the construct (`?>`, `/>`, `]]>`, `-->`)
/// does not fit into the buffer together with the preceding data. The
/// `^^^` marks under each input show the part that fits.
///
/// Parameters:
/// - `#[$test]`: attribute that marks a function as a test
/// - `$read_event`: name of the reader method that returns the next event
/// - `$BufReader`: buffered reader type; it must provide
///   `with_capacity(size, reader)`
/// - `$async`, `$await`: optional tokens inserted before `fn` and after the
///   `$read_event` call respectively
macro_rules! small_buffers {
    (
        #[$test:meta]
        $read_event:ident: $BufReader:ty
        $(, $async:ident, $await:ident)?
    ) => {
        mod small_buffers {
            use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
            use crate::reader::Reader;
            use pretty_assertions::assert_eq;

            #[$test]
            $($async)? fn decl() {
                let xml = "<?xml ?>";
                //         ^^^^^^^ data that fit into buffer
                let size = xml.match_indices("?>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            #[$test]
            $($async)? fn pi() {
                let xml = "<?pi?>";
                //         ^^^^^ data that fit into buffer
                let size = xml.match_indices("?>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::PI(BytesText::new("pi"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            #[$test]
            $($async)? fn empty() {
                let xml = "<empty/>";
                //         ^^^^^^^ data that fit into buffer
                let size = xml.match_indices("/>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Empty(BytesStart::new("empty"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// The buffer ends after the first `]` of the `]]>` sequence
            #[$test]
            $($async)? fn cdata1() {
                let xml = "<![CDATA[cdata]]>";
                //         ^^^^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("]]>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::CData(BytesCData::new("cdata"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// The buffer ends after the second `]` of the `]]>` sequence
            #[$test]
            $($async)? fn cdata2() {
                let xml = "<![CDATA[cdata]]>";
                //         ^^^^^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("]]>").next().unwrap().0 + 2;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::CData(BytesCData::new("cdata"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// The buffer ends after the first `-` of the `-->` sequence
            #[$test]
            $($async)? fn comment1() {
                let xml = "<!--comment-->";
                //         ^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("-->").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Comment(BytesText::new("comment"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// The buffer ends after the second `-` of the `-->` sequence
            #[$test]
            $($async)? fn comment2() {
                let xml = "<!--comment-->";
                //         ^^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("-->").next().unwrap().0 + 2;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Comment(BytesText::new("comment"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }
        }
    };
}
1995
1996	// Export macros for the child modules:
1997	// - buffered_reader
1998	// - slice_reader
1999	pub(super) use check;
2000	pub(super) use small_buffers;
2001	}
2002