mod.rs source code [crates/quick_xml/src/reader/mod.rs]

1	//! Contains high-level interface for a pull-based XML parser.
2
3	#[cfg(feature = "encoding")]
4	use encoding_rs::Encoding;
5	use std::io;
6	use std::ops::Range;
7
8	use crate::encoding::Decoder;
9	use crate::errors::{Error, SyntaxError};
10	use crate::events::Event;
11	use crate::parser::{ElementParser, Parser, PiParser};
12	use crate::reader::state::ReaderState;
13
/// A struct that holds a parser configuration.
///
/// The current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by modifying the properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation will
    /// be performed that supports both these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a `Start` and an `End` event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}

impl Config {
    /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`trim_text_start`]: Self::trim_text_start
    /// [`trim_text_end`]: Self::trim_text_end
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    #[inline]
    pub fn trim_text(&mut self, trim: bool) {
        self.trim_text_start = trim;
        self.trim_text_end = trim;
    }

    /// Turn on or off all checks for well-formedness. Currently these are the
    /// following settings:
    /// - [`check_comments`](Self::check_comments)
    /// - [`check_end_names`](Self::check_end_names)
    #[inline]
    pub fn enable_all_checks(&mut self, enable: bool) {
        self.check_comments = enable;
        self.check_end_names = enable;
    }
}

impl Default for Config {
    /// Returns the configuration documented as the default on each field:
    /// only `check_end_names` and `trim_markup_names_in_closing_tags` are enabled.
    fn default() -> Self {
        Self {
            allow_unmatched_ends: false,
            check_comments: false,
            check_end_names: true,
            expand_empty_elements: false,
            trim_markup_names_in_closing_tags: true,
            trim_text_start: false,
            trim_text_end: false,
        }
    }
}
223
224	////////////////////////////////////////////////////////////////////////////////////////////////////
225
/// Shared body of the sync and async `read_event` implementations.
///
/// Arguments:
/// - `$self`: the reader whose `state` is driven;
/// - `$buf`: the buffer identifier; it is re-assigned when the source hands the
///   buffer back, so the caller must bind it mutably;
/// - `$reader`: expression yielding the underlying source;
/// - `$read_until_close`: name of the method used to parse markup after `<`;
/// - `$await` (optional): pass `await` to generate the async variant.
///
/// Evaluates to the `Result` with the next event. On any error except
/// `Error::IllFormed`, and on `Event::Eof`, the parser is moved to the `Done` state.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If encoding set explicitly, we do not need to detect it. For example,
                    // explicit UTF-8 set automatically if Reader was created using `from_str`.
                    // But we still need to remove BOM for consistency with no encoding
                    // feature enabled path
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideText => { // Go to InsideMarkup or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required
                            let event = $self.state.emit_text(bytes);
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::Io(e.into())),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we already consume the wrong data
            // and change the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
303
/// Read bytes up to the `>` and skip it. This method is expected to be called
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `InsideText` state.
///
/// When reading fails, `last_error_offset` is set to the position of the
/// opening `<` so the error is reported at the start of the markup.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::InsideText;

        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we anyway allow to parse them because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which user should call manually
            // - if we just look for `>` we will parse `</tag attr=">" >` as end tag
            //   `</tag attr=">` and text `" >` which probably no existing parser
            //   does. This is malformed XML, however it is tolerated by some parsers
            //   (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`, but offset was increased,
                // so return it back (-1 for `<`)
                $self.state.last_error_offset = start - 1;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            Err(e) => Err(Error::Io(e.into())),
        }
    }};
}
406
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events until the `End` event that closes the element named `$end` at
/// the current nesting level and evaluates to the range (`start..end`) of
/// positions between the point where reading started and the beginning of that
/// `End` event. Nested elements with the same name are tracked with a depth
/// counter. Returns early from the enclosing function with an error if reading
/// fails or `Eof` is reached first. The `trim_text_start` option is restored
/// on every exit path.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        // Because we take position after the event before the End event,
        // it is important that this position indicates beginning of the End event.
        // If between last event and the End event would be only spaces, then we
        // take position before the spaces, but spaces would be skipped without
        // generating event if `trim_text_start` is set to `true`. To prevent that
        // we temporarily disable start text trimming.
        //
        // We also cannot take position after getting End event, because if
        // `trim_markup_names_in_closing_tags` is set to `true` (which is the default),
        // we do not know the real size that the End event occupies in
        // the source and cannot correct the position after the End event.
        // So, we in any case should tweak parser configuration.
        let config = $self.config_mut();
        let trim = config.trim_text_start;
        config.trim_text_start = false;

        let start = $self.buffer_position();
        let mut depth = 0;
        loop {
            $clear
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(e);
                }

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        $self.config_mut().trim_text_start = trim;
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(Error::missed_end($end, $self.decoder()));
                }
                _ => (),
            }
        }
    }};
}
461
462	#[cfg(feature = "async-tokio")]
463	mod async_tokio;
464	mod buffered_reader;
465	mod ns_reader;
466	mod slice_reader;
467	mod state;
468
469	pub use ns_reader::NsReader;
470
/// Range of input in bytes that corresponds to some piece of XML.
pub type Span = Range<u64>;
473
474	////////////////////////////////////////////////////////////////////////////////////////////////////
475
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init -- "(no event)"\n --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText -- "#lt;false#gt;\n(no event)"\nText --> InsideMarkup
///   end
///   InsideMarkup -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty -- End --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Leaving this
    /// state does not emit an event: the reader only detects the encoding
    /// (or removes the UTF-8 BOM) and then moves to the `InsideText` state.
    /// The reader will never return to this state.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which reader searches the `<` symbol of a markup. All bytes before
    /// that symbol will be returned in the [`Event::Text`] event. After that
    /// the reader moves to the `InsideMarkup` state.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// The reader enters this state after emitting an [`Event::Start`] event for
    /// a self-closed tag. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// NOTE(review): the transition into this state is performed by the state
    /// helper that emits the `Start` event — confirm against `ReaderState`.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when `Eof` event generated or an error occurred.
    /// This is the last state, the reader stays in it forever.
    Done,
}
520
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It cannot be changed
    /// either by BOM or by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding regardless of how it was obtained.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        match self {
            Self::Implicit(e)
            | Self::Explicit(e)
            | Self::BomDetected(e)
            | Self::XmlDetected(e) => e,
        }
    }

    /// Returns `true` if the encoding may still be replaced by a more precise
    /// one (`Implicit` and `BomDetected`), and `false` if it is final
    /// (`Explicit` and `XmlDetected`).
    #[inline]
    const fn can_be_refined(&self) -> bool {
        match self {
            Self::Implicit(_) | Self::BomDetected(_) => true,
            Self::Explicit(_) | Self::XmlDetected(_) => false,
        }
    }
}
567
568	////////////////////////////////////////////////////////////////////////////////////////////////////
569
/// A direct stream to the underlying [`Reader`]'s reader which updates
/// [`Reader::buffer_position()`] when read from it.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The underlying source of bytes.
    inner: &'r mut R,
    /// Position counter that is advanced by every read made through the stream.
    offset: &'r mut u64,
}

impl<'r, R> BinaryStream<'r, R> {
    /// Returns current position in bytes in the original source.
    #[inline]
    pub const fn offset(&self) -> u64 {
        *self.offset
    }

    /// Gets a reference to the underlying reader.
    #[inline]
    pub const fn get_ref(&self) -> &R {
        self.inner
    }

    /// Gets a mutable reference to the underlying reader.
    ///
    /// Avoid reading from this reader because this will not update reader's position
    /// and will lead to incorrect positions of errors. Read from this stream instead.
    #[inline]
    pub fn get_mut(&mut self) -> &mut R {
        self.inner
    }
}

impl<'r, R> io::Read for BinaryStream<'r, R>
where
    R: io::Read,
{
    /// Reads from the underlying reader and advances the tracked offset by
    /// the number of bytes actually read.
    #[inline]
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let amt = self.inner.read(buf)?;
        *self.offset += amt as u64;
        Ok(amt)
    }
}

impl<'r, R> io::BufRead for BinaryStream<'r, R>
where
    R: io::BufRead,
{
    /// Delegates to the underlying reader; the offset is not changed because
    /// no bytes are consumed yet.
    #[inline]
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Consumes `amt` bytes in the underlying reader and advances the tracked
    /// offset by the same amount.
    #[inline]
    fn consume(&mut self, amt: usize) {
        self.inner.consume(amt);
        *self.offset += amt as u64;
    }
}
629
630	////////////////////////////////////////////////////////////////////////////////////////////////////
631
632	/// A low level encoding-agnostic XML event reader.
633	///
634	/// Consumes bytes and streams XML [`Event`]s.
635	///
636	/// This reader does not manage namespace declarations and not able to resolve
637	/// prefixes. If you want these features, use the [`NsReader`].
638	///
639	/// # Examples
640	///
641	/// ```
642	/// use quick_xml::events::Event;
643	/// use quick_xml::reader::Reader;
644	///
645	/// let xml = r#"<tag1 att1 = "test">
646	/// <tag2><!--Test comment-->Test</tag2>
647	/// <tag2>Test 2</tag2>
648	/// </tag1>"#;
649	/// let mut reader = Reader::from_str(xml);
650	/// reader.config_mut().trim_text(`true`);
651	///
652	/// let mut count = `0`;
653	/// let mut txt = Vec::new();
654	/// let mut buf = Vec::new();
655	///
656	/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
657	/// loop {
658	/// // NOTE: this is the generic case when we don't know about the input BufRead.
659	/// // when the input is a &str or a &[u8], we don't actually need to use another
660	/// // buffer, we could directly call `reader.read_event()`
661	/// match reader.read_event_into(&mut buf) {
662	/// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
663	/// // exits the loop when reaching end of file
664	/// Ok(Event::Eof) => break,
665	///
666	/// Ok(Event::Start(e)) => {
667	/// match e.name().as_ref() {
668	/// b"tag1" => println!("attributes values: {:?}",
669	/// e.attributes().map(\|a\| a.unwrap().value)
670	/// .collect::<Vec<_>>()),
671	/// b"tag2" => count += `1`,
672	/// _ => (),
673	/// }
674	/// }
675	/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
676	///
677	/// // There are several other `Event`s we do not consider here
678	/// _ => (),
679	/// }
680	/// // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
681	/// buf.clear();
682	/// }
683	/// ```
684	///
685	/// [`NsReader`]: crate::reader::NsReader
686	#[derive(Debug, Clone)]
687	pub struct Reader<R> {
688	/// Source of data for parse
689	reader: R,
690	/// Configuration and current parse state
691	state: ReaderState,
692	}
693
694	/// Builder methods
695	impl<R> Reader<R> {
696	/// Creates a `Reader` that reads from a given reader.
697	pub fn from_reader(reader: R) -> Self {
698	Self {
699	reader,
700	state: ReaderState::default(),
701	}
702	}
703
704	/// Returns reference to the parser configuration
705	pub const fn config(&self) -> &Config {
706	&self.state.config
707	}
708
709	/// Returns mutable reference to the parser configuration
710	pub fn config_mut(&mut self) -> &mut Config {
711	&mut self.state.config
712	}
713	}
714
715	/// Getters
716	impl<R> Reader<R> {
717	/// Consumes `Reader` returning the underlying reader
718	///
719	/// Can be used to compute line and column of a parsing error position
720	///
721	/// # Examples
722	///
723	/// ```
724	/// # use pretty_assertions::assert_eq;
725	/// use std::{str, io::Cursor};
726	/// use quick_xml::events::Event;
727	/// use quick_xml::reader::Reader;
728	///
729	/// let xml = r#"<tag1 att1 = "test">
730	/// <tag2><!--Test comment-->Test</tag2>
731	/// <tag3>Test 2</tag3>
732	/// </tag1>"#;
733	/// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
734	/// let mut buf = Vec::new();
735	///
736	/// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
737	/// // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
738	/// let end_pos = reader.buffer_position() as usize;
739	/// let mut cursor = reader.into_inner();
740	/// let s = String::from_utf8(cursor.into_inner()[`0`..end_pos].to_owned())
741	/// .expect("can't make a string");
742	/// let mut line = `1`;
743	/// let mut column = `0`;
744	/// for c in s.chars() {
745	/// if c == '`\n`' {
746	/// line += `1`;
747	/// column = `0`;
748	/// } else {
749	/// column += `1`;
750	/// }
751	/// }
752	/// (line, column)
753	/// }
754	///
755	/// loop {
756	/// match reader.read_event_into(&mut buf) {
757	/// Ok(Event::Start(ref e)) => match e.name().as_ref() {
758	/// b"tag1" \| b"tag2" => (),
759	/// tag => {
760	/// assert_eq!(b"tag3", tag);
761	/// assert_eq!((`3`, `22`), into_line_and_column(reader));
762	/// break;
763	/// }
764	/// },
765	/// Ok(Event::Eof) => unreachable!(),
766	/// _ => (),
767	/// }
768	/// buf.clear();
769	/// }
770	/// ```
771	pub fn into_inner(self) -> R {
772	self.reader
773	}
774
775	/// Gets a reference to the underlying reader.
776	pub const fn get_ref(&self) -> &R {
777	&self.reader
778	}
779
780	/// Gets a mutable reference to the underlying reader.
781	///
782	/// Avoid read from this reader because this will not update reader's position
783	/// and will lead to incorrect positions of errors. If you want to read, use
784	/// [`stream()`] instead.
785	///
786	/// [`stream()`]: Self::stream
787	pub fn get_mut(&mut self) -> &mut R {
788	&mut self.reader
789	}
790
791	/// Gets the current byte position in the input data.
792	pub const fn buffer_position(&self) -> u64 {
793	// when internal state is InsideMarkup, we have actually read until '<',
794	// which we don't want to show
795	if let ParseState::InsideMarkup = self.state.state {
796	self.state.offset - `1`
797	} else {
798	self.state.offset
799	}
800	}
801
802	/// Gets the last error byte position in the input data. If there is no errors
803	/// yet, returns `0`.
804	///
805	/// Unlike `buffer_position` it will point to the place where it is rational
806	/// to report error to the end user. For example, all [`SyntaxError`]s are
807	/// reported when the parser sees EOF inside of some kind of markup. The
808	/// `buffer_position()` will point to the last byte of input which is not
809	/// very useful. `error_position()` will point to the start of corresponding
810	/// markup element (i. e. to the `<` character).
811	///
812	/// This position is always `<= buffer_position()`.
813	pub const fn error_position(&self) -> u64 {
814	self.state.last_error_offset
815	}
816
817	/// Get the decoder, used to decode bytes, read by this reader, to the strings.
818	///
819	/// If [`encoding`] feature is enabled, the used encoding may change after
820	/// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
821	///
822	/// If [`encoding`] feature is enabled and no encoding is specified in declaration,
823	/// defaults to UTF-8.
824	///
825	/// [`encoding`]: ../index.html#encoding
826	#[inline]
827	pub const fn decoder(&self) -> Decoder {
828	self.state.decoder()
829	}
830
/// Gives direct access to the underlying reader, while tracking the amount of
/// data read and updating [`Reader::buffer_position()`] accordingly.
///
/// Note that this method gives you access to the internal reader, and the data
/// read through it will not be returned in any subsequent events read by the
/// `read_event` family of methods.
///
/// # Example
///
/// This example demonstrates how to read a stream of raw bytes from an XML document.
/// This could be used to implement streaming read of text, or to read raw binary
/// bytes embedded in an XML document. (Documents with embedded raw bytes are not
/// valid XML, but XML-derived file formats exist where such documents are valid).
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use std::io::{BufRead, Read};
/// use quick_xml::events::{BytesEnd, BytesStart, Event};
/// use quick_xml::reader::Reader;
///
/// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
/// //                                 ^    ^               ^     ^
/// //                                 0    5              21    27
///
/// assert_eq!(
///     (reader.read_event().unwrap(), reader.buffer_position()),
///     // 5 - end of the `<tag>`
///     (Event::Start(BytesStart::new("tag")), 5)
/// );
///
/// // Reading directly from underlying reader will not update position
/// // let mut inner = reader.get_mut();
///
/// // Reading from the stream() advances position
/// let mut inner = reader.stream();
///
/// // Read binary data. We must know its size
/// let mut binary = [0u8; 16];
/// inner.read_exact(&mut binary).unwrap();
/// assert_eq!(&binary, b"binary << data&>");
/// // 21 - end of the `binary << data&>`
/// assert_eq!(inner.offset(), 21);
/// assert_eq!(reader.buffer_position(), 21);
///
/// assert_eq!(
///     (reader.read_event().unwrap(), reader.buffer_position()),
///     // 27 - end of the `</tag>`
///     (Event::End(BytesEnd::new("tag")), 27)
/// );
///
/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
/// ```
#[inline]
pub fn stream(&mut self) -> BinaryStream<R> {
    // The stream borrows both the source and the parser's offset counter, so
    // the two stay tied together for as long as the returned value is alive.
    BinaryStream {
        inner: &mut self.reader,
        offset: &mut self.state.offset,
    }
}
890	}
891
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// The body is generated by the `read_event_impl!` macro, which receives this
    /// reader, the buffer, the underlying source and the name of the method used
    /// to read markup (`read_until_close`).
    ///
    /// # Parameters
    /// - `buf`: buffer of type `B` that the source `R` may fill and from which
    ///   the returned event may borrow (see [`XmlSource`])
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    ///
    /// The body is generated by the `read_until_close!` macro, which receives this
    /// reader, the buffer and the underlying source.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
913
914	////////////////////////////////////////////////////////////////////////////////////////////////////
915
/// Result of an attempt to read XML textual data from the reader.
///
/// # Parameters
/// - `'r`: lifetime of the text slices carried by the `UpTo*` variants
/// - `B`: type of the buffer that is handed back in [`Markup`](Self::Markup)
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte.
    /// Contains the buffer that should be returned back to the next iteration
    /// cycle to satisfy borrow checker requirements.
    Markup(B),
    /// Contains the text block up to the start of markup (`<` character).
    UpToMarkup(&'r [u8]),
    /// Contains the text block up to EOF; the start of markup (`<` character)
    /// was not found.
    UpToEof(&'r [u8]),
    /// An IO error occurred.
    Err(io::Error),
}
929
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait: a generic one that reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]`; instead of copying data it returns
/// data borrowed from `Self`. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> io::Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;

    /// Read input until start of markup (the `<`) is found or end of input is reached.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;

    /// Read input until the construct recognized by `parser` (for example, a tag
    /// or a processing instruction) is finished.
    ///
    /// This method expects that the start sequence of the construct was already read.
    ///
    /// Returns a slice of data read up to the end of the thing being parsed.
    /// The end of the thing and the returned content are determined by the used parser.
    ///
    /// If input (`Self`) is exhausted and no bytes were read, or if the specified
    /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
    ///
    /// # Parameters
    /// - `parser`: Parser that decides where the construct ends
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// A `P` type parameter is used to preserve state between calls to the underlying
    /// reader which provides bytes fed into the parser.
    ///
    /// [events]: crate::events::Event
    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
    where
        P: Parser;

    /// Read input until a comment, CDATA section or DOCTYPE declaration is finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the kind of the construct and a slice of data read up to its
    /// closing `>`, which is not included in the result.
    ///
    /// Failure is reported through the `Err` variant of the returned `Result`;
    /// in the cases exercised by the tests in this file, an unfinished construct
    /// produces a `SyntaxError` naming the unclosed construct.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut u64,
    ) -> Result<(BangType, &'r [u8]), Error>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> io::Result<Option<u8>>;
}
1018
/// Possible elements started with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`. Contains the balance of `<` (+1) and `>` (-1) seen so far
    /// inside the declaration
    DocType(i32),
}
1029	impl BangType {
1030	#[inline(always)]
1031	const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1032	Ok(match byte {
1033	Some(b'[') => Self::CData,
1034	Some(b'-') => Self::Comment,
1035	Some(b'D') \| Some(b'd') => Self::DocType(`0`),
1036	_ => return Err(SyntaxError::InvalidBangMarkup),
1037	})
1038	}
1039
1040	/// If element is finished, returns its content up to `>` symbol and
1041	/// an index of this symbol, otherwise returns `None`
1042	///
1043	/// # Parameters
1044	/// - `buf`: buffer with data consumed on previous iterations
1045	/// - `chunk`: data read on current iteration and not yet consumed from reader
1046	#[inline(always)]
1047	fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1048	match self {
1049	Self::Comment => {
1050	for i in memchr::memchr_iter(b'>', chunk) {
1051	// Need to read at least 6 symbols (`!---->`) for properly finished comment
1052	// <!----> - XML comment
1053	// 012345 - i
1054	if buf.len() + i > `4` {
1055	if chunk[..i].ends_with(b"--") {
1056	// We cannot strip last `--` from the buffer because we need it in case of
1057	// check_comments enabled option. XML standard requires that comment
1058	// will not end with `--->` sequence because this is a special case of
1059	// `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1060	return Some((&chunk[..i], i + `1`)); // +1 for `>`
1061	}
1062	// End sequence `-\|->` was splitted at \|
1063	// buf --/ \-- chunk
1064	if i == `1` && buf.ends_with(b"-") && chunk[`0`] == b'-' {
1065	return Some((&chunk[..i], i + `1`)); // +1 for `>`
1066	}
1067	// End sequence `--\|>` was splitted at \|
1068	// buf --/ \-- chunk
1069	if i == `0` && buf.ends_with(b"--") {
1070	return Some((&[], i + `1`)); // +1 for `>`
1071	}
1072	}
1073	}
1074	}
1075	Self::CData => {
1076	for i in memchr::memchr_iter(b'>', chunk) {
1077	if chunk[..i].ends_with(b"]]") {
1078	return Some((&chunk[..i], i + `1`)); // +1 for `>`
1079	}
1080	// End sequence `]\|]>` was splitted at \|
1081	// buf --/ \-- chunk
1082	if i == `1` && buf.ends_with(b"]") && chunk[`0`] == b']' {
1083	return Some((&chunk[..i], i + `1`)); // +1 for `>`
1084	}
1085	// End sequence `]]\|>` was splitted at \|
1086	// buf --/ \-- chunk
1087	if i == `0` && buf.ends_with(b"]]") {
1088	return Some((&[], i + `1`)); // +1 for `>`
1089	}
1090	}
1091	}
1092	Self::DocType(ref mut balance) => {
1093	for i in memchr::memchr2_iter(b'<', b'>', chunk) {
1094	if chunk[i] == b'<' {
1095	*balance += `1`;
1096	} else {
1097	if *balance == `0` {
1098	return Some((&chunk[..i], i + `1`)); // +1 for `>`
1099	}
1100	*balance -= `1`;
1101	}
1102	}
1103	}
1104	}
1105	None
1106	}
1107	#[inline]
1108	const fn to_err(&self) -> SyntaxError {
1109	match self {
1110	Self::CData => SyntaxError::UnclosedCData,
1111	Self::Comment => SyntaxError::UnclosedComment,
1112	Self::DocType(_) => SyntaxError::UnclosedDoctype,
1113	}
1114	}
1115	}
1116
1117	////////////////////////////////////////////////////////////////////////////////////////////////////
1118
1119	#[cfg(test)]
1120	mod test {
1121	/// Checks the internal implementation of the various reader methods
1122	macro_rules! check {
1123	(
1124	#[$test:meta]
1125	$read_event:ident,
1126	$read_until_close:ident,
1127	// constructor of the XML source on which internal functions will be called
1128	$source:path,
// constructor of the buffer to which read data will be stored
1130	$buf:expr
1131	$(, $async:ident, $await:ident)?
1132	) => {
1133	mod read_bang_element {
1134	use super::*;
1135	use crate::errors::{Error, SyntaxError};
1136	use crate::reader::BangType;
1137	use crate::utils::Bytes;
1138
1139	/// Checks that reading CDATA content works correctly
1140	mod cdata {
1141	use super::*;
1142	use pretty_assertions::assert_eq;
1143
1144	/// Checks that if input begins like CDATA element, but CDATA start sequence
1145	/// is not finished, parsing ends with an error
1146	#[$test]
1147	#[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
1148	$($async)? fn not_properly_start() {
1149	let buf = $buf;
1150	let mut position = `1`;
1151	let mut input = b"![]]>other content".as_ref();
1152	// ^= 1
1153
1154	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1155	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1156	x => panic!(
1157	"Expected `Err(Syntax(_))`, but got `{:?}`",
1158	x
1159	),
1160	}
1161	assert_eq!(position, `1`);
1162	}
1163
1164	/// Checks that if CDATA startup sequence was matched, but an end sequence
1165	/// is not found, parsing ends with an error
1166	#[$test]
1167	$($async)? fn not_closed() {
1168	let buf = $buf;
1169	let mut position = `1`;
1170	let mut input = b"![CDATA[other content".as_ref();
1171	// ^= 1 ^= 22
1172
1173	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1174	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
1175	x => panic!(
1176	"Expected `Err(Syntax(_))`, but got `{:?}`",
1177	x
1178	),
1179	}
1180	assert_eq!(position, `22`);
1181	}
1182
1183	/// Checks that CDATA element without content inside parsed successfully
1184	#[$test]
1185	$($async)? fn empty() {
1186	let buf = $buf;
1187	let mut position = `1`;
1188	let mut input = b"![CDATA[]]>other content".as_ref();
1189	// ^= 1 ^= 12
1190
1191	let (ty, bytes) = $source(&mut input)
1192	.read_bang_element(buf, &mut position)
1193	$(.$await)?
1194	.unwrap();
1195	assert_eq!(
1196	(ty, Bytes(bytes)),
1197	(BangType::CData, Bytes(b"![CDATA[]]"))
1198	);
1199	assert_eq!(position, `12`);
1200	}
1201
1202	/// Checks that CDATA element with content parsed successfully.
1203	/// Additionally checks that sequences inside CDATA that may look like
1204	/// a CDATA end sequence do not interrupt CDATA parsing
1205	#[$test]
1206	$($async)? fn with_content() {
1207	let buf = $buf;
1208	let mut position = `1`;
1209	let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
1210	// ^= 1 ^= 29
1211
1212	let (ty, bytes) = $source(&mut input)
1213	.read_bang_element(buf, &mut position)
1214	$(.$await)?
1215	.unwrap();
1216	assert_eq!(
1217	(ty, Bytes(bytes)),
1218	(BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))
1219	);
1220	assert_eq!(position, `29`);
1221	}
1222	}
1223
1224	/// Checks that reading XML comments works correctly. According to the [specification],
1225	/// comment data can contain any sequence except `--`:
1226	///
1227	/// ```peg
1228	/// comment = '<--' (!'--' char) '-->';*
1229	/// char = [#x1-#x2C]
1230	/// / [#x2E-#xD7FF]
1231	/// / [#xE000-#xFFFD]
1232	/// / [#x10000-#x10FFFF]
1233	/// ```
1234	///
1235	/// The presence of this limitation, however, is simply a poorly designed specification
1236	/// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
1237	/// presence of these sequences by default. This tests allow such content.
1238	///
1239	/// [specification]: https://www.w3.org/TR/xml11/#dt-comment
1240	mod comment {
1241	use super::*;
1242	use pretty_assertions::assert_eq;
1243
1244	#[$test]
1245	#[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
1246	$($async)? fn not_properly_start() {
1247	let buf = $buf;
1248	let mut position = `1`;
1249	let mut input = b"!- -->other content".as_ref();
1250	// ^= 1
1251
1252	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1253	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1254	x => panic!(
1255	"Expected `Err(Syntax(_))`, but got `{:?}`",
1256	x
1257	),
1258	}
1259	assert_eq!(position, `1`);
1260	}
1261
1262	#[$test]
1263	$($async)? fn not_properly_end() {
1264	let buf = $buf;
1265	let mut position = `1`;
1266	let mut input = b"!->other content".as_ref();
1267	// ^= 1 ^= 17
1268
1269	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1270	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1271	x => panic!(
1272	"Expected `Err(Syntax(_))`, but got `{:?}`",
1273	x
1274	),
1275	}
1276	assert_eq!(position, `17`);
1277	}
1278
1279	#[$test]
1280	$($async)? fn not_closed1() {
1281	let buf = $buf;
1282	let mut position = `1`;
1283	let mut input = b"!--other content".as_ref();
1284	// ^= 1 ^= 17
1285
1286	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1287	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1288	x => panic!(
1289	"Expected `Err(Syntax(_))`, but got `{:?}`",
1290	x
1291	),
1292	}
1293	assert_eq!(position, `17`);
1294	}
1295
1296	#[$test]
1297	$($async)? fn not_closed2() {
1298	let buf = $buf;
1299	let mut position = `1`;
1300	let mut input = b"!-->other content".as_ref();
1301	// ^= 1 ^= 18
1302
1303	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1304	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1305	x => panic!(
1306	"Expected `Err(Syntax(_))`, but got `{:?}`",
1307	x
1308	),
1309	}
1310	assert_eq!(position, `18`);
1311	}
1312
1313	#[$test]
1314	$($async)? fn not_closed3() {
1315	let buf = $buf;
1316	let mut position = `1`;
1317	let mut input = b"!--->other content".as_ref();
1318	// ^= 1 ^= 19
1319
1320	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1321	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
1322	x => panic!(
1323	"Expected `Err(Syntax(_))`, but got `{:?}`",
1324	x
1325	),
1326	}
1327	assert_eq!(position, `19`);
1328	}
1329
1330	#[$test]
1331	$($async)? fn empty() {
1332	let buf = $buf;
1333	let mut position = `1`;
1334	let mut input = b"!---->other content".as_ref();
1335	// ^= 1 ^= 7
1336
1337	let (ty, bytes) = $source(&mut input)
1338	.read_bang_element(buf, &mut position)
1339	$(.$await)?
1340	.unwrap();
1341	assert_eq!(
1342	(ty, Bytes(bytes)),
1343	(BangType::Comment, Bytes(b"!----"))
1344	);
1345	assert_eq!(position, `7`);
1346	}
1347
1348	#[$test]
1349	$($async)? fn with_content() {
1350	let buf = $buf;
1351	let mut position = `1`;
1352	let mut input = b"!--->comment<--->other content".as_ref();
1353	// ^= 1 ^= 18
1354
1355	let (ty, bytes) = $source(&mut input)
1356	.read_bang_element(buf, &mut position)
1357	$(.$await)?
1358	.unwrap();
1359	assert_eq!(
1360	(ty, Bytes(bytes)),
1361	(BangType::Comment, Bytes(b"!--->comment<---"))
1362	);
1363	assert_eq!(position, `18`);
1364	}
1365	}
1366
1367	/// Checks that reading DOCTYPE definition works correctly
1368	mod doctype {
1369	use super::*;
1370
1371	mod uppercase {
1372	use super::*;
1373	use pretty_assertions::assert_eq;
1374
1375	#[$test]
1376	$($async)? fn not_properly_start() {
1377	let buf = $buf;
1378	let mut position = `1`;
1379	let mut input = b"!D other content".as_ref();
1380	// ^= 1 ^= 17
1381
1382	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1383	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1384	x => panic!(
1385	"Expected `Err(Syntax(_))`, but got `{:?}`",
1386	x
1387	),
1388	}
1389	assert_eq!(position, `17`);
1390	}
1391
1392	#[$test]
1393	$($async)? fn without_space() {
1394	let buf = $buf;
1395	let mut position = `1`;
1396	let mut input = b"!DOCTYPEother content".as_ref();
1397	// ^= 1 ^= 22
1398
1399	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1400	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1401	x => panic!(
1402	"Expected `Err(Syntax(_))`, but got `{:?}`",
1403	x
1404	),
1405	}
1406	assert_eq!(position, `22`);
1407	}
1408
1409	#[$test]
1410	$($async)? fn empty() {
1411	let buf = $buf;
1412	let mut position = `1`;
1413	let mut input = b"!DOCTYPE>other content".as_ref();
1414	// ^= 1 ^= 10
1415
1416	let (ty, bytes) = $source(&mut input)
1417	.read_bang_element(buf, &mut position)
1418	$(.$await)?
1419	.unwrap();
1420	assert_eq!(
1421	(ty, Bytes(bytes)),
1422	(BangType::DocType(`0`), Bytes(b"!DOCTYPE"))
1423	);
1424	assert_eq!(position, `10`);
1425	}
1426
1427	#[$test]
1428	$($async)? fn not_closed() {
1429	let buf = $buf;
1430	let mut position = `1`;
1431	let mut input = b"!DOCTYPE other content".as_ref();
1432	// ^= 1 ^23
1433
1434	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1435	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1436	x => panic!(
1437	"Expected `Err(Syntax(_))`, but got `{:?}`",
1438	x
1439	),
1440	}
1441	assert_eq!(position, `23`);
1442	}
1443	}
1444
1445	mod lowercase {
1446	use super::*;
1447	use pretty_assertions::assert_eq;
1448
1449	#[$test]
1450	$($async)? fn not_properly_start() {
1451	let buf = $buf;
1452	let mut position = `1`;
1453	let mut input = b"!d other content".as_ref();
1454	// ^= 1 ^= 17
1455
1456	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1457	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1458	x => panic!(
1459	"Expected `Err(Syntax(_))`, but got `{:?}`",
1460	x
1461	),
1462	}
1463	assert_eq!(position, `17`);
1464	}
1465
1466	#[$test]
1467	$($async)? fn without_space() {
1468	let buf = $buf;
1469	let mut position = `1`;
1470	let mut input = b"!doctypeother content".as_ref();
1471	// ^= 1 ^= 22
1472
1473	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1474	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1475	x => panic!(
1476	"Expected `Err(Syntax(_))`, but got `{:?}`",
1477	x
1478	),
1479	}
1480	assert_eq!(position, `22`);
1481	}
1482
1483	#[$test]
1484	$($async)? fn empty() {
1485	let buf = $buf;
1486	let mut position = `1`;
1487	let mut input = b"!doctype>other content".as_ref();
1488	// ^= 1 ^= 10
1489
1490	let (ty, bytes) = $source(&mut input)
1491	.read_bang_element(buf, &mut position)
1492	$(.$await)?
1493	.unwrap();
1494	assert_eq!(
1495	(ty, Bytes(bytes)),
1496	(BangType::DocType(`0`), Bytes(b"!doctype"))
1497	);
1498	assert_eq!(position, `10`);
1499	}
1500
1501	#[$test]
1502	$($async)? fn not_closed() {
1503	let buf = $buf;
1504	let mut position = `1`;
1505	let mut input = b"!doctype other content".as_ref();
1506	// ^= 1 ^= 23
1507
1508	match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1509	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1510	x => panic!(
1511	"Expected `Err(Syntax(_))`, but got `{:?}`",
1512	x
1513	),
1514	}
1515	assert_eq!(position, `23`);
1516	}
1517	}
1518	}
1519	}
1520
1521	mod read_element {
1522	use super::*;
1523	use crate::errors::{Error, SyntaxError};
1524	use crate::parser::ElementParser;
1525	use crate::utils::Bytes;
1526	use pretty_assertions::assert_eq;
1527
1528	/// Checks that nothing was read from empty buffer
1529	#[$test]
1530	$($async)? fn empty() {
1531	let buf = $buf;
1532	let mut position = `1`;
1533	let mut input = b"".as_ref();
1534	// ^= 1
1535
1536	match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
1537	Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
1538	x => panic!(
1539	"Expected `Err(Syntax(_))`, but got `{:?}`",
1540	x
1541	),
1542	}
1543	assert_eq!(position, `1`);
1544	}
1545
1546	mod open {
1547	use super::*;
1548	use pretty_assertions::assert_eq;
1549
1550	#[$test]
1551	$($async)? fn empty_tag() {
1552	let buf = $buf;
1553	let mut position = `1`;
1554	let mut input = b">".as_ref();
1555	// ^= 2
1556
1557	assert_eq!(
1558	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1559	Bytes(b"")
1560	);
1561	assert_eq!(position, `2`);
1562	}
1563
1564	#[$test]
1565	$($async)? fn normal() {
1566	let buf = $buf;
1567	let mut position = `1`;
1568	let mut input = b"tag>".as_ref();
1569	// ^= 5
1570
1571	assert_eq!(
1572	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1573	Bytes(b"tag")
1574	);
1575	assert_eq!(position, `5`);
1576	}
1577
1578	#[$test]
1579	$($async)? fn empty_ns_empty_tag() {
1580	let buf = $buf;
1581	let mut position = `1`;
1582	let mut input = b":>".as_ref();
1583	// ^= 3
1584
1585	assert_eq!(
1586	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1587	Bytes(b":")
1588	);
1589	assert_eq!(position, `3`);
1590	}
1591
1592	#[$test]
1593	$($async)? fn empty_ns() {
1594	let buf = $buf;
1595	let mut position = `1`;
1596	let mut input = b":tag>".as_ref();
1597	// ^= 6
1598
1599	assert_eq!(
1600	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1601	Bytes(b":tag")
1602	);
1603	assert_eq!(position, `6`);
1604	}
1605
1606	#[$test]
1607	$($async)? fn with_attributes() {
1608	let buf = $buf;
1609	let mut position = `1`;
1610	let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1611	// ^= 39
1612
1613	assert_eq!(
1614	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1615	Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)
1616	);
1617	assert_eq!(position, `39`);
1618	}
1619	}
1620
1621	mod self_closed {
1622	use super::*;
1623	use pretty_assertions::assert_eq;
1624
1625	#[$test]
1626	$($async)? fn empty_tag() {
1627	let buf = $buf;
1628	let mut position = `1`;
1629	let mut input = b"/>".as_ref();
1630	// ^= 3
1631
1632	assert_eq!(
1633	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1634	Bytes(b"/")
1635	);
1636	assert_eq!(position, `3`);
1637	}
1638
1639	#[$test]
1640	$($async)? fn normal() {
1641	let buf = $buf;
1642	let mut position = `1`;
1643	let mut input = b"tag/>".as_ref();
1644	// ^= 6
1645
1646	assert_eq!(
1647	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1648	Bytes(b"tag/")
1649	);
1650	assert_eq!(position, `6`);
1651	}
1652
1653	#[$test]
1654	$($async)? fn empty_ns_empty_tag() {
1655	let buf = $buf;
1656	let mut position = `1`;
1657	let mut input = b":/>".as_ref();
1658	// ^= 4
1659
1660	assert_eq!(
1661	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1662	Bytes(b":/")
1663	);
1664	assert_eq!(position, `4`);
1665	}
1666
1667	#[$test]
1668	$($async)? fn empty_ns() {
1669	let buf = $buf;
1670	let mut position = `1`;
1671	let mut input = b":tag/>".as_ref();
1672	// ^= 7
1673
1674	assert_eq!(
1675	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1676	Bytes(b":tag/")
1677	);
1678	assert_eq!(position, `7`);
1679	}
1680
1681	#[$test]
1682	$($async)? fn with_attributes() {
1683	let buf = $buf;
1684	let mut position = `1`;
1685	let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1686	// ^= 42
1687
1688	assert_eq!(
1689	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1690	Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)
1691	);
1692	assert_eq!(position, `42`);
1693	}
1694	}
1695
1696	mod close {
1697	use super::*;
1698	use pretty_assertions::assert_eq;
1699
1700	#[$test]
1701	$($async)? fn empty_tag() {
1702	let buf = $buf;
1703	let mut position = `1`;
1704	let mut input = b"/ >".as_ref();
1705	// ^= 4
1706
1707	assert_eq!(
1708	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1709	Bytes(b"/ ")
1710	);
1711	assert_eq!(position, `4`);
1712	}
1713
1714	#[$test]
1715	$($async)? fn normal() {
1716	let buf = $buf;
1717	let mut position = `1`;
1718	let mut input = b"/tag>".as_ref();
1719	// ^= 6
1720
1721	assert_eq!(
1722	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1723	Bytes(b"/tag")
1724	);
1725	assert_eq!(position, `6`);
1726	}
1727
1728	#[$test]
1729	$($async)? fn empty_ns_empty_tag() {
1730	let buf = $buf;
1731	let mut position = `1`;
1732	let mut input = b"/:>".as_ref();
1733	// ^= 4
1734
1735	assert_eq!(
1736	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1737	Bytes(b"/:")
1738	);
1739	assert_eq!(position, `4`);
1740	}
1741
1742	#[$test]
1743	$($async)? fn empty_ns() {
1744	let buf = $buf;
1745	let mut position = `1`;
1746	let mut input = b"/:tag>".as_ref();
1747	// ^= 7
1748
1749	assert_eq!(
1750	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1751	Bytes(b"/:tag")
1752	);
1753	assert_eq!(position, `7`);
1754	}
1755
1756	#[$test]
1757	$($async)? fn with_attributes() {
1758	let buf = $buf;
1759	let mut position = `1`;
1760	let mut input = br#"/tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1761	// ^= 40
1762
1763	assert_eq!(
1764	Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1765	Bytes(br#"/tag attr-1=">" attr2 = '>' 3attr"#)
1766	);
1767	assert_eq!(position, `40`);
1768	}
1769	}
1770	}
1771
1772	/// Ensures, that no empty `Text` events are generated
1773	mod $read_event {
1774	use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
1775	use crate::reader::Reader;
1776	use pretty_assertions::assert_eq;
1777
1778	/// When `encoding` feature is enabled, encoding should be detected
1779	/// from BOM (UTF-8) and BOM should be stripped.
1780	///
1781	/// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1782	/// character should be stripped for consistency
1783	#[$test]
1784	$($async)? fn bom_from_reader() {
1785	let mut reader = Reader::from_reader("`\u{feff}\u{feff}`".as_bytes());
1786
1787	assert_eq!(
1788	reader.$read_event($buf) $(.$await)? .unwrap(),
1789	Event::Text(BytesText::from_escaped("`\u{feff}`"))
1790	);
1791
1792	assert_eq!(
1793	reader.$read_event($buf) $(.$await)? .unwrap(),
1794	Event::Eof
1795	);
1796	}
1797
1798	/// When parsing from &str, encoding is fixed (UTF-8), so
1799	/// - when `encoding` feature is disabled, the behavior the
1800	/// same as in `bom_from_reader` text
1801	/// - when `encoding` feature is enabled, the behavior should
1802	/// stay consistent, so the first BOM character is stripped
1803	#[$test]
1804	$($async)? fn bom_from_str() {
1805	let mut reader = Reader::from_str("`\u{feff}\u{feff}`");
1806
1807	assert_eq!(
1808	reader.$read_event($buf) $(.$await)? .unwrap(),
1809	Event::Text(BytesText::from_escaped("`\u{feff}`"))
1810	);
1811
1812	assert_eq!(
1813	reader.$read_event($buf) $(.$await)? .unwrap(),
1814	Event::Eof
1815	);
1816	}
1817
1818	#[$test]
1819	$($async)? fn declaration() {
1820	let mut reader = Reader::from_str("<?xml ?>");
1821
1822	assert_eq!(
1823	reader.$read_event($buf) $(.$await)? .unwrap(),
1824	Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", `3`)))
1825	);
1826	}
1827
1828	#[$test]
1829	$($async)? fn doctype() {
1830	let mut reader = Reader::from_str("<!DOCTYPE x>");
1831
1832	assert_eq!(
1833	reader.$read_event($buf) $(.$await)? .unwrap(),
1834	Event::DocType(BytesText::from_escaped("x"))
1835	);
1836	}
1837
1838	#[$test]
1839	$($async)? fn processing_instruction() {
1840	let mut reader = Reader::from_str("<?xml-stylesheet '? >`\"` ?>");
1841
1842	assert_eq!(
1843	reader.$read_event($buf) $(.$await)? .unwrap(),
1844	Event::PI(BytesPI::new("xml-stylesheet '? >`\"` "))
1845	);
1846	}
1847
1848	/// Lone closing tags are not allowed, so testing it together with start tag
1849	#[$test]
1850	$($async)? fn start_and_end() {
1851	let mut reader = Reader::from_str("<tag></tag>");
1852
1853	assert_eq!(
1854	reader.$read_event($buf) $(.$await)? .unwrap(),
1855	Event::Start(BytesStart::new("tag"))
1856	);
1857
1858	assert_eq!(
1859	reader.$read_event($buf) $(.$await)? .unwrap(),
1860	Event::End(BytesEnd::new("tag"))
1861	);
1862	}
1863
1864	#[$test]
1865	$($async)? fn empty() {
1866	let mut reader = Reader::from_str("<tag/>");
1867
1868	assert_eq!(
1869	reader.$read_event($buf) $(.$await)? .unwrap(),
1870	Event::Empty(BytesStart::new("tag"))
1871	);
1872	}
1873
1874	#[$test]
1875	$($async)? fn text() {
1876	let mut reader = Reader::from_str("text");
1877
1878	assert_eq!(
1879	reader.$read_event($buf) $(.$await)? .unwrap(),
1880	Event::Text(BytesText::from_escaped("text"))
1881	);
1882	}
1883
1884	#[$test]
1885	$($async)? fn cdata() {
1886	let mut reader = Reader::from_str("<![CDATA[]]>");
1887
1888	assert_eq!(
1889	reader.$read_event($buf) $(.$await)? .unwrap(),
1890	Event::CData(BytesCData::new(""))
1891	);
1892	}
1893
1894	#[$test]
1895	$($async)? fn comment() {
1896	let mut reader = Reader::from_str("<!---->");
1897
1898	assert_eq!(
1899	reader.$read_event($buf) $(.$await)? .unwrap(),
1900	Event::Comment(BytesText::from_escaped(""))
1901	);
1902	}
1903
1904	#[$test]
1905	$($async)? fn eof() {
1906	let mut reader = Reader::from_str("");
1907
1908	assert_eq!(
1909	reader.$read_event($buf) $(.$await)? .unwrap(),
1910	Event::Eof
1911	);
1912	}
1913	}
1914	};
1915	}
1916
1917	// Export macros for the child modules:
1918	// - buffered_reader
1919	// - slice_reader
1920	pub(super) use check;
1921	}
1922