state.rs source code [crates/quick_xml/src/reader/state.rs]

1	#[cfg(feature = "encoding")]
2	use encoding_rs::UTF_8;
3
4	use crate::encoding::Decoder;
5	use crate::errors::{Error, IllFormedError, Result, SyntaxError};
6	use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
7	#[cfg(feature = "encoding")]
8	use crate::reader::EncodingRef;
9	use crate::reader::{BangType, Config, ParseState};
10	use crate::utils::{is_whitespace, name_len};
11
12	/// A struct that holds a current reader state and a parser configuration.
13	/// It is independent on a way of reading data: the reader feed data into it and
14	/// get back produced [`Event`]s.
15	#[derive(Clone, Debug)]
16	pub(super) struct ReaderState {
17	/// Number of bytes read from the source of data since the reader was created
18	pub offset: u64,
19	/// A snapshot of an `offset` of the last error returned. It can be less than
20	/// `offset`, because some errors conveniently report at earlier position,
21	/// and changing `offset` is not possible, because `Error::IllFormed` errors
22	/// are recoverable.
23	pub last_error_offset: u64,
24	/// Defines how to process next byte
25	pub state: ParseState,
26	/// User-defined settings that affect parsing
27	pub config: Config,
28	/// All currently Started elements which didn't have a matching
29	/// End element yet.
30	///
31	/// For an XML
32	///
33	/// ```xml
34	/// <root><one/><inner attr="value">\|<tag></inner></root>
35	/// ```
36	/// when cursor at the `\|` position buffer contains:
37	///
38	/// ```text
39	/// rootinner
40	/// ^ ^
41	/// ```
42	///
43	/// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
44	/// (0 and 4 in that case).
45	opened_buffer: Vec<u8>,
46	/// Opened name start indexes into [`Self::opened_buffer`]. See documentation
47	/// for that field for details
48	opened_starts: Vec<usize>,
49
50	#[cfg(feature = "encoding")]
51	/// Reference to the encoding used to read an XML
52	pub encoding: EncodingRef,
53	}
54
55	impl ReaderState {
56	/// Trims end whitespaces from `bytes`, if required, and returns a text event.
57	///
58	/// # Parameters
59	/// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
60	pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
61	let mut content = bytes;
62
63	if self.config.trim_text_end {
64	// Skip the ending '<'
65	let len = bytes
66	.iter()
67	.rposition(\|&b\| !is_whitespace(b))
68	.map_or(`0`, \|p\| p + `1`);
69	content = &bytes[..len];
70	}
71	BytesText::wrap(content, self.decoder())
72	}
73
74	/// Returns `Comment`, `CData` or `DocType` event.
75	///
76	/// `buf` contains data between `<` and `>`:
77	/// - CDATA: `![CDATA[...]]`
78	/// - Comment: `!--...--`
79	/// - Doctype (uppercase): `!D...`
80	/// - Doctype (lowercase): `!d...`
81	pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
82	debug_assert_eq!(
83	buf.first(),
84	Some(&b'!'),
85	"CDATA, comment or DOCTYPE should start from '!'"
86	);
87
88	let uncased_starts_with = \|string: &[u8], prefix: &[u8]\| {
89	string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
90	};
91
92	let len = buf.len();
93	match bang_type {
94	BangType::Comment if buf.starts_with(b"!--") => {
95	debug_assert!(buf.ends_with(b"--"));
96	if self.config.check_comments {
97	// search if '--' not in comments
98	let mut haystack = &buf[`3`..len - `2`];
99	let mut off = `0`;
100	while let Some(p) = memchr::memchr(b'-', haystack) {
101	off += p + `1`;
102	// if next byte after `-` is also `-`, return an error
103	if buf[`3` + off] == b'-' {
104	// Explanation of the magic:
105	//
106	// - `self.offset`` just after `>`,
107	// - `buf` contains `!-- con--tent --`
108	// - `p` is counted from byte after `<!--`
109	//
110	// <!-- con--tent -->:
111	// ~~~~~~~~~~~~~~~~ : - buf
112	// : =========== : - zone of search (possible values of `p`)
113	// : \|---p : - p is counted from \| (\| is 0)
114	// : : : ^ - self.offset
115	// ^ : : - self.offset - len
116	// ^ : - self.offset - len + 2
117	// ^ - self.offset - len + 2 + p
118	self.last_error_offset = self.offset - len as u64 + `2` + p as u64;
119	return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
120	}
121	// Continue search after single `-` (+1 to skip it)
122	haystack = &haystack[p + `1`..];
123	}
124	}
125	Ok(Event::Comment(BytesText::wrap(
126	// Cut of `!--` and `--` from start and end
127	&buf[`3`..len - `2`],
128	self.decoder(),
129	)))
130	}
131	// XML requires uppercase only:
132	// https://www.w3.org/TR/xml11/#sec-cdata-sect
133	// Even HTML5 required uppercase only:
134	// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
135	BangType::CData if buf.starts_with(b"![CDATA[") => {
136	debug_assert!(buf.ends_with(b"]]"));
137	Ok(Event::CData(BytesCData::wrap(
138	// Cut of `![CDATA[` and `]]` from start and end
139	&buf[`8`..len - `2`],
140	self.decoder(),
141	)))
142	}
143	// XML requires uppercase only, but we will check that on validation stage:
144	// https://www.w3.org/TR/xml11/#sec-prolog-dtd
145	// HTML5 allows mixed case for doctype declarations:
146	// https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
147	BangType::DocType(`0`) if uncased_starts_with(buf, b"!DOCTYPE") => {
148	match buf[`8`..].iter().position(\|&b\| !is_whitespace(b)) {
149	Some(start) => Ok(Event::DocType(BytesText::wrap(
150	// Cut of `!DOCTYPE` and any number of spaces from start
151	&buf[`8` + start..],
152	self.decoder(),
153	))),
154	None => {
155	// Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
156	// We want report error at place where name is expected - this is just
157	// before `>`
158	self.last_error_offset = self.offset - `1`;
159	return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));
160	}
161	}
162	}
163	_ => {
164	// <!....>
165	// ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
166	// ^------- We report error at that position, so we need to subtract 2 and buf len
167	self.last_error_offset = self.offset - len as u64 - `2`;
168	Err(bang_type.to_err().into())
169	}
170	}
171	}
172
173	/// Wraps content of `buf` into the [`Event::End`] event. Does the check that
174	/// end name matches the last opened start name if `self.config.check_end_names` is set.
175	///
176	/// `buf` contains data between `<` and `>`, for example `/tag`.
177	pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
178	debug_assert_eq!(
179	buf.first(),
180	Some(&b'/'),
181	"closing tag should start from '/'"
182	);
183
184	// Strip the `/` character. `content` contains data between `</` and `>`
185	let content = &buf[`1`..];
186	// XML standard permits whitespaces after the markup name in closing tags.
187	// Let's strip them from the buffer before comparing tag names.
188	let name = if self.config.trim_markup_names_in_closing_tags {
189	if let Some(pos_end_name) = content.iter().rposition(\|&b\| !is_whitespace(b)) {
190	&content[..pos_end_name + `1`]
191	} else {
192	content
193	}
194	} else {
195	content
196	};
197
198	let decoder = self.decoder();
199
200	// Get the index in self.opened_buffer of the name of the last opened tag
201	match self.opened_starts.pop() {
202	Some(start) => {
203	if self.config.check_end_names {
204	let expected = &self.opened_buffer[start..];
205	if name != expected {
206	let expected = decoder.decode(expected).unwrap_or_default().into_owned();
207	// #513: In order to allow error recovery we should drop content of the buffer
208	self.opened_buffer.truncate(start);
209
210	// Report error at start of the end tag at `<` character
211	// -2 for `<` and `>`
212	self.last_error_offset = self.offset - buf.len() as u64 - `2`;
213	return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
214	expected,
215	found: decoder.decode(name).unwrap_or_default().into_owned(),
216	}));
217	}
218	}
219
220	self.opened_buffer.truncate(start);
221	}
222	None => {
223	if !self.config.allow_unmatched_ends {
224	// Report error at start of the end tag at `<` character
225	// -2 for `<` and `>`
226	self.last_error_offset = self.offset - buf.len() as u64 - `2`;
227	return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
228	decoder.decode(name).unwrap_or_default().into_owned(),
229	)));
230	}
231	}
232	}
233
234	Ok(Event::End(BytesEnd::wrap(name.into())))
235	}
236
237	/// `buf` contains data between `<` and `>` and the first byte is `?`.
238	/// `self.offset` already after the `>`
239	///
240	/// Returns `Decl` or `PI` event
241	pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
242	debug_assert!(buf.len() > `0`);
243	debug_assert_eq!(buf[`0`], b'?');
244
245	let len = buf.len();
246	// We accept at least <??>
247	// ~~ - len = 2
248	if len > `1` && buf[len - `1`] == b'?' {
249	// Cut of `?` and `?` from start and end
250	let content = &buf[`1`..len - `1`];
251	let len = content.len();
252
253	if content.starts_with(b"xml") && (len == `3` \|\| is_whitespace(content[`3`])) {
254	let event = BytesDecl::from_start(BytesStart::wrap(content, `3`));
255
256	// Try getting encoding from the declaration event
257	#[cfg(feature = "encoding")]
258	if self.encoding.can_be_refined() {
259	if let Some(encoding) = event.encoder() {
260	self.encoding = EncodingRef::XmlDetected(encoding);
261	}
262	}
263
264	Ok(Event::Decl(event))
265	} else {
266	Ok(Event::PI(BytesPI::wrap(content, name_len(content))))
267	}
268	} else {
269	// <?....EOF
270	// ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,
271	// so we move offset to it (-2 for `<` and `>`)
272	self.last_error_offset = self.offset - len as u64 - `2`;
273	Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
274	}
275	}
276
277	/// Converts content of a tag to a `Start` or an `Empty` event
278	///
279	/// # Parameters
280	/// - `content`: Content of a tag between `<` and `>`
281	pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
282	if let Some(content) = content.strip_suffix(b"/") {
283	// This is self-closed tag `<something/>`
284	let event = BytesStart::wrap(content, name_len(content));
285
286	if self.config.expand_empty_elements {
287	self.state = ParseState::InsideEmpty;
288	self.opened_starts.push(self.opened_buffer.len());
289	self.opened_buffer.extend(event.name().as_ref());
290	Event::Start(event)
291	} else {
292	Event::Empty(event)
293	}
294	} else {
295	let event = BytesStart::wrap(content, name_len(content));
296
297	// #514: Always store names event when .check_end_names == false,
298	// because checks can be temporary disabled and when they would be
299	// enabled, we should have that information
300	self.opened_starts.push(self.opened_buffer.len());
301	self.opened_buffer.extend(event.name().as_ref());
302	Event::Start(event)
303	}
304	}
305
306	#[inline]
307	pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
308	self.state = ParseState::InsideText;
309	let name = self
310	.opened_buffer
311	.split_off(self.opened_starts.pop().unwrap());
312	BytesEnd::wrap(name.into())
313	}
314
315	/// Get the decoder, used to decode bytes, read by this reader, to the strings.
316	///
317	/// If [`encoding`] feature is enabled, the used encoding may change after
318	/// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
319	///
320	/// If [`encoding`] feature is enabled and no encoding is specified in declaration,
321	/// defaults to UTF-8.
322	///
323	/// [`encoding`]: ../../index.html#encoding
324	pub const fn decoder(&self) -> Decoder {
325	Decoder {
326	#[cfg(feature = "encoding")]
327	encoding: self.encoding.encoding(),
328	}
329	}
330	}
331
332	impl Default for ReaderState {
333	fn default() -> Self {
334	Self {
335	offset: `0`,
336	last_error_offset: `0`,
337	state: ParseState::Init,
338	config: Config::default(),
339	opened_buffer: Vec::new(),
340	opened_starts: Vec::new(),
341
342	#[cfg(feature = "encoding")]
343	encoding: EncodingRef::Implicit(UTF_8),
344	}
345	}
346	}
347