parser.rs source code [crates/quick-xml-0.30.0/src/reader/parser.rs]

1	#[cfg(feature = "encoding")]
2	use encoding_rs::UTF_8;
3
4	use crate::encoding::Decoder;
5	use crate::errors::{Error, Result};
6	use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
7	#[cfg(feature = "encoding")]
8	use crate::reader::EncodingRef;
9	use crate::reader::{is_whitespace, BangType, ParseState};
10
11	use memchr;
12
13	/// A struct that holds a current parse state and a parser configuration.
14	/// It is independent on a way of reading data: the reader feed data into it and
15	/// get back produced [`Event`]s.
16	#[derive(Clone)]
17	pub(super) struct Parser {
18	/// Number of bytes read from the source of data since the parser was created
19	pub offset: usize,
20	/// Defines how to process next byte
21	pub state: ParseState,
22	/// Expand empty element into an opening and closing element
23	pub expand_empty_elements: bool,
24	/// Trims leading whitespace in Text events, skip the element if text is empty
25	pub trim_text_start: bool,
26	/// Trims trailing whitespace in Text events.
27	pub trim_text_end: bool,
28	/// Trims trailing whitespaces from markup names in closing tags `</a >`
29	pub trim_markup_names_in_closing_tags: bool,
30	/// Check if [`Event::End`] nodes match last [`Event::Start`] node
31	pub check_end_names: bool,
32	/// Check if comments contains `--` (false per default)
33	pub check_comments: bool,
34	/// All currently Started elements which didn't have a matching
35	/// End element yet.
36	///
37	/// For an XML
38	///
39	/// ```xml
40	/// <root><one/><inner attr="value">\|<tag></inner></root>
41	/// ```
42	/// when cursor at the `\|` position buffer contains:
43	///
44	/// ```text
45	/// rootinner
46	/// ^ ^
47	/// ```
48	///
49	/// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
50	/// (0 and 4 in that case).
51	opened_buffer: Vec<u8>,
52	/// Opened name start indexes into [`Self::opened_buffer`]. See documentation
53	/// for that field for details
54	opened_starts: Vec<usize>,
55
56	#[cfg(feature = "encoding")]
57	/// Reference to the encoding used to read an XML
58	pub encoding: EncodingRef,
59	}
60
61	impl Parser {
62	/// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
63	///
64	/// # Parameters
65	/// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
66	///
67	/// [`Text`]: Event::Text
68	pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
69	let mut content = bytes;
70
71	if self.trim_text_end {
72	// Skip the ending '<'
73	let len = bytes
74	.iter()
75	.rposition(\|&b\| !is_whitespace(b))
76	.map_or_else(\|\| bytes.len(), \|p\| p + `1`);
77	content = &bytes[..len];
78	}
79
80	Ok(Event::Text(BytesText::wrap(content, self.decoder())))
81	}
82
83	/// reads `BytesElement` starting with a `!`,
84	/// return `Comment`, `CData` or `DocType` event
85	pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
86	let uncased_starts_with = \|string: &[u8], prefix: &[u8]\| {
87	string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
88	};
89
90	let len = buf.len();
91	match bang_type {
92	BangType::Comment if buf.starts_with(b"!--") => {
93	debug_assert!(buf.ends_with(b"--"));
94	if self.check_comments {
95	// search if '--' not in comments
96	if let Some(p) = memchr::memchr_iter(b'-', &buf[`3`..len - `2`])
97	.position(\|p\| buf[`3` + p + `1`] == b'-')
98	{
99	self.offset += len - p;
100	return Err(Error::UnexpectedToken("--".to_string()));
101	}
102	}
103	Ok(Event::Comment(BytesText::wrap(
104	&buf[`3`..len - `2`],
105	self.decoder(),
106	)))
107	}
108	BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
109	debug_assert!(buf.ends_with(b"]]"));
110	Ok(Event::CData(BytesCData::wrap(
111	&buf[`8`..len - `2`],
112	self.decoder(),
113	)))
114	}
115	BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
116	let start = buf[`8`..]
117	.iter()
118	.position(\|b\| !is_whitespace(*b))
119	.unwrap_or(len - `8`);
120	if start + `8` >= len {
121	return Err(Error::EmptyDocType);
122	}
123	Ok(Event::DocType(BytesText::wrap(
124	&buf[`8` + start..],
125	self.decoder(),
126	)))
127	}
128	_ => Err(bang_type.to_err()),
129	}
130	}
131
132	/// Wraps content of `buf` into the [`Event::End`] event. Does the check that
133	/// end name matches the last opened start name if `self.check_end_names` is set.
134	pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
135	// XML standard permits whitespaces after the markup name in closing tags.
136	// Let's strip them from the buffer before comparing tag names.
137	let name = if self.trim_markup_names_in_closing_tags {
138	if let Some(pos_end_name) = buf[`1`..].iter().rposition(\|&b\| !b.is_ascii_whitespace()) {
139	let (name, _) = buf[`1`..].split_at(pos_end_name + `1`);
140	name
141	} else {
142	&buf[`1`..]
143	}
144	} else {
145	&buf[`1`..]
146	};
147
148	let decoder = self.decoder();
149	let mismatch_err = \|expected: String, found: &[u8], offset: &mut usize\| {
150	*offset -= buf.len();
151	Err(Error::EndEventMismatch {
152	expected,
153	found: decoder.decode(found).unwrap_or_default().into_owned(),
154	})
155	};
156
157	// Get the index in self.opened_buffer of the name of the last opened tag
158	match self.opened_starts.pop() {
159	Some(start) => {
160	if self.check_end_names {
161	let expected = &self.opened_buffer[start..];
162	if name != expected {
163	let expected = decoder.decode(expected).unwrap_or_default().into_owned();
164	// #513: In order to allow error recovery we should drop content of the buffer
165	self.opened_buffer.truncate(start);
166
167	return mismatch_err(expected, name, &mut self.offset);
168	}
169	}
170
171	self.opened_buffer.truncate(start);
172	}
173	None => {
174	if self.check_end_names {
175	return mismatch_err("".to_string(), &buf[`1`..], &mut self.offset);
176	}
177	}
178	}
179
180	Ok(Event::End(BytesEnd::wrap(name.into())))
181	}
182
183	/// reads `BytesElement` starting with a `?`,
184	/// return `Decl` or `PI` event
185	pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
186	let len = buf.len();
187	if len > `2` && buf[len - `1`] == b'?' {
188	if len > `5` && &buf[`1`..`4`] == b"xml" && is_whitespace(buf[`4`]) {
189	let event = BytesDecl::from_start(BytesStart::wrap(&buf[`1`..len - `1`], `3`));
190
191	// Try getting encoding from the declaration event
192	#[cfg(feature = "encoding")]
193	if self.encoding.can_be_refined() {
194	if let Some(encoding) = event.encoder() {
195	self.encoding = EncodingRef::XmlDetected(encoding);
196	}
197	}
198
199	Ok(Event::Decl(event))
200	} else {
201	Ok(Event::PI(BytesText::wrap(&buf[`1`..len - `1`], self.decoder())))
202	}
203	} else {
204	self.offset -= len;
205	Err(Error::UnexpectedEof("XmlDecl".to_string()))
206	}
207	}
208
209	/// Converts content of a tag to a `Start` or an `Empty` event
210	///
211	/// # Parameters
212	/// - `content`: Content of a tag between `<` and `>`
213	pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
214	let len = content.len();
215	let name_end = content
216	.iter()
217	.position(\|&b\| is_whitespace(b))
218	.unwrap_or(len);
219	if let Some(&b'/') = content.last() {
220	// This is self-closed tag `<something/>`
221	let name_len = if name_end < len { name_end } else { len - `1` };
222	let event = BytesStart::wrap(&content[..len - `1`], name_len);
223
224	if self.expand_empty_elements {
225	self.state = ParseState::Empty;
226	self.opened_starts.push(self.opened_buffer.len());
227	self.opened_buffer.extend(&content[..name_len]);
228	Ok(Event::Start(event))
229	} else {
230	Ok(Event::Empty(event))
231	}
232	} else {
233	// #514: Always store names event when .check_end_names == false,
234	// because checks can be temporary disabled and when they would be
235	// enabled, we should have that information
236	self.opened_starts.push(self.opened_buffer.len());
237	self.opened_buffer.extend(&content[..name_end]);
238	Ok(Event::Start(BytesStart::wrap(content, name_end)))
239	}
240	}
241
242	#[inline]
243	pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
244	self.state = ParseState::ClosedTag;
245	let name = self
246	.opened_buffer
247	.split_off(self.opened_starts.pop().unwrap());
248	Ok(Event::End(BytesEnd::wrap(name.into())))
249	}
250
251	/// Get the decoder, used to decode bytes, read by this reader, to the strings.
252	///
253	/// If `encoding` feature is enabled, the used encoding may change after
254	/// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
255	///
256	/// If `encoding` feature is enabled and no encoding is specified in declaration,
257	/// defaults to UTF-8.
258	pub fn decoder(&self) -> Decoder {
259	Decoder {
260	#[cfg(feature = "encoding")]
261	encoding: self.encoding.encoding(),
262	}
263	}
264	}
265
266	impl Default for Parser {
267	fn default() -> Self {
268	Self {
269	offset: `0`,
270	state: ParseState::Init,
271	expand_empty_elements: `false`,
272	trim_text_start: `false`,
273	trim_text_end: `false`,
274	trim_markup_names_in_closing_tags: `true`,
275	check_end_names: `true`,
276	check_comments: `false`,
277	opened_buffer: Vec::new(),
278	opened_starts: Vec::new(),
279
280	#[cfg(feature = "encoding")]
281	encoding: EncodingRef::Implicit(UTF_8),
282	}
283	}
284	}
285