1#[cfg(feature = "encoding")]
2use encoding_rs::UTF_8;
3
4use crate::encoding::Decoder;
5use crate::errors::{Error, Result};
6use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9use crate::reader::{is_whitespace, BangType, ParseState};
10
11use memchr;
12
13/// A struct that holds a current parse state and a parser configuration.
14/// It is independent on a way of reading data: the reader feed data into it and
15/// get back produced [`Event`]s.
16#[derive(Clone)]
17pub(super) struct Parser {
18 /// Number of bytes read from the source of data since the parser was created
19 pub offset: usize,
20 /// Defines how to process next byte
21 pub state: ParseState,
22 /// Expand empty element into an opening and closing element
23 pub expand_empty_elements: bool,
24 /// Trims leading whitespace in Text events, skip the element if text is empty
25 pub trim_text_start: bool,
26 /// Trims trailing whitespace in Text events.
27 pub trim_text_end: bool,
28 /// Trims trailing whitespaces from markup names in closing tags `</a >`
29 pub trim_markup_names_in_closing_tags: bool,
30 /// Check if [`Event::End`] nodes match last [`Event::Start`] node
31 pub check_end_names: bool,
32 /// Check if comments contains `--` (false per default)
33 pub check_comments: bool,
34 /// All currently Started elements which didn't have a matching
35 /// End element yet.
36 ///
37 /// For an XML
38 ///
39 /// ```xml
40 /// <root><one/><inner attr="value">|<tag></inner></root>
41 /// ```
42 /// when cursor at the `|` position buffer contains:
43 ///
44 /// ```text
45 /// rootinner
46 /// ^ ^
47 /// ```
48 ///
49 /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
50 /// (0 and 4 in that case).
51 opened_buffer: Vec<u8>,
52 /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
53 /// for that field for details
54 opened_starts: Vec<usize>,
55
56 #[cfg(feature = "encoding")]
57 /// Reference to the encoding used to read an XML
58 pub encoding: EncodingRef,
59}
60
61impl Parser {
62 /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
63 ///
64 /// # Parameters
65 /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
66 ///
67 /// [`Text`]: Event::Text
68 pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
69 let mut content = bytes;
70
71 if self.trim_text_end {
72 // Skip the ending '<'
73 let len = bytes
74 .iter()
75 .rposition(|&b| !is_whitespace(b))
76 .map_or_else(|| bytes.len(), |p| p + 1);
77 content = &bytes[..len];
78 }
79
80 Ok(Event::Text(BytesText::wrap(content, self.decoder())))
81 }
82
83 /// reads `BytesElement` starting with a `!`,
84 /// return `Comment`, `CData` or `DocType` event
85 pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
86 let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
87 string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
88 };
89
90 let len = buf.len();
91 match bang_type {
92 BangType::Comment if buf.starts_with(b"!--") => {
93 debug_assert!(buf.ends_with(b"--"));
94 if self.check_comments {
95 // search if '--' not in comments
96 if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2])
97 .position(|p| buf[3 + p + 1] == b'-')
98 {
99 self.offset += len - p;
100 return Err(Error::UnexpectedToken("--".to_string()));
101 }
102 }
103 Ok(Event::Comment(BytesText::wrap(
104 &buf[3..len - 2],
105 self.decoder(),
106 )))
107 }
108 BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
109 debug_assert!(buf.ends_with(b"]]"));
110 Ok(Event::CData(BytesCData::wrap(
111 &buf[8..len - 2],
112 self.decoder(),
113 )))
114 }
115 BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
116 let start = buf[8..]
117 .iter()
118 .position(|b| !is_whitespace(*b))
119 .unwrap_or(len - 8);
120 if start + 8 >= len {
121 return Err(Error::EmptyDocType);
122 }
123 Ok(Event::DocType(BytesText::wrap(
124 &buf[8 + start..],
125 self.decoder(),
126 )))
127 }
128 _ => Err(bang_type.to_err()),
129 }
130 }
131
132 /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
133 /// end name matches the last opened start name if `self.check_end_names` is set.
134 pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
135 // XML standard permits whitespaces after the markup name in closing tags.
136 // Let's strip them from the buffer before comparing tag names.
137 let name = if self.trim_markup_names_in_closing_tags {
138 if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) {
139 let (name, _) = buf[1..].split_at(pos_end_name + 1);
140 name
141 } else {
142 &buf[1..]
143 }
144 } else {
145 &buf[1..]
146 };
147
148 let decoder = self.decoder();
149 let mismatch_err = |expected: String, found: &[u8], offset: &mut usize| {
150 *offset -= buf.len();
151 Err(Error::EndEventMismatch {
152 expected,
153 found: decoder.decode(found).unwrap_or_default().into_owned(),
154 })
155 };
156
157 // Get the index in self.opened_buffer of the name of the last opened tag
158 match self.opened_starts.pop() {
159 Some(start) => {
160 if self.check_end_names {
161 let expected = &self.opened_buffer[start..];
162 if name != expected {
163 let expected = decoder.decode(expected).unwrap_or_default().into_owned();
164 // #513: In order to allow error recovery we should drop content of the buffer
165 self.opened_buffer.truncate(start);
166
167 return mismatch_err(expected, name, &mut self.offset);
168 }
169 }
170
171 self.opened_buffer.truncate(start);
172 }
173 None => {
174 if self.check_end_names {
175 return mismatch_err("".to_string(), &buf[1..], &mut self.offset);
176 }
177 }
178 }
179
180 Ok(Event::End(BytesEnd::wrap(name.into())))
181 }
182
183 /// reads `BytesElement` starting with a `?`,
184 /// return `Decl` or `PI` event
185 pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
186 let len = buf.len();
187 if len > 2 && buf[len - 1] == b'?' {
188 if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
189 let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
190
191 // Try getting encoding from the declaration event
192 #[cfg(feature = "encoding")]
193 if self.encoding.can_be_refined() {
194 if let Some(encoding) = event.encoder() {
195 self.encoding = EncodingRef::XmlDetected(encoding);
196 }
197 }
198
199 Ok(Event::Decl(event))
200 } else {
201 Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder())))
202 }
203 } else {
204 self.offset -= len;
205 Err(Error::UnexpectedEof("XmlDecl".to_string()))
206 }
207 }
208
209 /// Converts content of a tag to a `Start` or an `Empty` event
210 ///
211 /// # Parameters
212 /// - `content`: Content of a tag between `<` and `>`
213 pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
214 let len = content.len();
215 let name_end = content
216 .iter()
217 .position(|&b| is_whitespace(b))
218 .unwrap_or(len);
219 if let Some(&b'/') = content.last() {
220 // This is self-closed tag `<something/>`
221 let name_len = if name_end < len { name_end } else { len - 1 };
222 let event = BytesStart::wrap(&content[..len - 1], name_len);
223
224 if self.expand_empty_elements {
225 self.state = ParseState::Empty;
226 self.opened_starts.push(self.opened_buffer.len());
227 self.opened_buffer.extend(&content[..name_len]);
228 Ok(Event::Start(event))
229 } else {
230 Ok(Event::Empty(event))
231 }
232 } else {
233 // #514: Always store names event when .check_end_names == false,
234 // because checks can be temporary disabled and when they would be
235 // enabled, we should have that information
236 self.opened_starts.push(self.opened_buffer.len());
237 self.opened_buffer.extend(&content[..name_end]);
238 Ok(Event::Start(BytesStart::wrap(content, name_end)))
239 }
240 }
241
242 #[inline]
243 pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
244 self.state = ParseState::ClosedTag;
245 let name = self
246 .opened_buffer
247 .split_off(self.opened_starts.pop().unwrap());
248 Ok(Event::End(BytesEnd::wrap(name.into())))
249 }
250
251 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
252 ///
253 /// If `encoding` feature is enabled, the used encoding may change after
254 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
255 ///
256 /// If `encoding` feature is enabled and no encoding is specified in declaration,
257 /// defaults to UTF-8.
258 pub fn decoder(&self) -> Decoder {
259 Decoder {
260 #[cfg(feature = "encoding")]
261 encoding: self.encoding.encoding(),
262 }
263 }
264}
265
266impl Default for Parser {
267 fn default() -> Self {
268 Self {
269 offset: 0,
270 state: ParseState::Init,
271 expand_empty_elements: false,
272 trim_text_start: false,
273 trim_text_end: false,
274 trim_markup_names_in_closing_tags: true,
275 check_end_names: true,
276 check_comments: false,
277 opened_buffer: Vec::new(),
278 opened_starts: Vec::new(),
279
280 #[cfg(feature = "encoding")]
281 encoding: EncodingRef::Implicit(UTF_8),
282 }
283 }
284}
285