1 | #[cfg (feature = "encoding" )]
|
2 | use encoding_rs::UTF_8;
|
3 |
|
4 | use crate::encoding::Decoder;
|
5 | use crate::errors::{Error, IllFormedError, Result, SyntaxError};
|
6 | use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
|
7 | #[cfg (feature = "encoding" )]
|
8 | use crate::reader::EncodingRef;
|
9 | use crate::reader::{BangType, Config, ParseState};
|
10 | use crate::utils::{is_whitespace, name_len};
|
11 |
|
/// A struct that holds the current reader state and the parser configuration.
/// It does not depend on the way the data is read: the reader feeds data into
/// it and gets back the produced [`Event`]s.
#[derive(Clone, Debug)]
pub(super) struct ReaderState {
    /// Number of bytes read from the source of data since the reader was created
    pub offset: u64,
    /// A snapshot of the `offset` of the last error returned. It can be less than
    /// `offset`, because some errors are more conveniently reported at an earlier
    /// position, and changing `offset` itself is not possible, because
    /// `Error::IllFormed` errors are recoverable.
    pub last_error_offset: u64,
    /// Defines how to process the next byte
    pub state: ParseState,
    /// User-defined settings that affect parsing
    pub config: Config,
    /// Names of all currently started elements which do not have a matching
    /// end element yet, stored back to back without separators.
    ///
    /// For an XML
    ///
    /// ```xml
    /// <root><one/><inner attr="value">|<tag></inner></root>
    /// ```
    /// when the cursor is at the `|` position, the buffer contains:
    ///
    /// ```text
    /// rootinner
    /// ^   ^
    /// ```
    ///
    /// The `^` symbols show which positions are stored in [`Self::opened_starts`]
    /// (0 and 4 in that case).
    opened_buffer: Vec<u8>,
    /// Start indexes of the opened names inside [`Self::opened_buffer`]. See the
    /// documentation of that field for details
    opened_starts: Vec<usize>,

    #[cfg(feature = "encoding")]
    /// Reference to the encoding used to read an XML
    pub encoding: EncodingRef,
}
|
54 |
|
55 | impl ReaderState {
|
56 | /// Trims end whitespaces from `bytes`, if required, and returns a text event.
|
57 | ///
|
58 | /// # Parameters
|
59 | /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
|
60 | pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
|
61 | let mut content = bytes;
|
62 |
|
63 | if self.config.trim_text_end {
|
64 | // Skip the ending '<'
|
65 | let len = bytes
|
66 | .iter()
|
67 | .rposition(|&b| !is_whitespace(b))
|
68 | .map_or(0, |p| p + 1);
|
69 | content = &bytes[..len];
|
70 | }
|
71 | BytesText::wrap(content, self.decoder())
|
72 | }
|
73 |
|
74 | /// Returns `Comment`, `CData` or `DocType` event.
|
75 | ///
|
76 | /// `buf` contains data between `<` and `>`:
|
77 | /// - CDATA: `![CDATA[...]]`
|
78 | /// - Comment: `!--...--`
|
79 | /// - Doctype (uppercase): `!D...`
|
80 | /// - Doctype (lowercase): `!d...`
|
81 | pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
|
82 | debug_assert_eq!(
|
83 | buf.first(),
|
84 | Some(&b'!' ),
|
85 | "CDATA, comment or DOCTYPE should start from '!'"
|
86 | );
|
87 |
|
88 | let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
|
89 | string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
|
90 | };
|
91 |
|
92 | let len = buf.len();
|
93 | match bang_type {
|
94 | BangType::Comment if buf.starts_with(b"!--" ) => {
|
95 | debug_assert!(buf.ends_with(b"--" ));
|
96 | if self.config.check_comments {
|
97 | // search if '--' not in comments
|
98 | let mut haystack = &buf[3..len - 2];
|
99 | let mut off = 0;
|
100 | while let Some(p) = memchr::memchr(b'-' , haystack) {
|
101 | off += p + 1;
|
102 | // if next byte after `-` is also `-`, return an error
|
103 | if buf[3 + off] == b'-' {
|
104 | // Explanation of the magic:
|
105 | //
|
106 | // - `self.offset`` just after `>`,
|
107 | // - `buf` contains `!-- con--tent --`
|
108 | // - `p` is counted from byte after `<!--`
|
109 | //
|
110 | // <!-- con--tent -->:
|
111 | // ~~~~~~~~~~~~~~~~ : - buf
|
112 | // : =========== : - zone of search (possible values of `p`)
|
113 | // : |---p : - p is counted from | (| is 0)
|
114 | // : : : ^ - self.offset
|
115 | // ^ : : - self.offset - len
|
116 | // ^ : - self.offset - len + 2
|
117 | // ^ - self.offset - len + 2 + p
|
118 | self.last_error_offset = self.offset - len as u64 + 2 + p as u64;
|
119 | return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
|
120 | }
|
121 | // Continue search after single `-` (+1 to skip it)
|
122 | haystack = &haystack[p + 1..];
|
123 | }
|
124 | }
|
125 | Ok(Event::Comment(BytesText::wrap(
|
126 | // Cut of `!--` and `--` from start and end
|
127 | &buf[3..len - 2],
|
128 | self.decoder(),
|
129 | )))
|
130 | }
|
131 | // XML requires uppercase only:
|
132 | // https://www.w3.org/TR/xml11/#sec-cdata-sect
|
133 | // Even HTML5 required uppercase only:
|
134 | // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
135 | BangType::CData if buf.starts_with(b"![CDATA[" ) => {
|
136 | debug_assert!(buf.ends_with(b"]]" ));
|
137 | Ok(Event::CData(BytesCData::wrap(
|
138 | // Cut of `![CDATA[` and `]]` from start and end
|
139 | &buf[8..len - 2],
|
140 | self.decoder(),
|
141 | )))
|
142 | }
|
143 | // XML requires uppercase only, but we will check that on validation stage:
|
144 | // https://www.w3.org/TR/xml11/#sec-prolog-dtd
|
145 | // HTML5 allows mixed case for doctype declarations:
|
146 | // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
147 | BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE" ) => {
|
148 | match buf[8..].iter().position(|&b| !is_whitespace(b)) {
|
149 | Some(start) => Ok(Event::DocType(BytesText::wrap(
|
150 | // Cut of `!DOCTYPE` and any number of spaces from start
|
151 | &buf[8 + start..],
|
152 | self.decoder(),
|
153 | ))),
|
154 | None => {
|
155 | // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
|
156 | // We want report error at place where name is expected - this is just
|
157 | // before `>`
|
158 | self.last_error_offset = self.offset - 1;
|
159 | return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));
|
160 | }
|
161 | }
|
162 | }
|
163 | _ => {
|
164 | // <!....>
|
165 | // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
|
166 | // ^------- We report error at that position, so we need to subtract 2 and buf len
|
167 | self.last_error_offset = self.offset - len as u64 - 2;
|
168 | Err(bang_type.to_err().into())
|
169 | }
|
170 | }
|
171 | }
|
172 |
|
173 | /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
|
174 | /// end name matches the last opened start name if `self.config.check_end_names` is set.
|
175 | ///
|
176 | /// `buf` contains data between `<` and `>`, for example `/tag`.
|
177 | pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
178 | debug_assert_eq!(
|
179 | buf.first(),
|
180 | Some(&b'/' ),
|
181 | "closing tag should start from '/'"
|
182 | );
|
183 |
|
184 | // Strip the `/` character. `content` contains data between `</` and `>`
|
185 | let content = &buf[1..];
|
186 | // XML standard permits whitespaces after the markup name in closing tags.
|
187 | // Let's strip them from the buffer before comparing tag names.
|
188 | let name = if self.config.trim_markup_names_in_closing_tags {
|
189 | if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
|
190 | &content[..pos_end_name + 1]
|
191 | } else {
|
192 | content
|
193 | }
|
194 | } else {
|
195 | content
|
196 | };
|
197 |
|
198 | let decoder = self.decoder();
|
199 |
|
200 | // Get the index in self.opened_buffer of the name of the last opened tag
|
201 | match self.opened_starts.pop() {
|
202 | Some(start) => {
|
203 | if self.config.check_end_names {
|
204 | let expected = &self.opened_buffer[start..];
|
205 | if name != expected {
|
206 | let expected = decoder.decode(expected).unwrap_or_default().into_owned();
|
207 | // #513: In order to allow error recovery we should drop content of the buffer
|
208 | self.opened_buffer.truncate(start);
|
209 |
|
210 | // Report error at start of the end tag at `<` character
|
211 | // -2 for `<` and `>`
|
212 | self.last_error_offset = self.offset - buf.len() as u64 - 2;
|
213 | return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
|
214 | expected,
|
215 | found: decoder.decode(name).unwrap_or_default().into_owned(),
|
216 | }));
|
217 | }
|
218 | }
|
219 |
|
220 | self.opened_buffer.truncate(start);
|
221 | }
|
222 | None => {
|
223 | if !self.config.allow_unmatched_ends {
|
224 | // Report error at start of the end tag at `<` character
|
225 | // -2 for `<` and `>`
|
226 | self.last_error_offset = self.offset - buf.len() as u64 - 2;
|
227 | return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
|
228 | decoder.decode(name).unwrap_or_default().into_owned(),
|
229 | )));
|
230 | }
|
231 | }
|
232 | }
|
233 |
|
234 | Ok(Event::End(BytesEnd::wrap(name.into())))
|
235 | }
|
236 |
|
237 | /// `buf` contains data between `<` and `>` and the first byte is `?`.
|
238 | /// `self.offset` already after the `>`
|
239 | ///
|
240 | /// Returns `Decl` or `PI` event
|
241 | pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
242 | debug_assert!(buf.len() > 0);
|
243 | debug_assert_eq!(buf[0], b'?' );
|
244 |
|
245 | let len = buf.len();
|
246 | // We accept at least <??>
|
247 | // ~~ - len = 2
|
248 | if len > 1 && buf[len - 1] == b'?' {
|
249 | // Cut of `?` and `?` from start and end
|
250 | let content = &buf[1..len - 1];
|
251 | let len = content.len();
|
252 |
|
253 | if content.starts_with(b"xml" ) && (len == 3 || is_whitespace(content[3])) {
|
254 | let event = BytesDecl::from_start(BytesStart::wrap(content, 3));
|
255 |
|
256 | // Try getting encoding from the declaration event
|
257 | #[cfg (feature = "encoding" )]
|
258 | if self.encoding.can_be_refined() {
|
259 | if let Some(encoding) = event.encoder() {
|
260 | self.encoding = EncodingRef::XmlDetected(encoding);
|
261 | }
|
262 | }
|
263 |
|
264 | Ok(Event::Decl(event))
|
265 | } else {
|
266 | Ok(Event::PI(BytesPI::wrap(content, name_len(content))))
|
267 | }
|
268 | } else {
|
269 | // <?....EOF
|
270 | // ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,
|
271 | // so we move offset to it (-2 for `<` and `>`)
|
272 | self.last_error_offset = self.offset - len as u64 - 2;
|
273 | Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
|
274 | }
|
275 | }
|
276 |
|
277 | /// Converts content of a tag to a `Start` or an `Empty` event
|
278 | ///
|
279 | /// # Parameters
|
280 | /// - `content`: Content of a tag between `<` and `>`
|
281 | pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
|
282 | if let Some(content) = content.strip_suffix(b"/" ) {
|
283 | // This is self-closed tag `<something/>`
|
284 | let event = BytesStart::wrap(content, name_len(content));
|
285 |
|
286 | if self.config.expand_empty_elements {
|
287 | self.state = ParseState::InsideEmpty;
|
288 | self.opened_starts.push(self.opened_buffer.len());
|
289 | self.opened_buffer.extend(event.name().as_ref());
|
290 | Event::Start(event)
|
291 | } else {
|
292 | Event::Empty(event)
|
293 | }
|
294 | } else {
|
295 | let event = BytesStart::wrap(content, name_len(content));
|
296 |
|
297 | // #514: Always store names event when .check_end_names == false,
|
298 | // because checks can be temporary disabled and when they would be
|
299 | // enabled, we should have that information
|
300 | self.opened_starts.push(self.opened_buffer.len());
|
301 | self.opened_buffer.extend(event.name().as_ref());
|
302 | Event::Start(event)
|
303 | }
|
304 | }
|
305 |
|
306 | #[inline ]
|
307 | pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
|
308 | self.state = ParseState::InsideText;
|
309 | let name = self
|
310 | .opened_buffer
|
311 | .split_off(self.opened_starts.pop().unwrap());
|
312 | BytesEnd::wrap(name.into())
|
313 | }
|
314 |
|
/// Returns the decoder used to decode the bytes read by this reader into strings.
///
/// If the [`encoding`] feature is enabled, the decoder is built from the current
/// `self.encoding`, so the encoding used may change after parsing the XML
/// declaration; otherwise the encoding is fixed to UTF-8.
///
/// If the [`encoding`] feature is enabled and no encoding is specified in the
/// declaration, it defaults to UTF-8.
///
/// [`encoding`]: ../../index.html#encoding
pub const fn decoder(&self) -> Decoder {
    Decoder {
        #[cfg(feature = "encoding")]
        encoding: self.encoding.encoding(),
    }
}
|
330 | }
|
331 |
|
332 | impl Default for ReaderState {
|
333 | fn default() -> Self {
|
334 | Self {
|
335 | offset: 0,
|
336 | last_error_offset: 0,
|
337 | state: ParseState::Init,
|
338 | config: Config::default(),
|
339 | opened_buffer: Vec::new(),
|
340 | opened_starts: Vec::new(),
|
341 |
|
342 | #[cfg (feature = "encoding" )]
|
343 | encoding: EncodingRef::Implicit(UTF_8),
|
344 | }
|
345 | }
|
346 | }
|
347 | |