1 | #[cfg (feature = "encoding" )]
|
2 | use encoding_rs::UTF_8;
|
3 |
|
4 | use crate::encoding::Decoder;
|
5 | use crate::errors::{Error, Result};
|
6 | use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
|
7 | #[cfg (feature = "encoding" )]
|
8 | use crate::reader::EncodingRef;
|
9 | use crate::reader::{is_whitespace, BangType, ParseState};
|
10 |
|
11 | use memchr;
|
12 |
|
13 | /// A struct that holds a current reader state and a parser configuration.
|
14 | /// It is independent on a way of reading data: the reader feed data into it and
|
15 | /// get back produced [`Event`]s.
|
16 | #[derive (Clone)]
|
17 | pub(super) struct ReaderState {
|
18 | /// Number of bytes read from the source of data since the reader was created
|
19 | pub offset: usize,
|
20 | /// Defines how to process next byte
|
21 | pub state: ParseState,
|
22 | /// Expand empty element into an opening and closing element
|
23 | pub expand_empty_elements: bool,
|
24 | /// Trims leading whitespace in Text events, skip the element if text is empty
|
25 | pub trim_text_start: bool,
|
26 | /// Trims trailing whitespace in Text events.
|
27 | pub trim_text_end: bool,
|
28 | /// Trims trailing whitespaces from markup names in closing tags `</a >`
|
29 | pub trim_markup_names_in_closing_tags: bool,
|
30 | /// Check if [`Event::End`] nodes match last [`Event::Start`] node
|
31 | pub check_end_names: bool,
|
32 | /// Check if comments contains `--` (false per default)
|
33 | pub check_comments: bool,
|
34 | /// All currently Started elements which didn't have a matching
|
35 | /// End element yet.
|
36 | ///
|
37 | /// For an XML
|
38 | ///
|
39 | /// ```xml
|
40 | /// <root><one/><inner attr="value">|<tag></inner></root>
|
41 | /// ```
|
42 | /// when cursor at the `|` position buffer contains:
|
43 | ///
|
44 | /// ```text
|
45 | /// rootinner
|
46 | /// ^ ^
|
47 | /// ```
|
48 | ///
|
49 | /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
|
50 | /// (0 and 4 in that case).
|
51 | opened_buffer: Vec<u8>,
|
52 | /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
|
53 | /// for that field for details
|
54 | opened_starts: Vec<usize>,
|
55 |
|
56 | #[cfg (feature = "encoding" )]
|
57 | /// Reference to the encoding used to read an XML
|
58 | pub encoding: EncodingRef,
|
59 | }
|
60 |
|
61 | impl ReaderState {
|
62 | /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
|
63 | ///
|
64 | /// # Parameters
|
65 | /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
|
66 | ///
|
67 | /// [`Text`]: Event::Text
|
68 | pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
|
69 | let mut content = bytes;
|
70 |
|
71 | if self.trim_text_end {
|
72 | // Skip the ending '<'
|
73 | let len = bytes
|
74 | .iter()
|
75 | .rposition(|&b| !is_whitespace(b))
|
76 | .map_or_else(|| bytes.len(), |p| p + 1);
|
77 | content = &bytes[..len];
|
78 | }
|
79 |
|
80 | Ok(Event::Text(BytesText::wrap(content, self.decoder())))
|
81 | }
|
82 |
|
83 | /// reads `BytesElement` starting with a `!`,
|
84 | /// return `Comment`, `CData` or `DocType` event
|
85 | pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
|
86 | let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
|
87 | string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
|
88 | };
|
89 |
|
90 | let len = buf.len();
|
91 | match bang_type {
|
92 | BangType::Comment if buf.starts_with(b"!--" ) => {
|
93 | debug_assert!(buf.ends_with(b"--" ));
|
94 | if self.check_comments {
|
95 | // search if '--' not in comments
|
96 | if let Some(p) = memchr::memchr_iter(b'-' , &buf[3..len - 2])
|
97 | .position(|p| buf[3 + p + 1] == b'-' )
|
98 | {
|
99 | self.offset += len - p;
|
100 | return Err(Error::UnexpectedToken("--" .to_string()));
|
101 | }
|
102 | }
|
103 | Ok(Event::Comment(BytesText::wrap(
|
104 | &buf[3..len - 2],
|
105 | self.decoder(),
|
106 | )))
|
107 | }
|
108 | BangType::CData if uncased_starts_with(buf, b"![CDATA[" ) => {
|
109 | debug_assert!(buf.ends_with(b"]]" ));
|
110 | Ok(Event::CData(BytesCData::wrap(
|
111 | &buf[8..len - 2],
|
112 | self.decoder(),
|
113 | )))
|
114 | }
|
115 | BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE" ) => {
|
116 | let start = buf[8..]
|
117 | .iter()
|
118 | .position(|b| !is_whitespace(*b))
|
119 | .unwrap_or(len - 8);
|
120 | if start + 8 >= len {
|
121 | return Err(Error::EmptyDocType);
|
122 | }
|
123 | Ok(Event::DocType(BytesText::wrap(
|
124 | &buf[8 + start..],
|
125 | self.decoder(),
|
126 | )))
|
127 | }
|
128 | _ => Err(bang_type.to_err()),
|
129 | }
|
130 | }
|
131 |
|
132 | /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
|
133 | /// end name matches the last opened start name if `self.check_end_names` is set.
|
134 | pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
135 | // Strip the `/` character. `content` contains data between `</` and `>`
|
136 | let content = &buf[1..];
|
137 | // XML standard permits whitespaces after the markup name in closing tags.
|
138 | // Let's strip them from the buffer before comparing tag names.
|
139 | let name = if self.trim_markup_names_in_closing_tags {
|
140 | if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
|
141 | &content[..pos_end_name + 1]
|
142 | } else {
|
143 | content
|
144 | }
|
145 | } else {
|
146 | content
|
147 | };
|
148 |
|
149 | let decoder = self.decoder();
|
150 | let mismatch_err = |expected: String, found: &[u8], offset: &mut usize| {
|
151 | *offset -= buf.len();
|
152 | Err(Error::EndEventMismatch {
|
153 | expected,
|
154 | found: decoder.decode(found).unwrap_or_default().into_owned(),
|
155 | })
|
156 | };
|
157 |
|
158 | // Get the index in self.opened_buffer of the name of the last opened tag
|
159 | match self.opened_starts.pop() {
|
160 | Some(start) => {
|
161 | if self.check_end_names {
|
162 | let expected = &self.opened_buffer[start..];
|
163 | if name != expected {
|
164 | let expected = decoder.decode(expected).unwrap_or_default().into_owned();
|
165 | // #513: In order to allow error recovery we should drop content of the buffer
|
166 | self.opened_buffer.truncate(start);
|
167 |
|
168 | return mismatch_err(expected, name, &mut self.offset);
|
169 | }
|
170 | }
|
171 |
|
172 | self.opened_buffer.truncate(start);
|
173 | }
|
174 | None => {
|
175 | if self.check_end_names {
|
176 | return mismatch_err("" .to_string(), &buf[1..], &mut self.offset);
|
177 | }
|
178 | }
|
179 | }
|
180 |
|
181 | Ok(Event::End(BytesEnd::wrap(name.into())))
|
182 | }
|
183 |
|
184 | /// reads `BytesElement` starting with a `?`,
|
185 | /// return `Decl` or `PI` event
|
186 | pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
187 | let len = buf.len();
|
188 | if len > 2 && buf[len - 1] == b'?' {
|
189 | if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
|
190 | let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
|
191 |
|
192 | // Try getting encoding from the declaration event
|
193 | #[cfg (feature = "encoding" )]
|
194 | if self.encoding.can_be_refined() {
|
195 | if let Some(encoding) = event.encoder() {
|
196 | self.encoding = EncodingRef::XmlDetected(encoding);
|
197 | }
|
198 | }
|
199 |
|
200 | Ok(Event::Decl(event))
|
201 | } else {
|
202 | Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder())))
|
203 | }
|
204 | } else {
|
205 | self.offset -= len;
|
206 | Err(Error::UnexpectedEof("XmlDecl" .to_string()))
|
207 | }
|
208 | }
|
209 |
|
210 | /// Converts content of a tag to a `Start` or an `Empty` event
|
211 | ///
|
212 | /// # Parameters
|
213 | /// - `content`: Content of a tag between `<` and `>`
|
214 | pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
|
215 | let len = content.len();
|
216 | let name_end = content
|
217 | .iter()
|
218 | .position(|&b| is_whitespace(b))
|
219 | .unwrap_or(len);
|
220 | if let Some(&b'/' ) = content.last() {
|
221 | // This is self-closed tag `<something/>`
|
222 | let name_len = if name_end < len { name_end } else { len - 1 };
|
223 | let event = BytesStart::wrap(&content[..len - 1], name_len);
|
224 |
|
225 | if self.expand_empty_elements {
|
226 | self.state = ParseState::Empty;
|
227 | self.opened_starts.push(self.opened_buffer.len());
|
228 | self.opened_buffer.extend(&content[..name_len]);
|
229 | Ok(Event::Start(event))
|
230 | } else {
|
231 | Ok(Event::Empty(event))
|
232 | }
|
233 | } else {
|
234 | // #514: Always store names event when .check_end_names == false,
|
235 | // because checks can be temporary disabled and when they would be
|
236 | // enabled, we should have that information
|
237 | self.opened_starts.push(self.opened_buffer.len());
|
238 | self.opened_buffer.extend(&content[..name_end]);
|
239 | Ok(Event::Start(BytesStart::wrap(content, name_end)))
|
240 | }
|
241 | }
|
242 |
|
243 | #[inline ]
|
244 | pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
|
245 | self.state = ParseState::ClosedTag;
|
246 | let name = self
|
247 | .opened_buffer
|
248 | .split_off(self.opened_starts.pop().unwrap());
|
249 | Ok(Event::End(BytesEnd::wrap(name.into())))
|
250 | }
|
251 |
|
252 | /// Get the decoder, used to decode bytes, read by this reader, to the strings.
|
253 | ///
|
254 | /// If [`encoding`] feature is enabled, the used encoding may change after
|
255 | /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
|
256 | ///
|
257 | /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
|
258 | /// defaults to UTF-8.
|
259 | ///
|
260 | /// [`encoding`]: ../../index.html#encoding
|
261 | pub fn decoder(&self) -> Decoder {
|
262 | Decoder {
|
263 | #[cfg (feature = "encoding" )]
|
264 | encoding: self.encoding.encoding(),
|
265 | }
|
266 | }
|
267 | }
|
268 |
|
269 | impl Default for ReaderState {
|
270 | fn default() -> Self {
|
271 | Self {
|
272 | offset: 0,
|
273 | state: ParseState::Init,
|
274 | expand_empty_elements: false,
|
275 | trim_text_start: false,
|
276 | trim_text_end: false,
|
277 | trim_markup_names_in_closing_tags: true,
|
278 | check_end_names: true,
|
279 | check_comments: false,
|
280 | opened_buffer: Vec::new(),
|
281 | opened_starts: Vec::new(),
|
282 |
|
283 | #[cfg (feature = "encoding" )]
|
284 | encoding: EncodingRef::Implicit(UTF_8),
|
285 | }
|
286 | }
|
287 | }
|
288 | |