| 1 | #[cfg (feature = "encoding" )]
|
| 2 | use encoding_rs::UTF_8;
|
| 3 |
|
| 4 | use crate::encoding::Decoder;
|
| 5 | use crate::errors::{Error, Result};
|
| 6 | use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
|
| 7 | #[cfg (feature = "encoding" )]
|
| 8 | use crate::reader::EncodingRef;
|
| 9 | use crate::reader::{is_whitespace, BangType, ParseState};
|
| 10 |
|
| 11 | use memchr;
|
| 12 |
|
/// A struct that holds the current parse state and the parser configuration.
/// It does not depend on the way the data is read: the reader feeds data into
/// it and gets back the produced [`Event`]s.
#[derive(Clone)]
pub(super) struct Parser {
    /// Number of bytes read from the source of data since the parser was created
    pub offset: usize,
    /// Defines how to process the next byte
    pub state: ParseState,
    /// Expand an empty element into an opening and a closing element
    pub expand_empty_elements: bool,
    /// Trims leading whitespace in Text events, skip the element if text is empty
    pub trim_text_start: bool,
    /// Trims trailing whitespace in Text events.
    pub trim_text_end: bool,
    /// Trims trailing whitespace from markup names in closing tags `</a >`
    pub trim_markup_names_in_closing_tags: bool,
    /// Check if [`Event::End`] nodes match the last [`Event::Start`] node
    pub check_end_names: bool,
    /// Check if comments contain `--` (false by default)
    pub check_comments: bool,
    /// Names of all currently started elements which do not have a matching
    /// End element yet, stored back to back without separators.
    ///
    /// For an XML
    ///
    /// ```xml
    /// <root><one/><inner attr="value">|<tag></inner></root>
    /// ```
    /// when the cursor is at the `|` position, the buffer contains:
    ///
    /// ```text
    /// rootinner
    /// ^   ^
    /// ```
    ///
    /// The `^` symbols show which positions are stored in the
    /// [`Self::opened_starts`] (0 and 4 in that case).
    opened_buffer: Vec<u8>,
    /// Opened name start indexes into [`Self::opened_buffer`]. See the
    /// documentation of that field for details
    opened_starts: Vec<usize>,

    #[cfg(feature = "encoding")]
    /// Reference to the encoding used to read an XML
    pub encoding: EncodingRef,
}
|
| 60 |
|
| 61 | impl Parser {
|
| 62 | /// Trims whitespaces from `bytes`, if required, and returns a [`Text`] event.
|
| 63 | ///
|
| 64 | /// # Parameters
|
| 65 | /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
|
| 66 | ///
|
| 67 | /// [`Text`]: Event::Text
|
| 68 | pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
|
| 69 | let mut content = bytes;
|
| 70 |
|
| 71 | if self.trim_text_end {
|
| 72 | // Skip the ending '<'
|
| 73 | let len = bytes
|
| 74 | .iter()
|
| 75 | .rposition(|&b| !is_whitespace(b))
|
| 76 | .map_or_else(|| bytes.len(), |p| p + 1);
|
| 77 | content = &bytes[..len];
|
| 78 | }
|
| 79 |
|
| 80 | Ok(Event::Text(BytesText::wrap(content, self.decoder())))
|
| 81 | }
|
| 82 |
|
| 83 | /// reads `BytesElement` starting with a `!`,
|
| 84 | /// return `Comment`, `CData` or `DocType` event
|
| 85 | pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
|
| 86 | let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
|
| 87 | string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
|
| 88 | };
|
| 89 |
|
| 90 | let len = buf.len();
|
| 91 | match bang_type {
|
| 92 | BangType::Comment if buf.starts_with(b"!--" ) => {
|
| 93 | debug_assert!(buf.ends_with(b"--" ));
|
| 94 | if self.check_comments {
|
| 95 | // search if '--' not in comments
|
| 96 | if let Some(p) = memchr::memchr_iter(b'-' , &buf[3..len - 2])
|
| 97 | .position(|p| buf[3 + p + 1] == b'-' )
|
| 98 | {
|
| 99 | self.offset += len - p;
|
| 100 | return Err(Error::UnexpectedToken("--" .to_string()));
|
| 101 | }
|
| 102 | }
|
| 103 | Ok(Event::Comment(BytesText::wrap(
|
| 104 | &buf[3..len - 2],
|
| 105 | self.decoder(),
|
| 106 | )))
|
| 107 | }
|
| 108 | BangType::CData if uncased_starts_with(buf, b"![CDATA[" ) => {
|
| 109 | debug_assert!(buf.ends_with(b"]]" ));
|
| 110 | Ok(Event::CData(BytesCData::wrap(
|
| 111 | &buf[8..len - 2],
|
| 112 | self.decoder(),
|
| 113 | )))
|
| 114 | }
|
| 115 | BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE" ) => {
|
| 116 | let start = buf[8..]
|
| 117 | .iter()
|
| 118 | .position(|b| !is_whitespace(*b))
|
| 119 | .unwrap_or(len - 8);
|
| 120 | if start + 8 >= len {
|
| 121 | return Err(Error::EmptyDocType);
|
| 122 | }
|
| 123 | Ok(Event::DocType(BytesText::wrap(
|
| 124 | &buf[8 + start..],
|
| 125 | self.decoder(),
|
| 126 | )))
|
| 127 | }
|
| 128 | _ => Err(bang_type.to_err()),
|
| 129 | }
|
| 130 | }
|
| 131 |
|
| 132 | /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
|
| 133 | /// end name matches the last opened start name if `self.check_end_names` is set.
|
| 134 | pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
| 135 | // XML standard permits whitespaces after the markup name in closing tags.
|
| 136 | // Let's strip them from the buffer before comparing tag names.
|
| 137 | let name = if self.trim_markup_names_in_closing_tags {
|
| 138 | if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) {
|
| 139 | let (name, _) = buf[1..].split_at(pos_end_name + 1);
|
| 140 | name
|
| 141 | } else {
|
| 142 | &buf[1..]
|
| 143 | }
|
| 144 | } else {
|
| 145 | &buf[1..]
|
| 146 | };
|
| 147 |
|
| 148 | let decoder = self.decoder();
|
| 149 | let mismatch_err = |expected: String, found: &[u8], offset: &mut usize| {
|
| 150 | *offset -= buf.len();
|
| 151 | Err(Error::EndEventMismatch {
|
| 152 | expected,
|
| 153 | found: decoder.decode(found).unwrap_or_default().into_owned(),
|
| 154 | })
|
| 155 | };
|
| 156 |
|
| 157 | // Get the index in self.opened_buffer of the name of the last opened tag
|
| 158 | match self.opened_starts.pop() {
|
| 159 | Some(start) => {
|
| 160 | if self.check_end_names {
|
| 161 | let expected = &self.opened_buffer[start..];
|
| 162 | if name != expected {
|
| 163 | let expected = decoder.decode(expected).unwrap_or_default().into_owned();
|
| 164 | // #513: In order to allow error recovery we should drop content of the buffer
|
| 165 | self.opened_buffer.truncate(start);
|
| 166 |
|
| 167 | return mismatch_err(expected, name, &mut self.offset);
|
| 168 | }
|
| 169 | }
|
| 170 |
|
| 171 | self.opened_buffer.truncate(start);
|
| 172 | }
|
| 173 | None => {
|
| 174 | if self.check_end_names {
|
| 175 | return mismatch_err("" .to_string(), &buf[1..], &mut self.offset);
|
| 176 | }
|
| 177 | }
|
| 178 | }
|
| 179 |
|
| 180 | Ok(Event::End(BytesEnd::wrap(name.into())))
|
| 181 | }
|
| 182 |
|
| 183 | /// reads `BytesElement` starting with a `?`,
|
| 184 | /// return `Decl` or `PI` event
|
| 185 | pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
| 186 | let len = buf.len();
|
| 187 | if len > 2 && buf[len - 1] == b'?' {
|
| 188 | if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) {
|
| 189 | let event = BytesDecl::from_start(BytesStart::wrap(&buf[1..len - 1], 3));
|
| 190 |
|
| 191 | // Try getting encoding from the declaration event
|
| 192 | #[cfg (feature = "encoding" )]
|
| 193 | if self.encoding.can_be_refined() {
|
| 194 | if let Some(encoding) = event.encoder() {
|
| 195 | self.encoding = EncodingRef::XmlDetected(encoding);
|
| 196 | }
|
| 197 | }
|
| 198 |
|
| 199 | Ok(Event::Decl(event))
|
| 200 | } else {
|
| 201 | Ok(Event::PI(BytesText::wrap(&buf[1..len - 1], self.decoder())))
|
| 202 | }
|
| 203 | } else {
|
| 204 | self.offset -= len;
|
| 205 | Err(Error::UnexpectedEof("XmlDecl" .to_string()))
|
| 206 | }
|
| 207 | }
|
| 208 |
|
| 209 | /// Converts content of a tag to a `Start` or an `Empty` event
|
| 210 | ///
|
| 211 | /// # Parameters
|
| 212 | /// - `content`: Content of a tag between `<` and `>`
|
| 213 | pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
|
| 214 | let len = content.len();
|
| 215 | let name_end = content
|
| 216 | .iter()
|
| 217 | .position(|&b| is_whitespace(b))
|
| 218 | .unwrap_or(len);
|
| 219 | if let Some(&b'/' ) = content.last() {
|
| 220 | // This is self-closed tag `<something/>`
|
| 221 | let name_len = if name_end < len { name_end } else { len - 1 };
|
| 222 | let event = BytesStart::wrap(&content[..len - 1], name_len);
|
| 223 |
|
| 224 | if self.expand_empty_elements {
|
| 225 | self.state = ParseState::Empty;
|
| 226 | self.opened_starts.push(self.opened_buffer.len());
|
| 227 | self.opened_buffer.extend(&content[..name_len]);
|
| 228 | Ok(Event::Start(event))
|
| 229 | } else {
|
| 230 | Ok(Event::Empty(event))
|
| 231 | }
|
| 232 | } else {
|
| 233 | // #514: Always store names event when .check_end_names == false,
|
| 234 | // because checks can be temporary disabled and when they would be
|
| 235 | // enabled, we should have that information
|
| 236 | self.opened_starts.push(self.opened_buffer.len());
|
| 237 | self.opened_buffer.extend(&content[..name_end]);
|
| 238 | Ok(Event::Start(BytesStart::wrap(content, name_end)))
|
| 239 | }
|
| 240 | }
|
| 241 |
|
| 242 | #[inline ]
|
| 243 | pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
|
| 244 | self.state = ParseState::ClosedTag;
|
| 245 | let name = self
|
| 246 | .opened_buffer
|
| 247 | .split_off(self.opened_starts.pop().unwrap());
|
| 248 | Ok(Event::End(BytesEnd::wrap(name.into())))
|
| 249 | }
|
| 250 |
|
/// Gets the decoder that is used to decode bytes read by this reader into
/// strings.
///
/// If the `encoding` feature is enabled, the used encoding may change after
/// parsing the XML declaration, otherwise the encoding is fixed to UTF-8.
///
/// If the `encoding` feature is enabled and no encoding is specified in the
/// declaration, defaults to UTF-8.
pub fn decoder(&self) -> Decoder {
    Decoder {
        // The field is set only when the `encoding` feature is enabled; its
        // value is taken from the encoding currently stored in the parser
        #[cfg(feature = "encoding")]
        encoding: self.encoding.encoding(),
    }
}
|
| 264 | }
|
| 265 |
|
| 266 | impl Default for Parser {
|
| 267 | fn default() -> Self {
|
| 268 | Self {
|
| 269 | offset: 0,
|
| 270 | state: ParseState::Init,
|
| 271 | expand_empty_elements: false,
|
| 272 | trim_text_start: false,
|
| 273 | trim_text_end: false,
|
| 274 | trim_markup_names_in_closing_tags: true,
|
| 275 | check_end_names: true,
|
| 276 | check_comments: false,
|
| 277 | opened_buffer: Vec::new(),
|
| 278 | opened_starts: Vec::new(),
|
| 279 |
|
| 280 | #[cfg (feature = "encoding" )]
|
| 281 | encoding: EncodingRef::Implicit(UTF_8),
|
| 282 | }
|
| 283 | }
|
| 284 | }
|
| 285 | |