| 1 | #[cfg (feature = "encoding" )]
|
| 2 | use encoding_rs::UTF_8;
|
| 3 |
|
| 4 | use crate::encoding::Decoder;
|
| 5 | use crate::errors::{Error, IllFormedError, Result, SyntaxError};
|
| 6 | use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
|
| 7 | #[cfg (feature = "encoding" )]
|
| 8 | use crate::reader::EncodingRef;
|
| 9 | use crate::reader::{BangType, Config, ParseState};
|
| 10 | use crate::utils::{is_whitespace, name_len};
|
| 11 |
|
| 12 | /// A struct that holds a current reader state and a parser configuration.
|
| 13 | /// It is independent on a way of reading data: the reader feed data into it and
|
| 14 | /// get back produced [`Event`]s.
|
| 15 | #[derive (Clone, Debug)]
|
| 16 | pub(super) struct ReaderState {
|
| 17 | /// Number of bytes read from the source of data since the reader was created
|
| 18 | pub offset: u64,
|
| 19 | /// A snapshot of an `offset` of the last error returned. It can be less than
|
| 20 | /// `offset`, because some errors conveniently report at earlier position,
|
| 21 | /// and changing `offset` is not possible, because `Error::IllFormed` errors
|
| 22 | /// are recoverable.
|
| 23 | pub last_error_offset: u64,
|
| 24 | /// Defines how to process next byte
|
| 25 | pub state: ParseState,
|
| 26 | /// User-defined settings that affect parsing
|
| 27 | pub config: Config,
|
| 28 | /// All currently Started elements which didn't have a matching
|
| 29 | /// End element yet.
|
| 30 | ///
|
| 31 | /// For an XML
|
| 32 | ///
|
| 33 | /// ```xml
|
| 34 | /// <root><one/><inner attr="value">|<tag></inner></root>
|
| 35 | /// ```
|
| 36 | /// when cursor at the `|` position buffer contains:
|
| 37 | ///
|
| 38 | /// ```text
|
| 39 | /// rootinner
|
| 40 | /// ^ ^
|
| 41 | /// ```
|
| 42 | ///
|
| 43 | /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
|
| 44 | /// (0 and 4 in that case).
|
| 45 | opened_buffer: Vec<u8>,
|
| 46 | /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
|
| 47 | /// for that field for details
|
| 48 | opened_starts: Vec<usize>,
|
| 49 |
|
| 50 | #[cfg (feature = "encoding" )]
|
| 51 | /// Reference to the encoding used to read an XML
|
| 52 | pub encoding: EncodingRef,
|
| 53 | }
|
| 54 |
|
| 55 | impl ReaderState {
|
| 56 | /// Trims end whitespaces from `bytes`, if required, and returns a text event.
|
| 57 | ///
|
| 58 | /// # Parameters
|
| 59 | /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
|
| 60 | pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> BytesText<'b> {
|
| 61 | let mut content = bytes;
|
| 62 |
|
| 63 | if self.config.trim_text_end {
|
| 64 | // Skip the ending '<'
|
| 65 | let len = bytes
|
| 66 | .iter()
|
| 67 | .rposition(|&b| !is_whitespace(b))
|
| 68 | .map_or(0, |p| p + 1);
|
| 69 | content = &bytes[..len];
|
| 70 | }
|
| 71 | BytesText::wrap(content, self.decoder())
|
| 72 | }
|
| 73 |
|
| 74 | /// Returns `Comment`, `CData` or `DocType` event.
|
| 75 | ///
|
| 76 | /// `buf` contains data between `<` and `>`:
|
| 77 | /// - CDATA: `![CDATA[...]]`
|
| 78 | /// - Comment: `!--...--`
|
| 79 | /// - Doctype (uppercase): `!D...`
|
| 80 | /// - Doctype (lowercase): `!d...`
|
| 81 | pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
|
| 82 | debug_assert_eq!(
|
| 83 | buf.first(),
|
| 84 | Some(&b'!' ),
|
| 85 | "CDATA, comment or DOCTYPE should start from '!'"
|
| 86 | );
|
| 87 |
|
| 88 | let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
|
| 89 | string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
|
| 90 | };
|
| 91 |
|
| 92 | let len = buf.len();
|
| 93 | match bang_type {
|
| 94 | BangType::Comment if buf.starts_with(b"!--" ) => {
|
| 95 | debug_assert!(buf.ends_with(b"--" ));
|
| 96 | if self.config.check_comments {
|
| 97 | // search if '--' not in comments
|
| 98 | let mut haystack = &buf[3..len - 2];
|
| 99 | let mut off = 0;
|
| 100 | while let Some(p) = memchr::memchr(b'-' , haystack) {
|
| 101 | off += p + 1;
|
| 102 | // if next byte after `-` is also `-`, return an error
|
| 103 | if buf[3 + off] == b'-' {
|
| 104 | // Explanation of the magic:
|
| 105 | //
|
| 106 | // - `self.offset`` just after `>`,
|
| 107 | // - `buf` contains `!-- con--tent --`
|
| 108 | // - `p` is counted from byte after `<!--`
|
| 109 | //
|
| 110 | // <!-- con--tent -->:
|
| 111 | // ~~~~~~~~~~~~~~~~ : - buf
|
| 112 | // : =========== : - zone of search (possible values of `p`)
|
| 113 | // : |---p : - p is counted from | (| is 0)
|
| 114 | // : : : ^ - self.offset
|
| 115 | // ^ : : - self.offset - len
|
| 116 | // ^ : - self.offset - len + 2
|
| 117 | // ^ - self.offset - len + 2 + p
|
| 118 | self.last_error_offset = self.offset - len as u64 + 2 + p as u64;
|
| 119 | return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
|
| 120 | }
|
| 121 | // Continue search after single `-` (+1 to skip it)
|
| 122 | haystack = &haystack[p + 1..];
|
| 123 | }
|
| 124 | }
|
| 125 | Ok(Event::Comment(BytesText::wrap(
|
| 126 | // Cut of `!--` and `--` from start and end
|
| 127 | &buf[3..len - 2],
|
| 128 | self.decoder(),
|
| 129 | )))
|
| 130 | }
|
| 131 | // XML requires uppercase only:
|
| 132 | // https://www.w3.org/TR/xml11/#sec-cdata-sect
|
| 133 | // Even HTML5 required uppercase only:
|
| 134 | // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
| 135 | BangType::CData if buf.starts_with(b"![CDATA[" ) => {
|
| 136 | debug_assert!(buf.ends_with(b"]]" ));
|
| 137 | Ok(Event::CData(BytesCData::wrap(
|
| 138 | // Cut of `![CDATA[` and `]]` from start and end
|
| 139 | &buf[8..len - 2],
|
| 140 | self.decoder(),
|
| 141 | )))
|
| 142 | }
|
| 143 | // XML requires uppercase only, but we will check that on validation stage:
|
| 144 | // https://www.w3.org/TR/xml11/#sec-prolog-dtd
|
| 145 | // HTML5 allows mixed case for doctype declarations:
|
| 146 | // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
|
| 147 | BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE" ) => {
|
| 148 | match buf[8..].iter().position(|&b| !is_whitespace(b)) {
|
| 149 | Some(start) => Ok(Event::DocType(BytesText::wrap(
|
| 150 | // Cut of `!DOCTYPE` and any number of spaces from start
|
| 151 | &buf[8 + start..],
|
| 152 | self.decoder(),
|
| 153 | ))),
|
| 154 | None => {
|
| 155 | // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
|
| 156 | // We want report error at place where name is expected - this is just
|
| 157 | // before `>`
|
| 158 | self.last_error_offset = self.offset - 1;
|
| 159 | return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));
|
| 160 | }
|
| 161 | }
|
| 162 | }
|
| 163 | _ => {
|
| 164 | // <!....>
|
| 165 | // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
|
| 166 | // ^------- We report error at that position, so we need to subtract 2 and buf len
|
| 167 | self.last_error_offset = self.offset - len as u64 - 2;
|
| 168 | Err(bang_type.to_err().into())
|
| 169 | }
|
| 170 | }
|
| 171 | }
|
| 172 |
|
| 173 | /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
|
| 174 | /// end name matches the last opened start name if `self.config.check_end_names` is set.
|
| 175 | ///
|
| 176 | /// `buf` contains data between `<` and `>`, for example `/tag`.
|
| 177 | pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
| 178 | debug_assert_eq!(
|
| 179 | buf.first(),
|
| 180 | Some(&b'/' ),
|
| 181 | "closing tag should start from '/'"
|
| 182 | );
|
| 183 |
|
| 184 | // Strip the `/` character. `content` contains data between `</` and `>`
|
| 185 | let content = &buf[1..];
|
| 186 | // XML standard permits whitespaces after the markup name in closing tags.
|
| 187 | // Let's strip them from the buffer before comparing tag names.
|
| 188 | let name = if self.config.trim_markup_names_in_closing_tags {
|
| 189 | if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
|
| 190 | &content[..pos_end_name + 1]
|
| 191 | } else {
|
| 192 | content
|
| 193 | }
|
| 194 | } else {
|
| 195 | content
|
| 196 | };
|
| 197 |
|
| 198 | let decoder = self.decoder();
|
| 199 |
|
| 200 | // Get the index in self.opened_buffer of the name of the last opened tag
|
| 201 | match self.opened_starts.pop() {
|
| 202 | Some(start) => {
|
| 203 | if self.config.check_end_names {
|
| 204 | let expected = &self.opened_buffer[start..];
|
| 205 | if name != expected {
|
| 206 | let expected = decoder.decode(expected).unwrap_or_default().into_owned();
|
| 207 | // #513: In order to allow error recovery we should drop content of the buffer
|
| 208 | self.opened_buffer.truncate(start);
|
| 209 |
|
| 210 | // Report error at start of the end tag at `<` character
|
| 211 | // -2 for `<` and `>`
|
| 212 | self.last_error_offset = self.offset - buf.len() as u64 - 2;
|
| 213 | return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
|
| 214 | expected,
|
| 215 | found: decoder.decode(name).unwrap_or_default().into_owned(),
|
| 216 | }));
|
| 217 | }
|
| 218 | }
|
| 219 |
|
| 220 | self.opened_buffer.truncate(start);
|
| 221 | }
|
| 222 | None => {
|
| 223 | if !self.config.allow_unmatched_ends {
|
| 224 | // Report error at start of the end tag at `<` character
|
| 225 | // -2 for `<` and `>`
|
| 226 | self.last_error_offset = self.offset - buf.len() as u64 - 2;
|
| 227 | return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
|
| 228 | decoder.decode(name).unwrap_or_default().into_owned(),
|
| 229 | )));
|
| 230 | }
|
| 231 | }
|
| 232 | }
|
| 233 |
|
| 234 | Ok(Event::End(BytesEnd::wrap(name.into())))
|
| 235 | }
|
| 236 |
|
| 237 | /// `buf` contains data between `<` and `>` and the first byte is `?`.
|
| 238 | /// `self.offset` already after the `>`
|
| 239 | ///
|
| 240 | /// Returns `Decl` or `PI` event
|
| 241 | pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
|
| 242 | debug_assert!(buf.len() > 0);
|
| 243 | debug_assert_eq!(buf[0], b'?' );
|
| 244 |
|
| 245 | let len = buf.len();
|
| 246 | // We accept at least <??>
|
| 247 | // ~~ - len = 2
|
| 248 | if len > 1 && buf[len - 1] == b'?' {
|
| 249 | // Cut of `?` and `?` from start and end
|
| 250 | let content = &buf[1..len - 1];
|
| 251 | let len = content.len();
|
| 252 |
|
| 253 | if content.starts_with(b"xml" ) && (len == 3 || is_whitespace(content[3])) {
|
| 254 | let event = BytesDecl::from_start(BytesStart::wrap(content, 3));
|
| 255 |
|
| 256 | // Try getting encoding from the declaration event
|
| 257 | #[cfg (feature = "encoding" )]
|
| 258 | if self.encoding.can_be_refined() {
|
| 259 | if let Some(encoding) = event.encoder() {
|
| 260 | self.encoding = EncodingRef::XmlDetected(encoding);
|
| 261 | }
|
| 262 | }
|
| 263 |
|
| 264 | Ok(Event::Decl(event))
|
| 265 | } else {
|
| 266 | Ok(Event::PI(BytesPI::wrap(content, name_len(content))))
|
| 267 | }
|
| 268 | } else {
|
| 269 | // <?....EOF
|
| 270 | // ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,
|
| 271 | // so we move offset to it (-2 for `<` and `>`)
|
| 272 | self.last_error_offset = self.offset - len as u64 - 2;
|
| 273 | Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
|
| 274 | }
|
| 275 | }
|
| 276 |
|
| 277 | /// Converts content of a tag to a `Start` or an `Empty` event
|
| 278 | ///
|
| 279 | /// # Parameters
|
| 280 | /// - `content`: Content of a tag between `<` and `>`
|
| 281 | pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Event<'b> {
|
| 282 | if let Some(content) = content.strip_suffix(b"/" ) {
|
| 283 | // This is self-closed tag `<something/>`
|
| 284 | let event = BytesStart::wrap(content, name_len(content));
|
| 285 |
|
| 286 | if self.config.expand_empty_elements {
|
| 287 | self.state = ParseState::InsideEmpty;
|
| 288 | self.opened_starts.push(self.opened_buffer.len());
|
| 289 | self.opened_buffer.extend(event.name().as_ref());
|
| 290 | Event::Start(event)
|
| 291 | } else {
|
| 292 | Event::Empty(event)
|
| 293 | }
|
| 294 | } else {
|
| 295 | let event = BytesStart::wrap(content, name_len(content));
|
| 296 |
|
| 297 | // #514: Always store names event when .check_end_names == false,
|
| 298 | // because checks can be temporary disabled and when they would be
|
| 299 | // enabled, we should have that information
|
| 300 | self.opened_starts.push(self.opened_buffer.len());
|
| 301 | self.opened_buffer.extend(event.name().as_ref());
|
| 302 | Event::Start(event)
|
| 303 | }
|
| 304 | }
|
| 305 |
|
| 306 | #[inline ]
|
| 307 | pub fn close_expanded_empty(&mut self) -> BytesEnd<'static> {
|
| 308 | self.state = ParseState::InsideText;
|
| 309 | let name = self
|
| 310 | .opened_buffer
|
| 311 | .split_off(self.opened_starts.pop().unwrap());
|
| 312 | BytesEnd::wrap(name.into())
|
| 313 | }
|
| 314 |
|
| 315 | /// Get the decoder, used to decode bytes, read by this reader, to the strings.
|
| 316 | ///
|
| 317 | /// If [`encoding`] feature is enabled, the used encoding may change after
|
| 318 | /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
|
| 319 | ///
|
| 320 | /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
|
| 321 | /// defaults to UTF-8.
|
| 322 | ///
|
| 323 | /// [`encoding`]: ../../index.html#encoding
|
| 324 | pub const fn decoder(&self) -> Decoder {
|
| 325 | Decoder {
|
| 326 | #[cfg (feature = "encoding" )]
|
| 327 | encoding: self.encoding.encoding(),
|
| 328 | }
|
| 329 | }
|
| 330 | }
|
| 331 |
|
| 332 | impl Default for ReaderState {
|
| 333 | fn default() -> Self {
|
| 334 | Self {
|
| 335 | offset: 0,
|
| 336 | last_error_offset: 0,
|
| 337 | state: ParseState::Init,
|
| 338 | config: Config::default(),
|
| 339 | opened_buffer: Vec::new(),
|
| 340 | opened_starts: Vec::new(),
|
| 341 |
|
| 342 | #[cfg (feature = "encoding" )]
|
| 343 | encoding: EncodingRef::Implicit(UTF_8),
|
| 344 | }
|
| 345 | }
|
| 346 | }
|
| 347 | |