//! This is an implementation of [`Reader`] for reading from a `&[u8]` as the
//! underlying byte stream. This implementation supports not using an
//! intermediate buffer as the byte slice itself can be used to borrow from.
|
| 4 |
|
| 5 | use std::borrow::Cow;
|
| 6 |
|
| 7 | #[cfg (feature = "encoding" )]
|
| 8 | use crate::reader::EncodingRef;
|
| 9 | #[cfg (feature = "encoding" )]
|
| 10 | use encoding_rs::{Encoding, UTF_8};
|
| 11 |
|
| 12 | use crate::errors::{Error, Result};
|
| 13 | use crate::events::Event;
|
| 14 | use crate::name::QName;
|
| 15 | use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
|
| 16 |
|
| 17 | use memchr;
|
| 18 |
|
| 19 | /// This is an implementation for reading from a `&[u8]` as underlying byte stream.
|
| 20 | /// This implementation supports not using an intermediate buffer as the byte slice
|
| 21 | /// itself can be used to borrow from.
|
| 22 | impl<'a> Reader<&'a [u8]> {
|
| 23 | /// Creates an XML reader from a string slice.
|
| 24 | #[allow (clippy::should_implement_trait)]
|
| 25 | pub fn from_str(s: &'a str) -> Self {
|
| 26 | // Rust strings are guaranteed to be UTF-8, so lock the encoding
|
| 27 | #[cfg (feature = "encoding" )]
|
| 28 | {
|
| 29 | let mut reader = Self::from_reader(s.as_bytes());
|
| 30 | reader.parser.encoding = EncodingRef::Explicit(UTF_8);
|
| 31 | reader
|
| 32 | }
|
| 33 |
|
| 34 | #[cfg (not(feature = "encoding" ))]
|
| 35 | Self::from_reader(s.as_bytes())
|
| 36 | }
|
| 37 |
|
| 38 | /// Read an event that borrows from the input rather than a buffer.
|
| 39 | ///
|
| 40 | /// There is no asynchronous `read_event_async()` version of this function,
|
| 41 | /// because it is not necessary -- the contents are already in memory and no IO
|
| 42 | /// is needed, therefore there is no potential for blocking.
|
| 43 | ///
|
| 44 | /// # Examples
|
| 45 | ///
|
| 46 | /// ```
|
| 47 | /// # use pretty_assertions::assert_eq;
|
| 48 | /// use quick_xml::events::Event;
|
| 49 | /// use quick_xml::reader::Reader;
|
| 50 | ///
|
| 51 | /// let mut reader = Reader::from_str(r#"
|
| 52 | /// <tag1 att1 = "test">
|
| 53 | /// <tag2><!--Test comment-->Test</tag2>
|
| 54 | /// <tag2>Test 2</tag2>
|
| 55 | /// </tag1>
|
| 56 | /// "# );
|
| 57 | /// reader.trim_text(true);
|
| 58 | ///
|
| 59 | /// let mut count = 0;
|
| 60 | /// let mut txt = Vec::new();
|
| 61 | /// loop {
|
| 62 | /// match reader.read_event().unwrap() {
|
| 63 | /// Event::Start(e) => count += 1,
|
| 64 | /// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
|
| 65 | /// Event::Eof => break,
|
| 66 | /// _ => (),
|
| 67 | /// }
|
| 68 | /// }
|
| 69 | /// assert_eq!(count, 3);
|
| 70 | /// assert_eq!(txt, vec!["Test" .to_string(), "Test 2" .to_string()]);
|
| 71 | /// ```
|
| 72 | #[inline ]
|
| 73 | pub fn read_event(&mut self) -> Result<Event<'a>> {
|
| 74 | self.read_event_impl(())
|
| 75 | }
|
| 76 |
|
| 77 | /// Reads until end element is found. This function is supposed to be called
|
| 78 | /// after you already read a [`Start`] event.
|
| 79 | ///
|
| 80 | /// Returns a span that cover content between `>` of an opening tag and `<` of
|
| 81 | /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
|
| 82 | /// this method was called after reading expanded [`Start`] event.
|
| 83 | ///
|
| 84 | /// Manages nested cases where parent and child elements have the _literally_
|
| 85 | /// same name.
|
| 86 | ///
|
| 87 | /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
|
| 88 | /// will be returned. In particularly, that error will be returned if you call
|
| 89 | /// this method without consuming the corresponding [`Start`] event first.
|
| 90 | ///
|
| 91 | /// The `end` parameter should contain name of the end element _in the reader
|
| 92 | /// encoding_. It is good practice to always get that parameter using
|
| 93 | /// [`BytesStart::to_end()`] method.
|
| 94 | ///
|
| 95 | /// The correctness of the skipped events does not checked, if you disabled
|
| 96 | /// the [`check_end_names`] option.
|
| 97 | ///
|
| 98 | /// There is no asynchronous `read_to_end_async()` version of this function,
|
| 99 | /// because it is not necessary -- the contents are already in memory and no IO
|
| 100 | /// is needed, therefore there is no potential for blocking.
|
| 101 | ///
|
| 102 | /// # Namespaces
|
| 103 | ///
|
| 104 | /// While the `Reader` does not support namespace resolution, namespaces
|
| 105 | /// does not change the algorithm for comparing names. Although the names
|
| 106 | /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
|
| 107 | /// same namespace, are semantically equivalent, `</b:name>` cannot close
|
| 108 | /// `<a:name>`, because according to [the specification]
|
| 109 | ///
|
| 110 | /// > The end of every element that begins with a **start-tag** MUST be marked
|
| 111 | /// > by an **end-tag** containing a name that echoes the element's type as
|
| 112 | /// > given in the **start-tag**
|
| 113 | ///
|
| 114 | /// # Examples
|
| 115 | ///
|
| 116 | /// This example shows, how you can skip XML content after you read the
|
| 117 | /// start event.
|
| 118 | ///
|
| 119 | /// ```
|
| 120 | /// # use pretty_assertions::assert_eq;
|
| 121 | /// use quick_xml::events::{BytesStart, Event};
|
| 122 | /// use quick_xml::reader::Reader;
|
| 123 | ///
|
| 124 | /// let mut reader = Reader::from_str(r#"
|
| 125 | /// <outer>
|
| 126 | /// <inner>
|
| 127 | /// <inner></inner>
|
| 128 | /// <inner/>
|
| 129 | /// <outer></outer>
|
| 130 | /// <outer/>
|
| 131 | /// </inner>
|
| 132 | /// </outer>
|
| 133 | /// "# );
|
| 134 | /// reader.trim_text(true);
|
| 135 | ///
|
| 136 | /// let start = BytesStart::new("outer" );
|
| 137 | /// let end = start.to_end().into_owned();
|
| 138 | ///
|
| 139 | /// // First, we read a start event...
|
| 140 | /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
|
| 141 | ///
|
| 142 | /// // ...then, we could skip all events to the corresponding end event.
|
| 143 | /// // This call will correctly handle nested <outer> elements.
|
| 144 | /// // Note, however, that this method does not handle namespaces.
|
| 145 | /// reader.read_to_end(end.name()).unwrap();
|
| 146 | ///
|
| 147 | /// // At the end we should get an Eof event, because we ate the whole XML
|
| 148 | /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
|
| 149 | /// ```
|
| 150 | ///
|
| 151 | /// [`Start`]: Event::Start
|
| 152 | /// [`End`]: Event::End
|
| 153 | /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
|
| 154 | /// [`expand_empty_elements`]: Self::expand_empty_elements
|
| 155 | /// [`check_end_names`]: Self::check_end_names
|
| 156 | /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
|
| 157 | pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
|
| 158 | Ok(read_to_end!(self, end, (), read_event_impl, {}))
|
| 159 | }
|
| 160 |
|
| 161 | /// Reads content between start and end tags, including any markup. This
|
| 162 | /// function is supposed to be called after you already read a [`Start`] event.
|
| 163 | ///
|
| 164 | /// Manages nested cases where parent and child elements have the _literally_
|
| 165 | /// same name.
|
| 166 | ///
|
| 167 | /// This method does not unescape read data, instead it returns content
|
| 168 | /// "as is" of the XML document. This is because it has no idea what text
|
| 169 | /// it reads, and if, for example, it contains CDATA section, attempt to
|
| 170 | /// unescape it content will spoil data.
|
| 171 | ///
|
| 172 | /// Any text will be decoded using the XML current [`decoder()`].
|
| 173 | ///
|
| 174 | /// Actually, this method perform the following code:
|
| 175 | ///
|
| 176 | /// ```ignore
|
| 177 | /// let span = reader.read_to_end(end)?;
|
| 178 | /// let text = reader.decoder().decode(&reader.inner_slice[span]);
|
| 179 | /// ```
|
| 180 | ///
|
| 181 | /// # Examples
|
| 182 | ///
|
| 183 | /// This example shows, how you can read a HTML content from your XML document.
|
| 184 | ///
|
| 185 | /// ```
|
| 186 | /// # use pretty_assertions::assert_eq;
|
| 187 | /// # use std::borrow::Cow;
|
| 188 | /// use quick_xml::events::{BytesStart, Event};
|
| 189 | /// use quick_xml::reader::Reader;
|
| 190 | ///
|
| 191 | /// let mut reader = Reader::from_str("
|
| 192 | /// <html>
|
| 193 | /// <title>This is a HTML text</title>
|
| 194 | /// <p>Usual XML rules does not apply inside it
|
| 195 | /// <p>For example, elements not needed to be "closed"
|
| 196 | /// </html>
|
| 197 | /// " );
|
| 198 | /// reader.trim_text(true);
|
| 199 | ///
|
| 200 | /// let start = BytesStart::new("html" );
|
| 201 | /// let end = start.to_end().into_owned();
|
| 202 | ///
|
| 203 | /// // First, we read a start event...
|
| 204 | /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
|
| 205 | /// // ...and disable checking of end names because we expect HTML further...
|
| 206 | /// reader.check_end_names(false);
|
| 207 | ///
|
| 208 | /// // ...then, we could read text content until close tag.
|
| 209 | /// // This call will correctly handle nested <html> elements.
|
| 210 | /// let text = reader.read_text(end.name()).unwrap();
|
| 211 | /// assert_eq!(text, Cow::Borrowed(r#"
|
| 212 | /// <title>This is a HTML text</title>
|
| 213 | /// <p>Usual XML rules does not apply inside it
|
| 214 | /// <p>For example, elements not needed to be "closed"
|
| 215 | /// "# ));
|
| 216 | /// assert!(matches!(text, Cow::Borrowed(_)));
|
| 217 | ///
|
| 218 | /// // Now we can enable checks again
|
| 219 | /// reader.check_end_names(true);
|
| 220 | ///
|
| 221 | /// // At the end we should get an Eof event, because we ate the whole XML
|
| 222 | /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
|
| 223 | /// ```
|
| 224 | ///
|
| 225 | /// [`Start`]: Event::Start
|
| 226 | /// [`decoder()`]: Self::decoder()
|
| 227 | pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
|
| 228 | // self.reader will be changed, so store original reference
|
| 229 | let buffer = self.reader;
|
| 230 | let span = self.read_to_end(end)?;
|
| 231 |
|
| 232 | self.decoder().decode(&buffer[0..span.len()])
|
| 233 | }
|
| 234 | }
|
| 235 |
|
| 236 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
| 237 |
|
| 238 | /// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
|
| 239 | /// that will be borrowed by events. This implementation provides a zero-copy deserialization
|
| 240 | impl<'a> XmlSource<'a, ()> for &'a [u8] {
|
| 241 | #[cfg (not(feature = "encoding" ))]
|
| 242 | fn remove_utf8_bom(&mut self) -> Result<()> {
|
| 243 | if self.starts_with(crate::encoding::UTF8_BOM) {
|
| 244 | *self = &self[crate::encoding::UTF8_BOM.len()..];
|
| 245 | }
|
| 246 | Ok(())
|
| 247 | }
|
| 248 |
|
| 249 | #[cfg (feature = "encoding" )]
|
| 250 | fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
|
| 251 | if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
|
| 252 | *self = &self[bom_len..];
|
| 253 | return Ok(Some(enc));
|
| 254 | }
|
| 255 | Ok(None)
|
| 256 | }
|
| 257 |
|
| 258 | fn read_bytes_until(
|
| 259 | &mut self,
|
| 260 | byte: u8,
|
| 261 | _buf: (),
|
| 262 | position: &mut usize,
|
| 263 | ) -> Result<Option<&'a [u8]>> {
|
| 264 | // search byte must be within the ascii range
|
| 265 | debug_assert!(byte.is_ascii());
|
| 266 | if self.is_empty() {
|
| 267 | return Ok(None);
|
| 268 | }
|
| 269 |
|
| 270 | Ok(Some(if let Some(i) = memchr::memchr(byte, self) {
|
| 271 | *position += i + 1;
|
| 272 | let bytes = &self[..i];
|
| 273 | *self = &self[i + 1..];
|
| 274 | bytes
|
| 275 | } else {
|
| 276 | *position += self.len();
|
| 277 | let bytes = &self[..];
|
| 278 | *self = &[];
|
| 279 | bytes
|
| 280 | }))
|
| 281 | }
|
| 282 |
|
| 283 | fn read_bang_element(
|
| 284 | &mut self,
|
| 285 | _buf: (),
|
| 286 | position: &mut usize,
|
| 287 | ) -> Result<Option<(BangType, &'a [u8])>> {
|
| 288 | // Peeked one bang ('!') before being called, so it's guaranteed to
|
| 289 | // start with it.
|
| 290 | debug_assert_eq!(self[0], b'!' );
|
| 291 |
|
| 292 | let bang_type = BangType::new(self[1..].first().copied())?;
|
| 293 |
|
| 294 | if let Some((bytes, i)) = bang_type.parse(&[], self) {
|
| 295 | *position += i;
|
| 296 | *self = &self[i..];
|
| 297 | return Ok(Some((bang_type, bytes)));
|
| 298 | }
|
| 299 |
|
| 300 | // Note: Do not update position, so the error points to
|
| 301 | // somewhere sane rather than at the EOF
|
| 302 | Err(bang_type.to_err())
|
| 303 | }
|
| 304 |
|
| 305 | fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<Option<&'a [u8]>> {
|
| 306 | if self.is_empty() {
|
| 307 | return Ok(None);
|
| 308 | }
|
| 309 |
|
| 310 | let mut state = ReadElementState::Elem;
|
| 311 |
|
| 312 | if let Some((bytes, i)) = state.change(self) {
|
| 313 | // Position now just after the `>` symbol
|
| 314 | *position += i;
|
| 315 | *self = &self[i..];
|
| 316 | return Ok(Some(bytes));
|
| 317 | }
|
| 318 |
|
| 319 | // Note: Do not update position, so the error points to a sane place
|
| 320 | // rather than at the EOF.
|
| 321 | Err(Error::UnexpectedEof("Element" .to_string()))
|
| 322 |
|
| 323 | // FIXME: Figure out why the other one works without UnexpectedEof
|
| 324 | }
|
| 325 |
|
| 326 | fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
|
| 327 | let whitespaces = self
|
| 328 | .iter()
|
| 329 | .position(|b| !is_whitespace(*b))
|
| 330 | .unwrap_or(self.len());
|
| 331 | *position += whitespaces;
|
| 332 | *self = &self[whitespaces..];
|
| 333 | Ok(())
|
| 334 | }
|
| 335 |
|
| 336 | fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
|
| 337 | // search byte must be within the ascii range
|
| 338 | debug_assert!(byte.is_ascii());
|
| 339 | if self.first() == Some(&byte) {
|
| 340 | *self = &self[1..];
|
| 341 | *position += 1;
|
| 342 | Ok(true)
|
| 343 | } else {
|
| 344 | Ok(false)
|
| 345 | }
|
| 346 | }
|
| 347 |
|
| 348 | fn peek_one(&mut self) -> Result<Option<u8>> {
|
| 349 | Ok(self.first().copied())
|
| 350 | }
|
| 351 | }
|
| 352 |
|
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Buffer constructor handed to `check!`: returns the test's input
    /// unchanged.
    fn identity<T>(input: T) -> T {
        input
    }

    // Instantiates the tests generated by `check!` for the slice reader,
    // using `()` as the buffer value.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::UTF_8;
        use pretty_assertions::assert_eq;

        /// Checks that an XML declaration cannot change the encoding from
        /// UTF-8 when the `Reader` was created with the `from_str` method.
        #[test]
        fn str_always_has_utf8() {
            let xml = "<?xml encoding='UTF-16'?>";
            let mut reader = Reader::from_str(xml);

            // UTF-8 before the declaration is read...
            assert_eq!(reader.decoder().encoding(), UTF_8);
            reader.read_event().unwrap();
            // ...and still UTF-8 afterwards.
            assert_eq!(reader.decoder().encoding(), UTF_8);

            assert_eq!(reader.read_event().unwrap(), Event::Eof);
        }
    }
}
|
| 392 | |