//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
//! the underlying byte stream.
|
| 3 |
|
| 4 | use std::fs::File;
|
| 5 | use std::io::{self, BufRead, BufReader};
|
| 6 | use std::path::Path;
|
| 7 |
|
| 8 | use memchr;
|
| 9 |
|
| 10 | use crate::errors::{Error, Result};
|
| 11 | use crate::events::Event;
|
| 12 | use crate::name::QName;
|
| 13 | use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
|
| 14 |
|
/// Generates the method bodies of an `XmlSource` implementation that reads
/// from a buffered byte source through `fill_buf()` / `consume()`.
///
/// The single arm accepts an optional group of four arguments:
///
/// - `$lf`: a lifetime that is added as a generic parameter to the methods
///   that return a slice borrowed from `buf`;
/// - `$reader`: a field name; when given, `fill_buf()` and `consume()` are
///   called on `self.<field>` instead of on `self`;
/// - `$async`: a token emitted in front of every `fn`;
/// - `$await`: a token appended (after a `.`) to every `fill_buf()` and
///   `peek_one()` call.
///
/// When the macro is invoked without arguments all of these fragments expand
/// to nothing. In that case the `'b` lifetime used in the signatures must be
/// provided by the surrounding `impl` block.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Consumes a UTF-8 byte order mark if the currently buffered data
        /// starts with one. Nothing is consumed otherwise.
        ///
        /// Only compiled when the `encoding` feature is disabled.
        #[cfg(not(feature = "encoding"))]
        $($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        if n.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        Ok(())
                    },
                    // An interrupted read is not an error: try again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        /// Passes the currently buffered data to `crate::encoding::detect_encoding`.
        /// When it reports an encoding together with a BOM length, that many
        /// bytes are consumed and the encoding is returned; otherwise nothing
        /// is consumed and `Ok(None)` is returned.
        ///
        /// Only compiled when the `encoding` feature is enabled.
        #[cfg(feature = "encoding")]
        $($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
                        self $(.$reader)? .consume(bom_len);
                        Ok(Some(enc))
                    } else {
                        Ok(None)
                    },
                    // An interrupted read is not an error: try again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        /// Appends bytes from the source to `buf` until `byte` is found or the
        /// source returns no more data.
        ///
        /// The delimiter itself is consumed from the source but is not stored
        /// in `buf`. `position` is advanced by the number of consumed bytes,
        /// delimiter included; this also happens before an I/O error is returned.
        ///
        /// Returns `Ok(None)` when no byte was consumed, otherwise the part of
        /// `buf` that was appended by this call.
        #[inline]
        $($async)? fn read_bytes_until $(<$lf>)? (
            &mut self,
            byte: u8,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<&'b [u8]>> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            let mut read = 0;
            let mut done = false;
            let start = buf.len();
            while !done {
                let used = {
                    let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                        // Empty buffer: the source is exhausted
                        Ok(n) if n.is_empty() => break,
                        Ok(n) => n,
                        Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                        Err(e) => {
                            *position += read;
                            return Err(Error::Io(e.into()));
                        }
                    };

                    match memchr::memchr(byte, available) {
                        Some(i) => {
                            buf.extend_from_slice(&available[..i]);
                            done = true;
                            // +1 to consume the delimiter as well
                            i + 1
                        }
                        None => {
                            buf.extend_from_slice(available);
                            available.len()
                        }
                    }
                };
                self $(.$reader)? .consume(used);
                read += used;
            }
            *position += read;

            if read == 0 {
                Ok(None)
            } else {
                Ok(Some(&buf[start..]))
            }
        }

        /// Reads a construct that starts with `!` into `buf`.
        ///
        /// The leading `!` is pushed to `buf` and consumed from the source, then
        /// the next byte is peeked to build the `BangType`. Data is appended to
        /// `buf` until `BangType::parse` returns `Some` (presumably when the
        /// end of the construct is inside the buffered data -- confirm against
        /// `BangType::parse`).
        ///
        /// On success `position` is advanced by the number of consumed bytes
        /// (the `!` included) and the bang type is returned together with the
        /// part of `buf` appended by this call. If the source runs out of data
        /// first, the error produced by `BangType::to_err` is returned and
        /// `position` is left untouched.
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<(BangType, &'b [u8])>> {
            // Peeked one bang ('!') before being called, so it's guaranteed to
            // start with it.
            let start = buf.len();
            let mut read = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Note: Do not update position, so the error points to
                    // somewhere sane rather than at the EOF
                    Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
                    Ok(available) => {
                        // We only parse from start because we don't want to consider
                        // whatever is in the buffer before the bang element
                        if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) {
                            buf.extend_from_slice(consumed);

                            self $(.$reader)? .consume(used);
                            read += used;

                            *position += read;
                            break;
                        } else {
                            buf.extend_from_slice(available);

                            let used = available.len();
                            self $(.$reader)? .consume(used);
                            read += used;
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                }
            }

            // `read` starts at 1 (the consumed `!`) and only grows, so there is
            // no "nothing was read" case to report here.
            Ok(Some((bang_type, &buf[start..])))
        }

        /// Appends bytes from the source to `buf` until
        /// `ReadElementState::change` returns `Some` or the source returns no
        /// more data.
        ///
        /// `position` is advanced by the number of consumed bytes when the
        /// end of the element is found or when an I/O error is returned; it is
        /// not advanced when the loop stops because the source is exhausted.
        ///
        /// Returns `Ok(None)` when no byte was consumed, otherwise the part of
        /// `buf` that was appended by this call.
        #[inline]
        $($async)? fn read_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<&'b [u8]>> {
            let mut state = ReadElementState::Elem;
            let mut read = 0;

            let start = buf.len();
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) if n.is_empty() => break,
                    Ok(available) => {
                        if let Some((consumed, used)) = state.change(available) {
                            buf.extend_from_slice(consumed);

                            self $(.$reader)? .consume(used);
                            read += used;

                            // Position now just after the `>` symbol
                            *position += read;
                            break;
                        } else {
                            // The `>` symbol not yet found, continue reading
                            buf.extend_from_slice(available);

                            let used = available.len();
                            self $(.$reader)? .consume(used);
                            read += used;
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };
            }

            if read == 0 {
                Ok(None)
            } else {
                Ok(Some(&buf[start..]))
            }
        }

        /// Consumes leading bytes for which `is_whitespace` returns `true`
        /// and advances `position` by the number of consumed bytes.
        ///
        /// Stops when the buffered data starts with a non-whitespace byte or
        /// when the source returns no more data.
        $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        // Length of the whitespace prefix; the whole buffer if
                        // it contains nothing but whitespace
                        let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
                        if count > 0 {
                            self $(.$reader)? .consume(count);
                            *position += count;
                            // The next chunk may start with whitespace as well
                            continue;
                        } else {
                            Ok(())
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        /// Consumes the next byte and advances `position` by one if that byte
        /// equals `byte`.
        ///
        /// Returns `Ok(true)` when the byte was consumed and `Ok(false)` when
        /// the next byte differs or the source has no more data.
        $($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            match self.peek_one() $(.$await)? ? {
                Some(b) if b == byte => {
                    *position += 1;
                    self $(.$reader)? .consume(1);
                    Ok(true)
                }
                _ => Ok(false),
            }
        }

        /// Returns the first buffered byte without consuming it, or `Ok(None)`
        /// when the source returns no more data.
        $($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) if n.is_empty() => Ok(None),
                    Ok(n) => Ok(Some(n[0])),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }
    };
}
|
| 245 |
|
| 246 | // Make it public for use in async implementations
|
| 247 | pub(super) use impl_buffered_source;
|
| 248 |
|
/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
/// `Vec<u8>` as buffer that will be borrowed by events.
///
/// All methods are generated by `impl_buffered_source!`. The macro is invoked
/// without arguments, so its optional fragments expand to nothing and the `'b`
/// lifetime used by the generated signatures is the one declared on this `impl`.
impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
    impl_buffered_source!();
}
|
| 254 |
|
| 255 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
| 256 |
|
/// This is an implementation for reading from a [`BufRead`] as the underlying
/// byte stream.
impl<R: BufRead> Reader<R> {
    /// Reads the next `Event`.
    ///
    /// This is the main entry point for reading XML `Event`s.
    ///
    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
    /// internally).
    ///
    /// Having the possibility to control the internal buffers gives you some additional benefits
    /// such as:
    ///
    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
    ///   end of your loop).
    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
    ///
    /// # Examples
    ///
    /// ```
    /// use quick_xml::events::Event;
    /// use quick_xml::reader::Reader;
    ///
    /// let xml = r#"<tag1 att1 = "test">
    ///                 <tag2><!--Test comment-->Test</tag2>
    ///                 <tag2>Test 2</tag2>
    ///              </tag1>"#;
    /// let mut reader = Reader::from_str(xml);
    /// reader.trim_text(true);
    /// let mut count = 0;
    /// let mut buf = Vec::new();
    /// let mut txt = Vec::new();
    /// loop {
    ///     match reader.read_event_into(&mut buf) {
    ///         Ok(Event::Start(_)) => count += 1,
    ///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
    ///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
    ///         Ok(Event::Eof) => break,
    ///         _ => (),
    ///     }
    ///     buf.clear();
    /// }
    /// assert_eq!(count, 3);
    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
    /// ```
    #[inline]
    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
        self.read_event_impl(buf)
    }

    /// Reads until the end element is found, using the provided buffer as
    /// intermediate storage for the content of events. This function is supposed
    /// to be called after you have already read a [`Start`] event.
    ///
    /// Returns a span that covers the content between the `>` of an opening tag
    /// and the `<` of a closing tag, or an empty slice if [`expand_empty_elements`]
    /// is set and this method was called after reading an expanded [`Start`] event.
    ///
    /// Manages nested cases where parent and child elements have the _literally_
    /// same name.
    ///
    /// If the corresponding [`End`] event is not found, the [`Error::UnexpectedEof`]
    /// will be returned. In particular, that error will be returned if you call
    /// this method without consuming the corresponding [`Start`] event first.
    ///
    /// If your reader is created from a string slice or a byte array slice, it is
    /// better to use the [`read_to_end()`] method, because it will not copy bytes
    /// into an intermediate buffer.
    ///
    /// The provided `buf` buffer will be filled with the content of only one event
    /// at a time. Before each event is read the buffer is cleared. If you know an
    /// appropriate size of each event, you can preallocate the buffer to reduce
    /// the number of reallocations.
    ///
    /// The `end` parameter should contain the name of the end element _in the reader
    /// encoding_. It is good practice to always get that parameter using the
    /// [`BytesStart::to_end()`] method.
    ///
    /// The correctness of the skipped events is not checked if you disabled
    /// the [`check_end_names`] option.
    ///
    /// # Namespaces
    ///
    /// While the `Reader` does not support namespace resolution, namespaces
    /// do not change the algorithm for comparing names. Although the names
    /// `a:name` and `b:name`, where both prefixes `a` and `b` resolve to the
    /// same namespace, are semantically equivalent, `</b:name>` cannot close
    /// `<a:name>`, because according to [the specification]:
    ///
    /// > The end of every element that begins with a **start-tag** MUST be marked
    /// > by an **end-tag** containing a name that echoes the element's type as
    /// > given in the **start-tag**
    ///
    /// # Examples
    ///
    /// This example shows how you can skip XML content after you read the
    /// start event.
    ///
    /// ```
    /// # use pretty_assertions::assert_eq;
    /// use quick_xml::events::{BytesStart, Event};
    /// use quick_xml::reader::Reader;
    ///
    /// let mut reader = Reader::from_str(r#"
    ///     <outer>
    ///         <inner>
    ///             <inner></inner>
    ///             <inner/>
    ///             <outer></outer>
    ///             <outer/>
    ///         </inner>
    ///     </outer>
    /// "#);
    /// reader.trim_text(true);
    /// let mut buf = Vec::new();
    ///
    /// let start = BytesStart::new("outer");
    /// let end   = start.to_end().into_owned();
    ///
    /// // First, we read a start event...
    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
    ///
    /// // ...then, we could skip all events to the corresponding end event.
    /// // This call will correctly handle nested <outer> elements.
    /// // Note, however, that this method does not handle namespaces.
    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
    ///
    /// // At the end we should get an Eof event, because we ate the whole XML
    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
    /// ```
    ///
    /// [`Start`]: Event::Start
    /// [`End`]: Event::End
    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
    /// [`read_to_end()`]: Self::read_to_end
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    /// [`check_end_names`]: Self::check_end_names
    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
        // The block passed as the last argument clears `buf`, so the buffer
        // never holds more than one event at a time.
        Ok(read_to_end!(self, end, buf, read_event_impl, {
            buf.clear();
        }))
    }
}
|
| 401 |
|
| 402 | impl Reader<BufReader<File>> {
|
| 403 | /// Creates an XML reader from a file path.
|
| 404 | pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
| 405 | let file: File = File::open(path)?;
|
| 406 | let reader: BufReader = BufReader::new(inner:file);
|
| 407 | Ok(Self::from_reader(reader))
|
| 408 | }
|
| 409 | }
|
| 410 |
|
#[cfg(test)]
mod test {
    use crate::reader::test::{check, small_buffers};
    use crate::reader::XmlSource;

    /// Default buffer constructor: returns the byte array from the test as is
    fn identity<T>(input: T) -> T {
        input
    }

    // Instantiates the shared reader test-suite for the buffered implementation,
    // with a fresh `Vec<u8>` as the event buffer
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );

    small_buffers!(
        #[test]
        read_event_into: std::io::BufReader<_>
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::{UTF_16LE, UTF_8, WINDOWS_1251};
        use pretty_assertions::assert_eq;

        /// Checks that encoding is detected by BOM and changed after XML declaration
        /// BOM indicates UTF-16LE, but XML - windows-1251
        #[test]
        fn bom_detected() {
            // The BOM bytes must be the very first bytes of the input,
            // otherwise they are not a BOM and cannot be detected as one
            let mut reader =
                Reader::from_reader(b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref());
            let mut buf = Vec::new();

            assert_eq!(reader.decoder().encoding(), UTF_8);
            reader.read_event_into(&mut buf).unwrap();
            assert_eq!(reader.decoder().encoding(), WINDOWS_1251);

            assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
        }

        /// Checks that encoding is changed by XML declaration, but only once
        #[test]
        fn xml_declaration() {
            let mut reader = Reader::from_reader(
                b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>".as_ref(),
            );
            let mut buf = Vec::new();

            assert_eq!(reader.decoder().encoding(), UTF_8);
            reader.read_event_into(&mut buf).unwrap();
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            // The second declaration must not change the encoding again
            reader.read_event_into(&mut buf).unwrap();
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
        }
    }
}
|
| 475 | |