1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use memchr;
9
10use crate::errors::{Error, Result};
11use crate::events::Event;
12use crate::name::QName;
13use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
14
// Generates the bodies of the `XmlSource` methods for a buffered byte source.
//
// The macro accepts either no arguments (the methods then call `fill_buf` /
// `consume` on `self` directly and are plain functions) or four arguments:
//
// - `$lf`: lifetime declared as a generic parameter of the methods that
//   return data borrowed from `buf`
// - `$reader`: field of `self` on which `fill_buf` / `consume` are called
// - `$async`: token emitted in front of `fn`
// - `$await`: token applied to the results of `fill_buf()` and `peek_one()`
//
// In every method a read that fails with `io::ErrorKind::Interrupted` is
// retried; any other I/O error is wrapped into `Error::Io` and returned.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Consumes the UTF-8 byte order mark if the currently buffered data
        /// starts with it.
        #[cfg(not(feature = "encoding"))]
        $($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        if chunk.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(Error::Io(e.into())),
                }
            }
        }

        /// Runs encoding detection on the currently buffered data. If an
        /// encoding is detected, the reported BOM length is consumed and the
        /// encoding is returned; otherwise nothing is consumed.
        #[cfg(feature = "encoding")]
        $($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        return Ok(match crate::encoding::detect_encoding(chunk) {
                            Some((enc, bom_len)) => {
                                self $(.$reader)? .consume(bom_len);
                                Some(enc)
                            }
                            None => None,
                        });
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(Error::Io(e.into())),
                }
            }
        }

        /// Appends to `buf` everything up to (but not including) the first
        /// occurrence of `byte`, or up to the end of the input if `byte` is
        /// never found. The matched `byte` itself is consumed as well.
        ///
        /// `position` is advanced by the number of consumed bytes. Returns
        /// `None` if nothing was consumed, otherwise the part of `buf` that
        /// was appended by this call.
        #[inline]
        $($async)? fn read_bytes_until $(<$lf>)? (
            &mut self,
            byte: u8,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<&'b [u8]>> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            let start = buf.len();
            let mut read = 0;
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of input
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };

                let (used, found) = match memchr::memchr(byte, available) {
                    Some(i) => {
                        buf.extend_from_slice(&available[..i]);
                        // `+ 1` to also consume the searched byte
                        (i + 1, true)
                    }
                    None => {
                        buf.extend_from_slice(available);
                        (available.len(), false)
                    }
                };
                self $(.$reader)? .consume(used);
                read += used;

                if found {
                    break;
                }
            }
            *position += read;

            match read {
                0 => Ok(None),
                _ => Ok(Some(&buf[start..])),
            }
        }

        /// Reads an element that starts with `!` into `buf`, returning the
        /// detected `BangType` together with the part of `buf` appended by
        /// this call (which begins with the `!`).
        ///
        /// If the input ends before `BangType::parse` reports the end of the
        /// element, the error produced by `BangType::to_err` is returned.
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<(BangType, &'b [u8])>> {
            // Peeked one bang ('!') before being called, so it's guaranteed to
            // start with it.
            let start = buf.len();
            buf.push(b'!');
            self $(.$reader)? .consume(1);
            // The `!` consumed above is already counted
            let mut read = 1;

            let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Note: Do not update position, so the error points to
                    // somewhere sane rather than at the EOF
                    Ok(n) if n.is_empty() => return Err(bang_type.to_err()),
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };

                // We only parse from start because we don't want to consider
                // whatever is in the buffer before the bang element
                match bang_type.parse(&buf[start..], available) {
                    Some((consumed, used)) => {
                        buf.extend_from_slice(consumed);

                        self $(.$reader)? .consume(used);
                        read += used;

                        *position += read;
                        break;
                    }
                    None => {
                        // End not found in this chunk: take all of it and read on
                        buf.extend_from_slice(available);

                        let used = available.len();
                        self $(.$reader)? .consume(used);
                        read += used;
                    }
                }
            }

            // `read` starts at 1 and is only ever increased, so the `0` arm
            // is not expected to be taken
            match read {
                0 => Ok(None),
                _ => Ok(Some((bang_type, &buf[start..]))),
            }
        }

        /// Reads into `buf` until `ReadElementState::change` reports the end
        /// of the element, or until the input ends.
        ///
        /// Returns `None` if nothing was consumed, otherwise the part of `buf`
        /// appended by this call. `position` is advanced when the end of the
        /// element is found or an I/O error is returned; it is left untouched
        /// when the input ends first.
        #[inline]
        $($async)? fn read_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<Option<&'b [u8]>> {
            let mut state = ReadElementState::Elem;
            let start = buf.len();
            let mut read = 0;

            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of input
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };

                match state.change(available) {
                    Some((consumed, used)) => {
                        buf.extend_from_slice(consumed);

                        self $(.$reader)? .consume(used);
                        read += used;

                        // Position now just after the `>` symbol
                        *position += read;
                        break;
                    }
                    None => {
                        // The `>` symbol not yet found, continue reading
                        buf.extend_from_slice(available);

                        let used = available.len();
                        self $(.$reader)? .consume(used);
                        read += used;
                    }
                }
            }

            match read {
                0 => Ok(None),
                _ => Ok(Some(&buf[start..])),
            }
        }

        /// Consumes leading whitespace bytes (as defined by `is_whitespace`),
        /// advancing `position` by the number of bytes skipped. Stops at the
        /// first non-whitespace byte or at the end of the input.
        $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        let skipped = chunk.iter().take_while(|&&b| is_whitespace(b)).count();
                        if skipped == 0 {
                            // Either a non-whitespace byte is next, or the input ended
                            return Ok(());
                        }
                        self $(.$reader)? .consume(skipped);
                        *position += skipped;
                        // The whole chunk may have been whitespace: look at the next one
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(Error::Io(e.into())),
                }
            }
        }

        /// Consumes the next byte if it is equal to `byte`, advancing
        /// `position` by one. Returns whether the byte was consumed.
        $($async)? fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            let next = self.peek_one() $(.$await)? ?;
            let matched = next == Some(byte);
            if matched {
                self $(.$reader)? .consume(1);
                *position += 1;
            }
            Ok(matched)
        }

        /// Returns the next byte without consuming it, or `None` if the input
        /// has ended.
        $($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => return Ok(chunk.first().copied()),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(Error::Io(e.into())),
                }
            }
        }
    };
}
245
246// Make it public for use in async implementations
247pub(super) use impl_buffered_source;
248
249/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
250/// `Vec<u8>` as buffer that will be borrowed by events.
251impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
252 impl_buffered_source!();
253}
254
255////////////////////////////////////////////////////////////////////////////////////////////////////
256
257/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
258impl<R: BufRead> Reader<R> {
259 /// Reads the next `Event`.
260 ///
261 /// This is the main entry point for reading XML `Event`s.
262 ///
263 /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
264 /// internally).
265 ///
266 /// Having the possibility to control the internal buffers gives you some additional benefits
267 /// such as:
268 ///
269 /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
270 /// you can call `buf.clear()` once you are done with processing the event (typically at the
271 /// end of your loop).
272 /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
273 ///
274 /// # Examples
275 ///
276 /// ```
277 /// # use pretty_assertions::assert_eq;
278 /// use quick_xml::events::Event;
279 /// use quick_xml::reader::Reader;
280 ///
281 /// let xml = r#"<tag1 att1 = "test">
282 /// <tag2><!--Test comment-->Test</tag2>
283 /// <tag2>Test 2</tag2>
284 /// </tag1>"#;
285 /// let mut reader = Reader::from_str(xml);
286 /// reader.trim_text(true);
287 /// let mut count = 0;
288 /// let mut buf = Vec::new();
289 /// let mut txt = Vec::new();
290 /// loop {
291 /// match reader.read_event_into(&mut buf) {
292 /// Ok(Event::Start(_)) => count += 1,
293 /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
294 /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
295 /// Ok(Event::Eof) => break,
296 /// _ => (),
297 /// }
298 /// buf.clear();
299 /// }
300 /// assert_eq!(count, 3);
301 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
302 /// ```
303 #[inline]
304 pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
305 self.read_event_impl(buf)
306 }
307
308 /// Reads until end element is found using provided buffer as intermediate
309 /// storage for events content. This function is supposed to be called after
310 /// you already read a [`Start`] event.
311 ///
312 /// Returns a span that cover content between `>` of an opening tag and `<` of
313 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
314 /// this method was called after reading expanded [`Start`] event.
315 ///
316 /// Manages nested cases where parent and child elements have the _literally_
317 /// same name.
318 ///
319 /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
320 /// will be returned. In particularly, that error will be returned if you call
321 /// this method without consuming the corresponding [`Start`] event first.
322 ///
323 /// If your reader created from a string slice or byte array slice, it is
324 /// better to use [`read_to_end()`] method, because it will not copy bytes
325 /// into intermediate buffer.
326 ///
327 /// The provided `buf` buffer will be filled only by one event content at time.
328 /// Before reading of each event the buffer will be cleared. If you know an
329 /// appropriate size of each event, you can preallocate the buffer to reduce
330 /// number of reallocations.
331 ///
332 /// The `end` parameter should contain name of the end element _in the reader
333 /// encoding_. It is good practice to always get that parameter using
334 /// [`BytesStart::to_end()`] method.
335 ///
336 /// The correctness of the skipped events does not checked, if you disabled
337 /// the [`check_end_names`] option.
338 ///
339 /// # Namespaces
340 ///
341 /// While the `Reader` does not support namespace resolution, namespaces
342 /// does not change the algorithm for comparing names. Although the names
343 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
344 /// same namespace, are semantically equivalent, `</b:name>` cannot close
345 /// `<a:name>`, because according to [the specification]
346 ///
347 /// > The end of every element that begins with a **start-tag** MUST be marked
348 /// > by an **end-tag** containing a name that echoes the element's type as
349 /// > given in the **start-tag**
350 ///
351 /// # Examples
352 ///
353 /// This example shows, how you can skip XML content after you read the
354 /// start event.
355 ///
356 /// ```
357 /// # use pretty_assertions::assert_eq;
358 /// use quick_xml::events::{BytesStart, Event};
359 /// use quick_xml::reader::Reader;
360 ///
361 /// let mut reader = Reader::from_str(r#"
362 /// <outer>
363 /// <inner>
364 /// <inner></inner>
365 /// <inner/>
366 /// <outer></outer>
367 /// <outer/>
368 /// </inner>
369 /// </outer>
370 /// "#);
371 /// reader.trim_text(true);
372 /// let mut buf = Vec::new();
373 ///
374 /// let start = BytesStart::new("outer");
375 /// let end = start.to_end().into_owned();
376 ///
377 /// // First, we read a start event...
378 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
379 ///
380 /// // ...then, we could skip all events to the corresponding end event.
381 /// // This call will correctly handle nested <outer> elements.
382 /// // Note, however, that this method does not handle namespaces.
383 /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
384 ///
385 /// // At the end we should get an Eof event, because we ate the whole XML
386 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
387 /// ```
388 ///
389 /// [`Start`]: Event::Start
390 /// [`End`]: Event::End
391 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
392 /// [`read_to_end()`]: Self::read_to_end
393 /// [`expand_empty_elements`]: Self::expand_empty_elements
394 /// [`check_end_names`]: Self::check_end_names
395 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
396 pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
397 Ok(read_to_end!(self, end, buf, read_event_impl, {
398 buf.clear();
399 }))
400 }
401}
402
403impl Reader<BufReader<File>> {
404 /// Creates an XML reader from a file path.
405 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
406 let file: File = File::open(path)?;
407 let reader: BufReader = BufReader::new(inner:file);
408 Ok(Self::from_reader(reader))
409 }
410}
411
#[cfg(test)]
mod test {
    use crate::reader::test::{check, small_buffers};
    use crate::reader::XmlSource;

    /// Default buffer constructor: hands the byte array from the test back unchanged
    fn identity<T>(input: T) -> T {
        input
    }

    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );

    small_buffers!(
        #[test]
        read_event_into: std::io::BufReader<_>
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::{UTF_16LE, UTF_8, WINDOWS_1251};
        use pretty_assertions::assert_eq;

        /// Checks that encoding is detected by BOM and changed after XML declaration
        /// BOM indicates UTF-16LE, but XML - windows-1251
        #[test]
        fn bom_detected() {
            let input: &[u8] = b"\xFF\xFE<?xml encoding='windows-1251'?>";
            let mut reader = Reader::from_reader(input);
            let mut buf = Vec::new();

            // Nothing has been read yet
            assert_eq!(reader.decoder().encoding(), UTF_8);

            // Reading the first event switches the encoding
            reader.read_event_into(&mut buf).unwrap();
            assert_eq!(reader.decoder().encoding(), WINDOWS_1251);

            let last = reader.read_event_into(&mut buf).unwrap();
            assert_eq!(last, Event::Eof);
        }

        /// Checks that encoding is changed by XML declaration, but only once
        #[test]
        fn xml_declaration() {
            let input: &[u8] = b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>";
            let mut reader = Reader::from_reader(input);
            let mut buf = Vec::new();

            // Nothing has been read yet
            assert_eq!(reader.decoder().encoding(), UTF_8);

            // The first declaration changes the encoding...
            reader.read_event_into(&mut buf).unwrap();
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            // ...the second one leaves it as it is
            reader.read_event_into(&mut buf).unwrap();
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            let last = reader.read_event_into(&mut buf).unwrap();
            assert_eq!(last, Event::Eof);
        }
    }
}
476