//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
//! the underlying byte stream.
|
3 |
|
4 | use std::fs::File;
|
5 | use std::io::{self, BufRead, BufReader};
|
6 | use std::path::Path;
|
7 |
|
8 | use crate::errors::{Error, Result};
|
9 | use crate::events::Event;
|
10 | use crate::name::QName;
|
11 | use crate::parser::Parser;
|
12 | use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource};
|
13 | use crate::utils::is_whitespace;
|
14 |
|
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        // Drops a UTF-8 byte order mark from the very beginning of the
        // currently buffered input, if one is there.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        if chunk.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    // An interrupted read is not a failure, just ask again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
        }

        // Asks `crate::encoding::detect_encoding` to inspect the currently
        // buffered input. When it reports an encoding, the number of bytes
        // it reports alongside is consumed and the encoding is returned.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        let detected = crate::encoding::detect_encoding(chunk);
                        if let Some((_, bom_len)) = detected {
                            self $(.$reader)? .consume(bom_len);
                        }
                        return Ok(detected.map(|(encoding, _)| encoding));
                    }
                    // An interrupted read is not a failure, just ask again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
        }

        // Copies input into `buf` until `<` or EOF is reached. The `<` itself
        // is consumed from the reader, but never stored in `buf`.
        // `position` is advanced by the number of bytes consumed.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            let text_start = buf.len();
            let mut consumed = 0u64;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(data) if data.is_empty() => break,
                    Ok(data) => data,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += consumed;
                        return ReadTextResult::Err(e);
                    }
                };

                match memchr::memchr(b'<', chunk) {
                    // `<` is the very first byte of the input: there is no text
                    // at all, so hand the untouched buffer back for the markup.
                    // Once something was read, the text collected so far must
                    // be reported first, which the next arm takes care of
                    Some(0) if consumed == 0 => {
                        self $(.$reader)? .consume(1);
                        *position += 1;
                        return ReadTextResult::Markup(buf);
                    }
                    Some(lt) => {
                        buf.extend_from_slice(&chunk[..lt]);

                        // +1 for `<` which is consumed, but not stored
                        let taken = lt + 1;
                        self $(.$reader)? .consume(taken);
                        consumed += taken as u64;

                        *position += consumed;
                        return ReadTextResult::UpToMarkup(&buf[text_start..]);
                    }
                    None => {
                        buf.extend_from_slice(chunk);

                        let taken = chunk.len();
                        self $(.$reader)? .consume(taken);
                        consumed += taken as u64;
                    }
                }
            }

            *position += consumed;
            ReadTextResult::UpToEof(&buf[text_start..])
        }

        // Feeds input to `parser` chunk by chunk until it reports the index of
        // the terminating byte. Everything before that byte is appended to
        // `buf`; the terminating byte is consumed from the reader, but not
        // stored. Reaching EOF first results in `P::eof_error()`.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            let content_start = buf.len();
            let mut consumed = 0u64;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(data) if data.is_empty() => break,
                    Ok(data) => data,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += consumed;
                        return Err(Error::Io(e.into()));
                    }
                };

                match parser.feed(chunk) {
                    Some(gt) => {
                        buf.extend_from_slice(&chunk[..gt]);

                        // +1 for `>` which we do not include
                        self $(.$reader)? .consume(gt + 1);
                        consumed += gt as u64 + 1;

                        *position += consumed;
                        return Ok(&buf[content_start..]);
                    }
                    // The `>` symbol not yet found, continue reading
                    None => {
                        buf.extend_from_slice(chunk);

                        let taken = chunk.len();
                        self $(.$reader)? .consume(taken);
                        consumed += taken as u64;
                    }
                }
            }

            *position += consumed;
            Err(Error::Syntax(P::eof_error()))
        }

        // Reads a construct that starts with `!` into `buf`. The kind of the
        // construct is determined by `BangType::new` from the byte that
        // follows the `!`, and `BangType::parse` decides where it ends.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // The caller already peeked the bang ('!'), so the input is
            // guaranteed to start with it. It is counted as read right away.
            let element_start = buf.len();
            let mut consumed = 1u64;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // EOF before the construct was closed: leave the loop and
                    // report the error specific to this kind of construct
                    Ok(data) if data.is_empty() => break,
                    Ok(data) => data,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += consumed;
                        return Err(Error::Io(e.into()));
                    }
                };

                // Only the part of the buffer that belongs to this element is
                // given to the parser; whatever was in the buffer before the
                // bang must not be considered
                if let Some((payload, taken)) = bang_type.parse(&buf[element_start..], chunk) {
                    buf.extend_from_slice(payload);

                    self $(.$reader)? .consume(taken);
                    consumed += taken as u64;

                    *position += consumed;
                    return Ok((bang_type, &buf[element_start..]));
                } else {
                    buf.extend_from_slice(chunk);

                    let taken = chunk.len();
                    self $(.$reader)? .consume(taken);
                    consumed += taken as u64;
                }
            }

            *position += consumed;
            Err(bang_type.to_err().into())
        }

        // Consumes whitespace bytes until a non-whitespace byte or EOF is
        // reached; `position` is advanced by the number of skipped bytes.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                let skipped = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Length of the whitespace run at the start of the chunk
                    Ok(chunk) => chunk.iter().take_while(|b| is_whitespace(**b)).count(),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                };

                // Either EOF or the next byte is not a whitespace
                if skipped == 0 {
                    return Ok(());
                }
                self $(.$reader)? .consume(skipped);
                *position += skipped as u64;
            }
        }

        // Returns the next byte of the input without consuming it,
        // or `None` if nothing is left.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => return Ok(chunk.first().copied()),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
        }
    };
}
|
230 |
|
// Make it public for use in async implementations.
// Newer versions of rustc report
// > warning: the item `impl_buffered_source` is imported redundantly
// so it is made public only when the async feature is enabled
|
235 | #[cfg (feature = "async-tokio" )]
|
236 | pub(super) use impl_buffered_source;
|
237 |
|
238 | /// Implementation of `XmlSource` for any `BufRead` reader using a user-given
|
239 | /// `Vec<u8>` as buffer that will be borrowed by events.
|
240 | impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
|
241 | impl_buffered_source!();
|
242 | }
|
243 |
|
244 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
245 |
|
246 | /// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
|
247 | impl<R: BufRead> Reader<R> {
|
248 | /// Reads the next `Event`.
|
249 | ///
|
250 | /// This is the main entry point for reading XML `Event`s.
|
251 | ///
|
252 | /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
|
253 | /// internally).
|
254 | ///
|
255 | /// Having the possibility to control the internal buffers gives you some additional benefits
|
256 | /// such as:
|
257 | ///
|
258 | /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
|
259 | /// you can call `buf.clear()` once you are done with processing the event (typically at the
|
260 | /// end of your loop).
|
261 | /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
|
262 | ///
|
263 | /// # Examples
|
264 | ///
|
265 | /// ```
|
266 | /// # use pretty_assertions::assert_eq;
|
267 | /// use quick_xml::events::Event;
|
268 | /// use quick_xml::reader::Reader;
|
269 | ///
|
270 | /// let xml = r#"<tag1 att1 = "test">
|
271 | /// <tag2><!--Test comment-->Test</tag2>
|
272 | /// <tag2>Test 2</tag2>
|
273 | /// </tag1>"# ;
|
274 | /// let mut reader = Reader::from_str(xml);
|
275 | /// reader.config_mut().trim_text(true);
|
276 | /// let mut count = 0;
|
277 | /// let mut buf = Vec::new();
|
278 | /// let mut txt = Vec::new();
|
279 | /// loop {
|
280 | /// match reader.read_event_into(&mut buf) {
|
281 | /// Ok(Event::Start(_)) => count += 1,
|
282 | /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
|
283 | /// Err(e) => panic!("Error at position {}: {:?}" , reader.error_position(), e),
|
284 | /// Ok(Event::Eof) => break,
|
285 | /// _ => (),
|
286 | /// }
|
287 | /// buf.clear();
|
288 | /// }
|
289 | /// assert_eq!(count, 3);
|
290 | /// assert_eq!(txt, vec!["Test" .to_string(), "Test 2" .to_string()]);
|
291 | /// ```
|
292 | #[inline ]
|
293 | pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
|
294 | self.read_event_impl(buf)
|
295 | }
|
296 |
|
297 | /// Reads until end element is found using provided buffer as intermediate
|
298 | /// storage for events content. This function is supposed to be called after
|
299 | /// you already read a [`Start`] event.
|
300 | ///
|
301 | /// Returns a span that cover content between `>` of an opening tag and `<` of
|
302 | /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
|
303 | /// this method was called after reading expanded [`Start`] event.
|
304 | ///
|
305 | /// Manages nested cases where parent and child elements have the _literally_
|
306 | /// same name.
|
307 | ///
|
308 | /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
|
309 | /// will be returned. In particularly, that error will be returned if you call
|
310 | /// this method without consuming the corresponding [`Start`] event first.
|
311 | ///
|
312 | /// If your reader created from a string slice or byte array slice, it is
|
313 | /// better to use [`read_to_end()`] method, because it will not copy bytes
|
314 | /// into intermediate buffer.
|
315 | ///
|
316 | /// The provided `buf` buffer will be filled only by one event content at time.
|
317 | /// Before reading of each event the buffer will be cleared. If you know an
|
318 | /// appropriate size of each event, you can preallocate the buffer to reduce
|
319 | /// number of reallocations.
|
320 | ///
|
321 | /// The `end` parameter should contain name of the end element _in the reader
|
322 | /// encoding_. It is good practice to always get that parameter using
|
323 | /// [`BytesStart::to_end()`] method.
|
324 | ///
|
325 | /// The correctness of the skipped events does not checked, if you disabled
|
326 | /// the [`check_end_names`] option.
|
327 | ///
|
328 | /// # Namespaces
|
329 | ///
|
330 | /// While the `Reader` does not support namespace resolution, namespaces
|
331 | /// does not change the algorithm for comparing names. Although the names
|
332 | /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
|
333 | /// same namespace, are semantically equivalent, `</b:name>` cannot close
|
334 | /// `<a:name>`, because according to [the specification]
|
335 | ///
|
336 | /// > The end of every element that begins with a **start-tag** MUST be marked
|
337 | /// > by an **end-tag** containing a name that echoes the element's type as
|
338 | /// > given in the **start-tag**
|
339 | ///
|
340 | /// # Examples
|
341 | ///
|
342 | /// This example shows, how you can skip XML content after you read the
|
343 | /// start event.
|
344 | ///
|
345 | /// ```
|
346 | /// # use pretty_assertions::assert_eq;
|
347 | /// use quick_xml::events::{BytesStart, Event};
|
348 | /// use quick_xml::reader::Reader;
|
349 | ///
|
350 | /// let mut reader = Reader::from_str(r#"
|
351 | /// <outer>
|
352 | /// <inner>
|
353 | /// <inner></inner>
|
354 | /// <inner/>
|
355 | /// <outer></outer>
|
356 | /// <outer/>
|
357 | /// </inner>
|
358 | /// </outer>
|
359 | /// "# );
|
360 | /// reader.config_mut().trim_text(true);
|
361 | /// let mut buf = Vec::new();
|
362 | ///
|
363 | /// let start = BytesStart::new("outer" );
|
364 | /// let end = start.to_end().into_owned();
|
365 | ///
|
366 | /// // First, we read a start event...
|
367 | /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
|
368 | ///
|
369 | /// // ...then, we could skip all events to the corresponding end event.
|
370 | /// // This call will correctly handle nested <outer> elements.
|
371 | /// // Note, however, that this method does not handle namespaces.
|
372 | /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
|
373 | ///
|
374 | /// // At the end we should get an Eof event, because we ate the whole XML
|
375 | /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
|
376 | /// ```
|
377 | ///
|
378 | /// [`Start`]: Event::Start
|
379 | /// [`End`]: Event::End
|
380 | /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
|
381 | /// [`read_to_end()`]: Self::read_to_end
|
382 | /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
|
383 | /// [`check_end_names`]: crate::reader::Config::check_end_names
|
384 | /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
|
385 | pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
|
386 | Ok(read_to_end!(self, end, buf, read_event_impl, {
|
387 | buf.clear();
|
388 | }))
|
389 | }
|
390 | }
|
391 |
|
392 | impl Reader<BufReader<File>> {
|
393 | /// Creates an XML reader from a file path.
|
394 | pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
395 | let file: File = File::open(path)?;
|
396 | let reader: BufReader = BufReader::new(inner:file);
|
397 | Ok(Self::from_reader(reader))
|
398 | }
|
399 | }
|
400 |
|
#[cfg(test)]
mod test {
    use crate::reader::{test::check, XmlSource};

    /// Buffer constructor handed to `check!`: gives its argument back
    /// unchanged, so the byte array from the test is used as is.
    fn identity<T>(value: T) -> T {
        value
    }

    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );
}
|
419 | |