buffered_reader.rs source code [crates/quick_xml/src/reader/buffered_reader.rs]

//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
//! the underlying byte stream.
3
4	use std::fs::File;
5	use std::io::{self, BufRead, BufReader};
6	use std::path::Path;
7
8	use crate::errors::{Error, Result};
9	use crate::events::Event;
10	use crate::name::QName;
11	use crate::parser::Parser;
12	use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource};
13	use crate::utils::is_whitespace;
14
// Generates the bodies of the buffered-source methods.
//
// Invoked without arguments, every method is a plain synchronous `fn` that
// calls `fill_buf` / `consume` directly on `self`. Invoked with
// `($lf, $reader, $async, $await)`, the same bodies are emitted with:
// - `$lf`     spliced in as a generic lifetime parameter on the methods that
//             borrow the output buffer;
// - `$reader` spliced in as a field/tuple access (`self.$reader`) to reach the
//             underlying reader;
// - `$async`  spliced in before `fn`;
// - `$await`  spliced in as `.$await` after every `fill_buf()` / `peek_one()`.
//
// Every method retries `fill_buf` when it fails with
// `io::ErrorKind::Interrupted` and reports any other I/O error to the caller.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        // Consumes a UTF-8 byte order mark if the buffered data starts with one.
        // Only compiled when the `encoding` feature is disabled.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        if n.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        Ok(())
                    },
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        // Asks `crate::encoding::detect_encoding` to inspect the buffered data.
        // When it reports an encoding, the reported BOM length is consumed and
        // the encoding is returned; otherwise nothing is consumed and `None`
        // is returned. Only compiled when the `encoding` feature is enabled.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
                        self $(.$reader)? .consume(bom_len);
                        Ok(Some(enc))
                    } else {
                        Ok(None)
                    },
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        // Copies bytes into `buf` until a `<` or the end of input is reached.
        //
        // - `Markup(buf)`: the very first available byte was `<`; only that
        //   byte is consumed and nothing is appended to `buf`.
        // - `UpToMarkup(..)`: a `<` was found later; the bytes before it are
        //   appended to `buf` and the `<` itself is consumed as well.
        // - `UpToEof(..)`: the input ended before any `<` was seen.
        // - `Err(..)`: a non-`Interrupted` I/O error occurred.
        //
        // `position` is advanced by the number of bytes consumed from the
        // reader, including on the error path.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            let mut read = 0;
            // Only the part of `buf` written by this call is returned
            let start = buf.len();
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return ReadTextResult::Err(e);
                    }
                };

                match memchr::memchr(b'<', available) {
                    // Special handling is needed only on the first iteration.
                    // On next iterations we already read something and should emit Text event
                    Some(0) if read == 0 => {
                        self $(.$reader)? .consume(1);
                        *position += 1;
                        return ReadTextResult::Markup(buf);
                    }
                    Some(i) => {
                        buf.extend_from_slice(&available[..i]);

                        // +1 for `<` which is consumed but not stored
                        let used = i + 1;
                        self $(.$reader)? .consume(used);
                        read += used as u64;

                        *position += read;
                        return ReadTextResult::UpToMarkup(&buf[start..]);
                    }
                    None => {
                        buf.extend_from_slice(available);

                        let used = available.len();
                        self $(.$reader)? .consume(used);
                        read += used as u64;
                    }
                }
            }

            *position += read;
            ReadTextResult::UpToEof(&buf[start..])
        }

        // Copies bytes into `buf` until `parser.feed` reports the index of the
        // terminating byte in the currently buffered chunk.
        //
        // The terminating byte is consumed from the reader but not stored in
        // `buf`. Returns the part of `buf` written by this call. If the input
        // ends first, `Error::Syntax(P::eof_error())` is returned. `position`
        // is advanced by the number of bytes consumed in every case.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            let mut read = 0;
            let start = buf.len();
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };

                if let Some(i) = parser.feed(available) {
                    buf.extend_from_slice(&available[..i]);

                    // +1 for `>` which we do not include
                    self $(.$reader)? .consume(i + 1);
                    read += i as u64 + 1;

                    *position += read;
                    return Ok(&buf[start..]);
                }

                // The `>` symbol not yet found, continue reading
                buf.extend_from_slice(available);

                let used = available.len();
                self $(.$reader)? .consume(used);
                read += used as u64;
            }

            *position += read;
            Err(Error::Syntax(P::eof_error()))
        }

        // Reads a construct that starts with `!` into `buf`.
        //
        // The leading `!` is pushed to `buf` and consumed, then the next byte
        // is peeked to choose the `BangType`. Data is accumulated until
        // `bang_type.parse` reports completion, at which point the kind and
        // the part of `buf` written by this call are returned. If the input
        // ends first, the error produced by `bang_type.to_err()` is returned.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // Peeked one bang ('!') before being called, so it's guaranteed to
            // start with it.
            let start = buf.len();
            // Starts at 1 to account for the `!` consumed just below
            let mut read = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Note: Do not update position, so the error points to
                    // somewhere sane rather than at the EOF
                    Ok(n) if n.is_empty() => break,
                    Ok(available) => {
                        // We only parse from start because we don't want to consider
                        // whatever is in the buffer before the bang element
                        if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) {
                            buf.extend_from_slice(consumed);

                            self $(.$reader)? .consume(used);
                            read += used as u64;

                            *position += read;
                            return Ok((bang_type, &buf[start..]));
                        } else {
                            buf.extend_from_slice(available);

                            let used = available.len();
                            self $(.$reader)? .consume(used);
                            read += used as u64;
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                }
            }

            *position += read;
            Err(bang_type.to_err().into())
        }

        // Consumes leading whitespace bytes (as classified by `is_whitespace`)
        // and advances `position` by the number of bytes skipped. Stops at the
        // first non-whitespace byte or when the reader yields an empty buffer.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
                        if count > 0 {
                            self $(.$reader)? .consume(count);
                            *position += count as u64;
                            // The whole chunk may have been whitespace, so look at the next one
                            continue;
                        } else {
                            Ok(())
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        // Returns the next buffered byte without consuming it, or `None` when
        // the reader yields an empty buffer.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => Ok(n.first().cloned()),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }
    };
}
230
// Make it public for use in async implementations.
// Newer rustc versions report
// > warning: the item `impl_buffered_source` is imported redundantly
// so re-export it only when the async feature is enabled.
235	#[cfg(feature = "async-tokio")]
236	pub(super) use impl_buffered_source;
237
238	/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
239	/// `Vec<u8>` as buffer that will be borrowed by events.
240	impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
241	impl_buffered_source!();
242	}
243
244	////////////////////////////////////////////////////////////////////////////////////////////////////
245
246	/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
247	impl<R: BufRead> Reader<R> {
248	/// Reads the next `Event`.
249	///
250	/// This is the main entry point for reading XML `Event`s.
251	///
252	/// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
253	/// internally).
254	///
255	/// Having the possibility to control the internal buffers gives you some additional benefits
256	/// such as:
257	///
258	/// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
259	/// you can call `buf.clear()` once you are done with processing the event (typically at the
260	/// end of your loop).
261	/// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
262	///
263	/// # Examples
264	///
265	/// ```
266	/// # use pretty_assertions::assert_eq;
267	/// use quick_xml::events::Event;
268	/// use quick_xml::reader::Reader;
269	///
270	/// let xml = r#"<tag1 att1 = "test">
271	/// <tag2><!--Test comment-->Test</tag2>
272	/// <tag2>Test 2</tag2>
273	/// </tag1>"#;
274	/// let mut reader = Reader::from_str(xml);
275	/// reader.config_mut().trim_text(`true`);
276	/// let mut count = `0`;
277	/// let mut buf = Vec::new();
278	/// let mut txt = Vec::new();
279	/// loop {
280	/// match reader.read_event_into(&mut buf) {
281	/// Ok(Event::Start(_)) => count += `1`,
282	/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
283	/// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
284	/// Ok(Event::Eof) => break,
285	/// _ => (),
286	/// }
287	/// buf.clear();
288	/// }
289	/// assert_eq!(count, `3`);
290	/// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
291	/// ```
292	#[inline]
293	pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
294	self.read_event_impl(buf)
295	}
296
297	/// Reads until end element is found using provided buffer as intermediate
298	/// storage for events content. This function is supposed to be called after
299	/// you already read a [`Start`] event.
300	///
301	/// Returns a span that cover content between `>` of an opening tag and `<` of
302	/// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
303	/// this method was called after reading expanded [`Start`] event.
304	///
305	/// Manages nested cases where parent and child elements have the _literally_
306	/// same name.
307	///
308	/// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
309	/// will be returned. In particularly, that error will be returned if you call
310	/// this method without consuming the corresponding [`Start`] event first.
311	///
312	/// If your reader created from a string slice or byte array slice, it is
313	/// better to use [`read_to_end()`] method, because it will not copy bytes
314	/// into intermediate buffer.
315	///
316	/// The provided `buf` buffer will be filled only by one event content at time.
317	/// Before reading of each event the buffer will be cleared. If you know an
318	/// appropriate size of each event, you can preallocate the buffer to reduce
319	/// number of reallocations.
320	///
321	/// The `end` parameter should contain name of the end element _in the reader
322	/// encoding_. It is good practice to always get that parameter using
323	/// [`BytesStart::to_end()`] method.
324	///
325	/// The correctness of the skipped events does not checked, if you disabled
326	/// the [`check_end_names`] option.
327	///
328	/// # Namespaces
329	///
330	/// While the `Reader` does not support namespace resolution, namespaces
331	/// does not change the algorithm for comparing names. Although the names
332	/// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
333	/// same namespace, are semantically equivalent, `</b:name>` cannot close
334	/// `<a:name>`, because according to [the specification]
335	///
336	/// > The end of every element that begins with a start-tag* MUST be marked*
337	/// > by an end-tag* containing a name that echoes the element's type as*
338	/// > given in the start-tag
339	///
340	/// # Examples
341	///
342	/// This example shows, how you can skip XML content after you read the
343	/// start event.
344	///
345	/// ```
346	/// # use pretty_assertions::assert_eq;
347	/// use quick_xml::events::{BytesStart, Event};
348	/// use quick_xml::reader::Reader;
349	///
350	/// let mut reader = Reader::from_str(r#"
351	/// <outer>
352	/// <inner>
353	/// <inner></inner>
354	/// <inner/>
355	/// <outer></outer>
356	/// <outer/>
357	/// </inner>
358	/// </outer>
359	/// "#);
360	/// reader.config_mut().trim_text(`true`);
361	/// let mut buf = Vec::new();
362	///
363	/// let start = BytesStart::new("outer");
364	/// let end = start.to_end().into_owned();
365	///
366	/// // First, we read a start event...
367	/// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
368	///
369	/// // ...then, we could skip all events to the corresponding end event.
370	/// // This call will correctly handle nested <outer> elements.
371	/// // Note, however, that this method does not handle namespaces.
372	/// reader.read_to_end_into(end.name(), &mut buf).unwrap();
373	///
374	/// // At the end we should get an Eof event, because we ate the whole XML
375	/// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
376	/// ```
377	///
378	/// [`Start`]: Event::Start
379	/// [`End`]: Event::End
380	/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
381	/// [`read_to_end()`]: Self::read_to_end
382	/// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
383	/// [`check_end_names`]: crate::reader::Config::check_end_names
384	/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
385	pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
386	Ok(read_to_end!(self, end, buf, read_event_impl, {
387	buf.clear();
388	}))
389	}
390	}
391
392	impl Reader<BufReader<File>> {
393	/// Creates an XML reader from a file path.
394	pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
395	let file: File = File::open(path)?;
396	let reader: BufReader = BufReader::new(inner:file);
397	Ok(Self::from_reader(reader))
398	}
399	}
400
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor that just passes through the byte array
    /// supplied by the test.
    fn identity<T>(input: T) -> T {
        input
    }

    // Instantiates the shared reader test-suite for the buffered
    // implementation, using a fresh `Vec<u8>` as the event buffer.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );
}
419