slice_reader.rs source code [crates/quick_xml/src/reader/slice_reader.rs]

1	//! This is an implementation of [`Reader`] for reading from a `&[u8]` as
2	//! underlying byte stream. This implementation supports not using an
3	//! intermediate buffer as the byte slice itself can be used to borrow from.
4
5	use std::borrow::Cow;
6	use std::io;
7
8	#[cfg(feature = "encoding")]
9	use crate::reader::EncodingRef;
10	#[cfg(feature = "encoding")]
11	use encoding_rs::{Encoding, UTF_8};
12
13	use crate::errors::{Error, Result};
14	use crate::events::Event;
15	use crate::name::QName;
16	use crate::parser::Parser;
17	use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource};
18	use crate::utils::is_whitespace;
19
20	/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
21	/// This implementation supports not using an intermediate buffer as the byte slice
22	/// itself can be used to borrow from.
23	impl<'a> Reader<&'a [u8]> {
24	/// Creates an XML reader from a string slice.
25	#[allow(clippy::should_implement_trait)]
26	pub fn from_str(s: &'a str) -> Self {
27	// Rust strings are guaranteed to be UTF-8, so lock the encoding
28	#[cfg(feature = "encoding")]
29	{
30	let mut reader = Self::from_reader(s.as_bytes());
31	reader.state.encoding = EncodingRef::Explicit(UTF_8);
32	reader
33	}
34
35	#[cfg(not(feature = "encoding"))]
36	Self::from_reader(s.as_bytes())
37	}
38
39	/// Read an event that borrows from the input rather than a buffer.
40	///
41	/// There is no asynchronous `read_event_async()` version of this function,
42	/// because it is not necessary -- the contents are already in memory and no IO
43	/// is needed, therefore there is no potential for blocking.
44	///
45	/// # Examples
46	///
47	/// ```
48	/// # use pretty_assertions::assert_eq;
49	/// use quick_xml::events::Event;
50	/// use quick_xml::reader::Reader;
51	///
52	/// let mut reader = Reader::from_str(r#"
53	/// <tag1 att1 = "test">
54	/// <tag2><!--Test comment-->Test</tag2>
55	/// <tag2>Test 2</tag2>
56	/// </tag1>
57	/// "#);
58	/// reader.config_mut().trim_text(`true`);
59	///
60	/// let mut count = `0`;
61	/// let mut txt = Vec::new();
62	/// loop {
63	/// match reader.read_event().unwrap() {
64	/// Event::Start(e) => count += `1`,
65	/// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
66	/// Event::Eof => break,
67	/// _ => (),
68	/// }
69	/// }
70	/// assert_eq!(count, `3`);
71	/// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
72	/// ```
73	#[inline]
74	pub fn read_event(&mut self) -> Result<Event<'a>> {
75	self.read_event_impl(())
76	}
77
78	/// Reads until end element is found. This function is supposed to be called
79	/// after you already read a [`Start`] event.
80	///
81	/// Returns a span that cover content between `>` of an opening tag and `<` of
82	/// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
83	/// this method was called after reading expanded [`Start`] event.
84	///
85	/// Manages nested cases where parent and child elements have the _literally_
86	/// same name.
87	///
88	/// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
89	/// will be returned. In particularly, that error will be returned if you call
90	/// this method without consuming the corresponding [`Start`] event first.
91	///
92	/// The `end` parameter should contain name of the end element _in the reader
93	/// encoding_. It is good practice to always get that parameter using
94	/// [`BytesStart::to_end()`] method.
95	///
96	/// The correctness of the skipped events does not checked, if you disabled
97	/// the [`check_end_names`] option.
98	///
99	/// There is no asynchronous `read_to_end_async()` version of this function,
100	/// because it is not necessary -- the contents are already in memory and no IO
101	/// is needed, therefore there is no potential for blocking.
102	///
103	/// # Namespaces
104	///
105	/// While the `Reader` does not support namespace resolution, namespaces
106	/// does not change the algorithm for comparing names. Although the names
107	/// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
108	/// same namespace, are semantically equivalent, `</b:name>` cannot close
109	/// `<a:name>`, because according to [the specification]
110	///
111	/// > The end of every element that begins with a start-tag* MUST be marked*
112	/// > by an end-tag* containing a name that echoes the element's type as*
113	/// > given in the start-tag
114	///
115	/// # Examples
116	///
117	/// This example shows, how you can skip XML content after you read the
118	/// start event.
119	///
120	/// ```
121	/// # use pretty_assertions::assert_eq;
122	/// use quick_xml::events::{BytesStart, Event};
123	/// use quick_xml::reader::Reader;
124	///
125	/// let mut reader = Reader::from_str(r#"
126	/// <outer>
127	/// <inner>
128	/// <inner></inner>
129	/// <inner/>
130	/// <outer></outer>
131	/// <outer/>
132	/// </inner>
133	/// </outer>
134	/// "#);
135	/// reader.config_mut().trim_text(`true`);
136	///
137	/// let start = BytesStart::new("outer");
138	/// let end = start.to_end().into_owned();
139	///
140	/// // First, we read a start event...
141	/// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
142	///
143	/// // ...then, we could skip all events to the corresponding end event.
144	/// // This call will correctly handle nested <outer> elements.
145	/// // Note, however, that this method does not handle namespaces.
146	/// reader.read_to_end(end.name()).unwrap();
147	///
148	/// // At the end we should get an Eof event, because we ate the whole XML
149	/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
150	/// ```
151	///
152	/// [`Start`]: Event::Start
153	/// [`End`]: Event::End
154	/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
155	/// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
156	/// [`check_end_names`]: crate::reader::Config::check_end_names
157	/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
158	pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
159	Ok(read_to_end!(self, end, (), read_event_impl, {}))
160	}
161
162	/// Reads content between start and end tags, including any markup. This
163	/// function is supposed to be called after you already read a [`Start`] event.
164	///
165	/// Manages nested cases where parent and child elements have the _literally_
166	/// same name.
167	///
168	/// This method does not unescape read data, instead it returns content
169	/// "as is" of the XML document. This is because it has no idea what text
170	/// it reads, and if, for example, it contains CDATA section, attempt to
171	/// unescape it content will spoil data.
172	///
173	/// Any text will be decoded using the XML current [`decoder()`].
174	///
175	/// Actually, this method perform the following code:
176	///
177	/// ```ignore
178	/// let span = reader.read_to_end(end)?;
179	/// let text = reader.decoder().decode(&reader.inner_slice[span]);
180	/// ```
181	///
182	/// # Examples
183	///
184	/// This example shows, how you can read a HTML content from your XML document.
185	///
186	/// ```
187	/// # use pretty_assertions::assert_eq;
188	/// # use std::borrow::Cow;
189	/// use quick_xml::events::{BytesStart, Event};
190	/// use quick_xml::reader::Reader;
191	///
192	/// let mut reader = Reader::from_str("
193	/// <html>
194	/// <title>This is a HTML text</title>
195	/// <p>Usual XML rules does not apply inside it
196	/// <p>For example, elements not needed to be "closed"
197	/// </html>
198	/// ");
199	/// reader.config_mut().trim_text(`true`);
200	///
201	/// let start = BytesStart::new("html");
202	/// let end = start.to_end().into_owned();
203	///
204	/// // First, we read a start event...
205	/// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
206	/// // ...and disable checking of end names because we expect HTML further...
207	/// reader.config_mut().check_end_names = `false`;
208	///
209	/// // ...then, we could read text content until close tag.
210	/// // This call will correctly handle nested <html> elements.
211	/// let text = reader.read_text(end.name()).unwrap();
212	/// assert_eq!(text, Cow::Borrowed(r#"
213	/// <title>This is a HTML text</title>
214	/// <p>Usual XML rules does not apply inside it
215	/// <p>For example, elements not needed to be "closed"
216	/// "#));
217	/// assert!(matches!(text, Cow::Borrowed(_)));
218	///
219	/// // Now we can enable checks again
220	/// reader.config_mut().check_end_names = `true`;
221	///
222	/// // At the end we should get an Eof event, because we ate the whole XML
223	/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
224	/// ```
225	///
226	/// [`Start`]: Event::Start
227	/// [`decoder()`]: Self::decoder()
228	pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
229	// self.reader will be changed, so store original reference
230	let buffer = self.reader;
231	let span = self.read_to_end(end)?;
232
233	let len = span.end - span.start;
234	// SAFETY: `span` can only contain indexes up to usize::MAX because it
235	// was created from offsets from a single &[u8] slice
236	Ok(self.decoder().decode(&buffer[`0`..len as usize])?)
237	}
238	}
239
240	////////////////////////////////////////////////////////////////////////////////////////////////////
241
242	/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
243	/// that will be borrowed by events. This implementation provides a zero-copy deserialization
244	impl<'a> XmlSource<'a, ()> for &'a [u8] {
245	#[cfg(not(feature = "encoding"))]
246	#[inline]
247	fn remove_utf8_bom(&mut self) -> io::Result<()> {
248	if self.starts_with(crate::encoding::UTF8_BOM) {
249	self = &self[crate*::encoding::UTF8_BOM.len()..];
250	}
251	Ok(())
252	}
253
254	#[cfg(feature = "encoding")]
255	#[inline]
256	fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>> {
257	if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
258	*self = &self[bom_len..];
259	return Ok(Some(enc));
260	}
261	Ok(None)
262	}
263
264	#[inline]
265	fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> {
266	match memchr::memchr(b'<', self) {
267	Some(`0`) => {
268	*position += `1`;
269	*self = &self[`1`..];
270	ReadTextResult::Markup(())
271	}
272	Some(i) => {
273	position += i as u64* + `1`;
274	let bytes = &self[..i];
275	*self = &self[i + `1`..];
276	ReadTextResult::UpToMarkup(bytes)
277	}
278	None => {
279	position += self.len() as u64*;
280	let bytes = &self[..];
281	*self = &[];
282	ReadTextResult::UpToEof(bytes)
283	}
284	}
285	}
286
287	#[inline]
288	fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
289	where
290	P: Parser,
291	{
292	if let Some(i) = parser.feed(self) {
293	// +1 for `>` which we do not include
294	position += i as u64* + `1`;
295	let bytes = &self[..i];
296	*self = &self[i + `1`..];
297	return Ok(bytes);
298	}
299
300	position += self.len() as u64*;
301	Err(Error::Syntax(P::eof_error()))
302	}
303
304	#[inline]
305	fn read_bang_element(&mut self, _buf: (), position: &mut u64) -> Result<(BangType, &'a [u8])> {
306	// Peeked one bang ('!') before being called, so it's guaranteed to
307	// start with it.
308	debug_assert_eq!(self[`0`], b'!');
309
310	let mut bang_type = BangType::new(self[`1`..].first().copied())?;
311
312	if let Some((bytes, i)) = bang_type.parse(&[], self) {
313	position += i as u64*;
314	*self = &self[i..];
315	return Ok((bang_type, bytes));
316	}
317
318	position += self.len() as u64*;
319	Err(bang_type.to_err().into())
320	}
321
322	#[inline]
323	fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
324	let whitespaces = self
325	.iter()
326	.position(\|b\| !is_whitespace(*b))
327	.unwrap_or(self.len());
328	position += whitespaces as u64*;
329	*self = &self[whitespaces..];
330	Ok(())
331	}
332
333	#[inline]
334	fn peek_one(&mut self) -> io::Result<Option<u8>> {
335	Ok(self.first().copied())
336	}
337	}
338
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Buffer constructor handed to `check!`: it returns the test input
    /// unchanged.
    fn identity<T>(value: T) -> T {
        value
    }

    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );
}
357