slice_reader.rs source code [crates/quick-xml-0.31.0/src/reader/slice_reader.rs]

1	//! This is an implementation of [`Reader`] for reading from a `&[u8]` as
2	//! underlying byte stream. This implementation supports not using an
3	//! intermediate buffer as the byte slice itself can be used to borrow from.
4
5	use std::borrow::Cow;
6
7	#[cfg(feature = "encoding")]
8	use crate::reader::EncodingRef;
9	#[cfg(feature = "encoding")]
10	use encoding_rs::{Encoding, UTF_8};
11
12	use crate::errors::{Error, Result};
13	use crate::events::Event;
14	use crate::name::QName;
15	use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
16
17	use memchr;
18
19	/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
20	/// This implementation supports not using an intermediate buffer as the byte slice
21	/// itself can be used to borrow from.
22	impl<'a> Reader<&'a [u8]> {
23	/// Creates an XML reader from a string slice.
24	#[allow(clippy::should_implement_trait)]
25	pub fn from_str(s: &'a str) -> Self {
26	// Rust strings are guaranteed to be UTF-8, so lock the encoding
27	#[cfg(feature = "encoding")]
28	{
29	let mut reader = Self::from_reader(s.as_bytes());
30	reader.state.encoding = EncodingRef::Explicit(UTF_8);
31	reader
32	}
33
34	#[cfg(not(feature = "encoding"))]
35	Self::from_reader(s.as_bytes())
36	}
37
38	/// Read an event that borrows from the input rather than a buffer.
39	///
40	/// There is no asynchronous `read_event_async()` version of this function,
41	/// because it is not necessary -- the contents are already in memory and no IO
42	/// is needed, therefore there is no potential for blocking.
43	///
44	/// # Examples
45	///
46	/// ```
47	/// # use pretty_assertions::assert_eq;
48	/// use quick_xml::events::Event;
49	/// use quick_xml::reader::Reader;
50	///
51	/// let mut reader = Reader::from_str(r#"
52	/// <tag1 att1 = "test">
53	/// <tag2><!--Test comment-->Test</tag2>
54	/// <tag2>Test 2</tag2>
55	/// </tag1>
56	/// "#);
57	/// reader.trim_text(`true`);
58	///
59	/// let mut count = `0`;
60	/// let mut txt = Vec::new();
61	/// loop {
62	/// match reader.read_event().unwrap() {
63	/// Event::Start(e) => count += `1`,
64	/// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
65	/// Event::Eof => break,
66	/// _ => (),
67	/// }
68	/// }
69	/// assert_eq!(count, `3`);
70	/// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
71	/// ```
72	#[inline]
73	pub fn read_event(&mut self) -> Result<Event<'a>> {
74	self.read_event_impl(())
75	}
76
77	/// Reads until end element is found. This function is supposed to be called
78	/// after you already read a [`Start`] event.
79	///
80	/// Returns a span that cover content between `>` of an opening tag and `<` of
81	/// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
82	/// this method was called after reading expanded [`Start`] event.
83	///
84	/// Manages nested cases where parent and child elements have the _literally_
85	/// same name.
86	///
87	/// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
88	/// will be returned. In particularly, that error will be returned if you call
89	/// this method without consuming the corresponding [`Start`] event first.
90	///
91	/// The `end` parameter should contain name of the end element _in the reader
92	/// encoding_. It is good practice to always get that parameter using
93	/// [`BytesStart::to_end()`] method.
94	///
95	/// The correctness of the skipped events does not checked, if you disabled
96	/// the [`check_end_names`] option.
97	///
98	/// There is no asynchronous `read_to_end_async()` version of this function,
99	/// because it is not necessary -- the contents are already in memory and no IO
100	/// is needed, therefore there is no potential for blocking.
101	///
102	/// # Namespaces
103	///
104	/// While the `Reader` does not support namespace resolution, namespaces
105	/// does not change the algorithm for comparing names. Although the names
106	/// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
107	/// same namespace, are semantically equivalent, `</b:name>` cannot close
108	/// `<a:name>`, because according to [the specification]
109	///
110	/// > The end of every element that begins with a start-tag* MUST be marked*
111	/// > by an end-tag* containing a name that echoes the element's type as*
112	/// > given in the start-tag
113	///
114	/// # Examples
115	///
116	/// This example shows, how you can skip XML content after you read the
117	/// start event.
118	///
119	/// ```
120	/// # use pretty_assertions::assert_eq;
121	/// use quick_xml::events::{BytesStart, Event};
122	/// use quick_xml::reader::Reader;
123	///
124	/// let mut reader = Reader::from_str(r#"
125	/// <outer>
126	/// <inner>
127	/// <inner></inner>
128	/// <inner/>
129	/// <outer></outer>
130	/// <outer/>
131	/// </inner>
132	/// </outer>
133	/// "#);
134	/// reader.trim_text(`true`);
135	///
136	/// let start = BytesStart::new("outer");
137	/// let end = start.to_end().into_owned();
138	///
139	/// // First, we read a start event...
140	/// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
141	///
142	/// // ...then, we could skip all events to the corresponding end event.
143	/// // This call will correctly handle nested <outer> elements.
144	/// // Note, however, that this method does not handle namespaces.
145	/// reader.read_to_end(end.name()).unwrap();
146	///
147	/// // At the end we should get an Eof event, because we ate the whole XML
148	/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
149	/// ```
150	///
151	/// [`Start`]: Event::Start
152	/// [`End`]: Event::End
153	/// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
154	/// [`expand_empty_elements`]: Self::expand_empty_elements
155	/// [`check_end_names`]: Self::check_end_names
156	/// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
157	pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
158	Ok(read_to_end!(self, end, (), read_event_impl, {}))
159	}
160
161	/// Reads content between start and end tags, including any markup. This
162	/// function is supposed to be called after you already read a [`Start`] event.
163	///
164	/// Manages nested cases where parent and child elements have the _literally_
165	/// same name.
166	///
167	/// This method does not unescape read data, instead it returns content
168	/// "as is" of the XML document. This is because it has no idea what text
169	/// it reads, and if, for example, it contains CDATA section, attempt to
170	/// unescape it content will spoil data.
171	///
172	/// Any text will be decoded using the XML current [`decoder()`].
173	///
174	/// Actually, this method perform the following code:
175	///
176	/// ```ignore
177	/// let span = reader.read_to_end(end)?;
178	/// let text = reader.decoder().decode(&reader.inner_slice[span]);
179	/// ```
180	///
181	/// # Examples
182	///
183	/// This example shows, how you can read a HTML content from your XML document.
184	///
185	/// ```
186	/// # use pretty_assertions::assert_eq;
187	/// # use std::borrow::Cow;
188	/// use quick_xml::events::{BytesStart, Event};
189	/// use quick_xml::reader::Reader;
190	///
191	/// let mut reader = Reader::from_str("
192	/// <html>
193	/// <title>This is a HTML text</title>
194	/// <p>Usual XML rules does not apply inside it
195	/// <p>For example, elements not needed to be "closed"
196	/// </html>
197	/// ");
198	/// reader.trim_text(`true`);
199	///
200	/// let start = BytesStart::new("html");
201	/// let end = start.to_end().into_owned();
202	///
203	/// // First, we read a start event...
204	/// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
205	/// // ...and disable checking of end names because we expect HTML further...
206	/// reader.check_end_names(`false`);
207	///
208	/// // ...then, we could read text content until close tag.
209	/// // This call will correctly handle nested <html> elements.
210	/// let text = reader.read_text(end.name()).unwrap();
211	/// assert_eq!(text, Cow::Borrowed(r#"
212	/// <title>This is a HTML text</title>
213	/// <p>Usual XML rules does not apply inside it
214	/// <p>For example, elements not needed to be "closed"
215	/// "#));
216	/// assert!(matches!(text, Cow::Borrowed(_)));
217	///
218	/// // Now we can enable checks again
219	/// reader.check_end_names(`true`);
220	///
221	/// // At the end we should get an Eof event, because we ate the whole XML
222	/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
223	/// ```
224	///
225	/// [`Start`]: Event::Start
226	/// [`decoder()`]: Self::decoder()
227	pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
228	// self.reader will be changed, so store original reference
229	let buffer = self.reader;
230	let span = self.read_to_end(end)?;
231
232	self.decoder().decode(&buffer[`0`..span.len()])
233	}
234	}
235
236	////////////////////////////////////////////////////////////////////////////////////////////////////
237
238	/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
239	/// that will be borrowed by events. This implementation provides a zero-copy deserialization
240	impl<'a> XmlSource<'a, ()> for &'a [u8] {
241	#[cfg(not(feature = "encoding"))]
242	fn remove_utf8_bom(&mut self) -> Result<()> {
243	if self.starts_with(crate::encoding::UTF8_BOM) {
244	self = &self[crate*::encoding::UTF8_BOM.len()..];
245	}
246	Ok(())
247	}
248
249	#[cfg(feature = "encoding")]
250	fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
251	if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
252	*self = &self[bom_len..];
253	return Ok(Some(enc));
254	}
255	Ok(None)
256	}
257
258	fn read_bytes_until(
259	&mut self,
260	byte: u8,
261	_buf: (),
262	position: &mut usize,
263	) -> Result<Option<&'a [u8]>> {
264	// search byte must be within the ascii range
265	debug_assert!(byte.is_ascii());
266	if self.is_empty() {
267	return Ok(None);
268	}
269
270	Ok(Some(if let Some(i) = memchr::memchr(byte, self) {
271	*position += i + `1`;
272	let bytes = &self[..i];
273	*self = &self[i + `1`..];
274	bytes
275	} else {
276	*position += self.len();
277	let bytes = &self[..];
278	*self = &[];
279	bytes
280	}))
281	}
282
283	fn read_bang_element(
284	&mut self,
285	_buf: (),
286	position: &mut usize,
287	) -> Result<Option<(BangType, &'a [u8])>> {
288	// Peeked one bang ('!') before being called, so it's guaranteed to
289	// start with it.
290	debug_assert_eq!(self[`0`], b'!');
291
292	let bang_type = BangType::new(self[`1`..].first().copied())?;
293
294	if let Some((bytes, i)) = bang_type.parse(&[], self) {
295	*position += i;
296	*self = &self[i..];
297	return Ok(Some((bang_type, bytes)));
298	}
299
300	// Note: Do not update position, so the error points to
301	// somewhere sane rather than at the EOF
302	Err(bang_type.to_err())
303	}
304
305	fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<Option<&'a [u8]>> {
306	if self.is_empty() {
307	return Ok(None);
308	}
309
310	let mut state = ReadElementState::Elem;
311
312	if let Some((bytes, i)) = state.change(self) {
313	// Position now just after the `>` symbol
314	*position += i;
315	*self = &self[i..];
316	return Ok(Some(bytes));
317	}
318
319	// Note: Do not update position, so the error points to a sane place
320	// rather than at the EOF.
321	Err(Error::UnexpectedEof("Element".to_string()))
322
323	// FIXME: Figure out why the other one works without UnexpectedEof
324	}
325
326	fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
327	let whitespaces = self
328	.iter()
329	.position(\|b\| !is_whitespace(*b))
330	.unwrap_or(self.len());
331	*position += whitespaces;
332	*self = &self[whitespaces..];
333	Ok(())
334	}
335
336	fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
337	// search byte must be within the ascii range
338	debug_assert!(byte.is_ascii());
339	if self.first() == Some(&byte) {
340	*self = &self[`1`..];
341	*position += `1`;
342	Ok(`true`)
343	} else {
344	Ok(`false`)
345	}
346	}
347
348	fn peek_one(&mut self) -> Result<Option<u8>> {
349	Ok(self.first().copied())
350	}
351	}
352
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Buffer constructor handed to `check!`: the slice reader needs no
    /// separate buffer, so the test input is passed through unchanged.
    fn identity<T>(input: T) -> T {
        input
    }

    // Instantiates the shared reader test-suite for the slice-backed source.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::UTF_8;
        use pretty_assertions::assert_eq;

        /// A `Reader` built with `from_str` must keep reporting UTF-8, even
        /// after it has read an XML declaration that names another encoding.
        #[test]
        fn str_always_has_utf8() {
            let mut xml = Reader::from_str("<?xml encoding='UTF-16'?>");

            // Encoding before anything was read...
            assert_eq!(xml.decoder().encoding(), UTF_8);
            // ...and after the declaration has been consumed.
            xml.read_event().unwrap();
            assert_eq!(xml.decoder().encoding(), UTF_8);

            // Nothing else is left in the input.
            assert_eq!(xml.read_event().unwrap(), Event::Eof);
        }
    }
}
392