//! This is an implementation of [`Reader`] for reading from a `&[u8]` as the
//! underlying byte stream. This implementation does not need an intermediate
//! buffer, because events can borrow directly from the byte slice itself.
4
5use std::borrow::Cow;
6
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9#[cfg(feature = "encoding")]
10use encoding_rs::{Encoding, UTF_8};
11
12use crate::errors::{Error, Result};
13use crate::events::Event;
14use crate::name::QName;
15use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
16
17use memchr;
18
19/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
20/// This implementation supports not using an intermediate buffer as the byte slice
21/// itself can be used to borrow from.
22impl<'a> Reader<&'a [u8]> {
23 /// Creates an XML reader from a string slice.
24 #[allow(clippy::should_implement_trait)]
25 pub fn from_str(s: &'a str) -> Self {
26 // Rust strings are guaranteed to be UTF-8, so lock the encoding
27 #[cfg(feature = "encoding")]
28 {
29 let mut reader = Self::from_reader(s.as_bytes());
30 reader.state.encoding = EncodingRef::Explicit(UTF_8);
31 reader
32 }
33
34 #[cfg(not(feature = "encoding"))]
35 Self::from_reader(s.as_bytes())
36 }
37
38 /// Read an event that borrows from the input rather than a buffer.
39 ///
40 /// There is no asynchronous `read_event_async()` version of this function,
41 /// because it is not necessary -- the contents are already in memory and no IO
42 /// is needed, therefore there is no potential for blocking.
43 ///
44 /// # Examples
45 ///
46 /// ```
47 /// # use pretty_assertions::assert_eq;
48 /// use quick_xml::events::Event;
49 /// use quick_xml::reader::Reader;
50 ///
51 /// let mut reader = Reader::from_str(r#"
52 /// <tag1 att1 = "test">
53 /// <tag2><!--Test comment-->Test</tag2>
54 /// <tag2>Test 2</tag2>
55 /// </tag1>
56 /// "#);
57 /// reader.trim_text(true);
58 ///
59 /// let mut count = 0;
60 /// let mut txt = Vec::new();
61 /// loop {
62 /// match reader.read_event().unwrap() {
63 /// Event::Start(e) => count += 1,
64 /// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
65 /// Event::Eof => break,
66 /// _ => (),
67 /// }
68 /// }
69 /// assert_eq!(count, 3);
70 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
71 /// ```
72 #[inline]
73 pub fn read_event(&mut self) -> Result<Event<'a>> {
74 self.read_event_impl(())
75 }
76
77 /// Reads until end element is found. This function is supposed to be called
78 /// after you already read a [`Start`] event.
79 ///
80 /// Returns a span that cover content between `>` of an opening tag and `<` of
81 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
82 /// this method was called after reading expanded [`Start`] event.
83 ///
84 /// Manages nested cases where parent and child elements have the _literally_
85 /// same name.
86 ///
87 /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`]
88 /// will be returned. In particularly, that error will be returned if you call
89 /// this method without consuming the corresponding [`Start`] event first.
90 ///
91 /// The `end` parameter should contain name of the end element _in the reader
92 /// encoding_. It is good practice to always get that parameter using
93 /// [`BytesStart::to_end()`] method.
94 ///
95 /// The correctness of the skipped events does not checked, if you disabled
96 /// the [`check_end_names`] option.
97 ///
98 /// There is no asynchronous `read_to_end_async()` version of this function,
99 /// because it is not necessary -- the contents are already in memory and no IO
100 /// is needed, therefore there is no potential for blocking.
101 ///
102 /// # Namespaces
103 ///
104 /// While the `Reader` does not support namespace resolution, namespaces
105 /// does not change the algorithm for comparing names. Although the names
106 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
107 /// same namespace, are semantically equivalent, `</b:name>` cannot close
108 /// `<a:name>`, because according to [the specification]
109 ///
110 /// > The end of every element that begins with a **start-tag** MUST be marked
111 /// > by an **end-tag** containing a name that echoes the element's type as
112 /// > given in the **start-tag**
113 ///
114 /// # Examples
115 ///
116 /// This example shows, how you can skip XML content after you read the
117 /// start event.
118 ///
119 /// ```
120 /// # use pretty_assertions::assert_eq;
121 /// use quick_xml::events::{BytesStart, Event};
122 /// use quick_xml::reader::Reader;
123 ///
124 /// let mut reader = Reader::from_str(r#"
125 /// <outer>
126 /// <inner>
127 /// <inner></inner>
128 /// <inner/>
129 /// <outer></outer>
130 /// <outer/>
131 /// </inner>
132 /// </outer>
133 /// "#);
134 /// reader.trim_text(true);
135 ///
136 /// let start = BytesStart::new("outer");
137 /// let end = start.to_end().into_owned();
138 ///
139 /// // First, we read a start event...
140 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
141 ///
142 /// // ...then, we could skip all events to the corresponding end event.
143 /// // This call will correctly handle nested <outer> elements.
144 /// // Note, however, that this method does not handle namespaces.
145 /// reader.read_to_end(end.name()).unwrap();
146 ///
147 /// // At the end we should get an Eof event, because we ate the whole XML
148 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
149 /// ```
150 ///
151 /// [`Start`]: Event::Start
152 /// [`End`]: Event::End
153 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
154 /// [`expand_empty_elements`]: Self::expand_empty_elements
155 /// [`check_end_names`]: Self::check_end_names
156 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
157 pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
158 Ok(read_to_end!(self, end, (), read_event_impl, {}))
159 }
160
161 /// Reads content between start and end tags, including any markup. This
162 /// function is supposed to be called after you already read a [`Start`] event.
163 ///
164 /// Manages nested cases where parent and child elements have the _literally_
165 /// same name.
166 ///
167 /// This method does not unescape read data, instead it returns content
168 /// "as is" of the XML document. This is because it has no idea what text
169 /// it reads, and if, for example, it contains CDATA section, attempt to
170 /// unescape it content will spoil data.
171 ///
172 /// Any text will be decoded using the XML current [`decoder()`].
173 ///
174 /// Actually, this method perform the following code:
175 ///
176 /// ```ignore
177 /// let span = reader.read_to_end(end)?;
178 /// let text = reader.decoder().decode(&reader.inner_slice[span]);
179 /// ```
180 ///
181 /// # Examples
182 ///
183 /// This example shows, how you can read a HTML content from your XML document.
184 ///
185 /// ```
186 /// # use pretty_assertions::assert_eq;
187 /// # use std::borrow::Cow;
188 /// use quick_xml::events::{BytesStart, Event};
189 /// use quick_xml::reader::Reader;
190 ///
191 /// let mut reader = Reader::from_str("
192 /// <html>
193 /// <title>This is a HTML text</title>
194 /// <p>Usual XML rules does not apply inside it
195 /// <p>For example, elements not needed to be &quot;closed&quot;
196 /// </html>
197 /// ");
198 /// reader.trim_text(true);
199 ///
200 /// let start = BytesStart::new("html");
201 /// let end = start.to_end().into_owned();
202 ///
203 /// // First, we read a start event...
204 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
205 /// // ...and disable checking of end names because we expect HTML further...
206 /// reader.check_end_names(false);
207 ///
208 /// // ...then, we could read text content until close tag.
209 /// // This call will correctly handle nested <html> elements.
210 /// let text = reader.read_text(end.name()).unwrap();
211 /// assert_eq!(text, Cow::Borrowed(r#"
212 /// <title>This is a HTML text</title>
213 /// <p>Usual XML rules does not apply inside it
214 /// <p>For example, elements not needed to be &quot;closed&quot;
215 /// "#));
216 /// assert!(matches!(text, Cow::Borrowed(_)));
217 ///
218 /// // Now we can enable checks again
219 /// reader.check_end_names(true);
220 ///
221 /// // At the end we should get an Eof event, because we ate the whole XML
222 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
223 /// ```
224 ///
225 /// [`Start`]: Event::Start
226 /// [`decoder()`]: Self::decoder()
227 pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
228 // self.reader will be changed, so store original reference
229 let buffer = self.reader;
230 let span = self.read_to_end(end)?;
231
232 self.decoder().decode(&buffer[0..span.len()])
233 }
234}
235
////////////////////////////////////////////////////////////////////////////////////////////////////
237
238/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
239/// that will be borrowed by events. This implementation provides a zero-copy deserialization
240impl<'a> XmlSource<'a, ()> for &'a [u8] {
241 #[cfg(not(feature = "encoding"))]
242 fn remove_utf8_bom(&mut self) -> Result<()> {
243 if self.starts_with(crate::encoding::UTF8_BOM) {
244 *self = &self[crate::encoding::UTF8_BOM.len()..];
245 }
246 Ok(())
247 }
248
249 #[cfg(feature = "encoding")]
250 fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
251 if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
252 *self = &self[bom_len..];
253 return Ok(Some(enc));
254 }
255 Ok(None)
256 }
257
258 fn read_bytes_until(
259 &mut self,
260 byte: u8,
261 _buf: (),
262 position: &mut usize,
263 ) -> Result<Option<&'a [u8]>> {
264 // search byte must be within the ascii range
265 debug_assert!(byte.is_ascii());
266 if self.is_empty() {
267 return Ok(None);
268 }
269
270 Ok(Some(if let Some(i) = memchr::memchr(byte, self) {
271 *position += i + 1;
272 let bytes = &self[..i];
273 *self = &self[i + 1..];
274 bytes
275 } else {
276 *position += self.len();
277 let bytes = &self[..];
278 *self = &[];
279 bytes
280 }))
281 }
282
283 fn read_bang_element(
284 &mut self,
285 _buf: (),
286 position: &mut usize,
287 ) -> Result<Option<(BangType, &'a [u8])>> {
288 // Peeked one bang ('!') before being called, so it's guaranteed to
289 // start with it.
290 debug_assert_eq!(self[0], b'!');
291
292 let bang_type = BangType::new(self[1..].first().copied())?;
293
294 if let Some((bytes, i)) = bang_type.parse(&[], self) {
295 *position += i;
296 *self = &self[i..];
297 return Ok(Some((bang_type, bytes)));
298 }
299
300 // Note: Do not update position, so the error points to
301 // somewhere sane rather than at the EOF
302 Err(bang_type.to_err())
303 }
304
305 fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<Option<&'a [u8]>> {
306 if self.is_empty() {
307 return Ok(None);
308 }
309
310 let mut state = ReadElementState::Elem;
311
312 if let Some((bytes, i)) = state.change(self) {
313 // Position now just after the `>` symbol
314 *position += i;
315 *self = &self[i..];
316 return Ok(Some(bytes));
317 }
318
319 // Note: Do not update position, so the error points to a sane place
320 // rather than at the EOF.
321 Err(Error::UnexpectedEof("Element".to_string()))
322
323 // FIXME: Figure out why the other one works without UnexpectedEof
324 }
325
326 fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
327 let whitespaces = self
328 .iter()
329 .position(|b| !is_whitespace(*b))
330 .unwrap_or(self.len());
331 *position += whitespaces;
332 *self = &self[whitespaces..];
333 Ok(())
334 }
335
336 fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result<bool> {
337 // search byte must be within the ascii range
338 debug_assert!(byte.is_ascii());
339 if self.first() == Some(&byte) {
340 *self = &self[1..];
341 *position += 1;
342 Ok(true)
343 } else {
344 Ok(false)
345 }
346 }
347
348 fn peek_one(&mut self) -> Result<Option<u8>> {
349 Ok(self.first().copied())
350 }
351}
352
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Source constructor handed to `check!`: returns the input from the
    /// test unchanged.
    fn identity<T>(bytes: T) -> T {
        bytes
    }

    // Instantiate the `check!` test macro for the slice reader, which uses
    // the unit value `()` as its buffer.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::UTF_8;
        use pretty_assertions::assert_eq;

        /// A `Reader` created with `from_str` must stay in UTF-8 even when
        /// the XML declaration names a different encoding.
        #[test]
        fn str_always_has_utf8() {
            let mut xml = Reader::from_str("<?xml encoding='UTF-16'?>");

            // UTF-8 before the declaration is read...
            assert_eq!(xml.decoder().encoding(), UTF_8);
            xml.read_event().unwrap();
            // ...and still UTF-8 after it
            assert_eq!(xml.decoder().encoding(), UTF_8);

            assert_eq!(xml.read_event().unwrap(), Event::Eof);
        }
    }
}
392