//! This is an implementation of [`Reader`] for reading from a `&[u8]` as the
//! underlying byte stream. This implementation does not need an intermediate
//! buffer, because the byte slice itself can be used to borrow from.
|
4 |
|
5 | use std::borrow::Cow;
|
6 | use std::io;
|
7 |
|
8 | #[cfg (feature = "encoding" )]
|
9 | use crate::reader::EncodingRef;
|
10 | #[cfg (feature = "encoding" )]
|
11 | use encoding_rs::{Encoding, UTF_8};
|
12 |
|
13 | use crate::errors::{Error, Result};
|
14 | use crate::events::Event;
|
15 | use crate::name::QName;
|
16 | use crate::parser::Parser;
|
17 | use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource};
|
18 | use crate::utils::is_whitespace;
|
19 |
|
20 | /// This is an implementation for reading from a `&[u8]` as underlying byte stream.
|
21 | /// This implementation supports not using an intermediate buffer as the byte slice
|
22 | /// itself can be used to borrow from.
|
23 | impl<'a> Reader<&'a [u8]> {
|
24 | /// Creates an XML reader from a string slice.
|
25 | #[allow (clippy::should_implement_trait)]
|
26 | pub fn from_str(s: &'a str) -> Self {
|
27 | // Rust strings are guaranteed to be UTF-8, so lock the encoding
|
28 | #[cfg (feature = "encoding" )]
|
29 | {
|
30 | let mut reader = Self::from_reader(s.as_bytes());
|
31 | reader.state.encoding = EncodingRef::Explicit(UTF_8);
|
32 | reader
|
33 | }
|
34 |
|
35 | #[cfg (not(feature = "encoding" ))]
|
36 | Self::from_reader(s.as_bytes())
|
37 | }
|
38 |
|
39 | /// Read an event that borrows from the input rather than a buffer.
|
40 | ///
|
41 | /// There is no asynchronous `read_event_async()` version of this function,
|
42 | /// because it is not necessary -- the contents are already in memory and no IO
|
43 | /// is needed, therefore there is no potential for blocking.
|
44 | ///
|
45 | /// # Examples
|
46 | ///
|
47 | /// ```
|
48 | /// # use pretty_assertions::assert_eq;
|
49 | /// use quick_xml::events::Event;
|
50 | /// use quick_xml::reader::Reader;
|
51 | ///
|
52 | /// let mut reader = Reader::from_str(r#"
|
53 | /// <tag1 att1 = "test">
|
54 | /// <tag2><!--Test comment-->Test</tag2>
|
55 | /// <tag2>Test 2</tag2>
|
56 | /// </tag1>
|
57 | /// "# );
|
58 | /// reader.config_mut().trim_text(true);
|
59 | ///
|
60 | /// let mut count = 0;
|
61 | /// let mut txt = Vec::new();
|
62 | /// loop {
|
63 | /// match reader.read_event().unwrap() {
|
64 | /// Event::Start(e) => count += 1,
|
65 | /// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
|
66 | /// Event::Eof => break,
|
67 | /// _ => (),
|
68 | /// }
|
69 | /// }
|
70 | /// assert_eq!(count, 3);
|
71 | /// assert_eq!(txt, vec!["Test" .to_string(), "Test 2" .to_string()]);
|
72 | /// ```
|
73 | #[inline ]
|
74 | pub fn read_event(&mut self) -> Result<Event<'a>> {
|
75 | self.read_event_impl(())
|
76 | }
|
77 |
|
78 | /// Reads until end element is found. This function is supposed to be called
|
79 | /// after you already read a [`Start`] event.
|
80 | ///
|
81 | /// Returns a span that cover content between `>` of an opening tag and `<` of
|
82 | /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
|
83 | /// this method was called after reading expanded [`Start`] event.
|
84 | ///
|
85 | /// Manages nested cases where parent and child elements have the _literally_
|
86 | /// same name.
|
87 | ///
|
88 | /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
|
89 | /// will be returned. In particularly, that error will be returned if you call
|
90 | /// this method without consuming the corresponding [`Start`] event first.
|
91 | ///
|
92 | /// The `end` parameter should contain name of the end element _in the reader
|
93 | /// encoding_. It is good practice to always get that parameter using
|
94 | /// [`BytesStart::to_end()`] method.
|
95 | ///
|
96 | /// The correctness of the skipped events does not checked, if you disabled
|
97 | /// the [`check_end_names`] option.
|
98 | ///
|
99 | /// There is no asynchronous `read_to_end_async()` version of this function,
|
100 | /// because it is not necessary -- the contents are already in memory and no IO
|
101 | /// is needed, therefore there is no potential for blocking.
|
102 | ///
|
103 | /// # Namespaces
|
104 | ///
|
105 | /// While the `Reader` does not support namespace resolution, namespaces
|
106 | /// does not change the algorithm for comparing names. Although the names
|
107 | /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
|
108 | /// same namespace, are semantically equivalent, `</b:name>` cannot close
|
109 | /// `<a:name>`, because according to [the specification]
|
110 | ///
|
111 | /// > The end of every element that begins with a **start-tag** MUST be marked
|
112 | /// > by an **end-tag** containing a name that echoes the element's type as
|
113 | /// > given in the **start-tag**
|
114 | ///
|
115 | /// # Examples
|
116 | ///
|
117 | /// This example shows, how you can skip XML content after you read the
|
118 | /// start event.
|
119 | ///
|
120 | /// ```
|
121 | /// # use pretty_assertions::assert_eq;
|
122 | /// use quick_xml::events::{BytesStart, Event};
|
123 | /// use quick_xml::reader::Reader;
|
124 | ///
|
125 | /// let mut reader = Reader::from_str(r#"
|
126 | /// <outer>
|
127 | /// <inner>
|
128 | /// <inner></inner>
|
129 | /// <inner/>
|
130 | /// <outer></outer>
|
131 | /// <outer/>
|
132 | /// </inner>
|
133 | /// </outer>
|
134 | /// "# );
|
135 | /// reader.config_mut().trim_text(true);
|
136 | ///
|
137 | /// let start = BytesStart::new("outer" );
|
138 | /// let end = start.to_end().into_owned();
|
139 | ///
|
140 | /// // First, we read a start event...
|
141 | /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
|
142 | ///
|
143 | /// // ...then, we could skip all events to the corresponding end event.
|
144 | /// // This call will correctly handle nested <outer> elements.
|
145 | /// // Note, however, that this method does not handle namespaces.
|
146 | /// reader.read_to_end(end.name()).unwrap();
|
147 | ///
|
148 | /// // At the end we should get an Eof event, because we ate the whole XML
|
149 | /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
|
150 | /// ```
|
151 | ///
|
152 | /// [`Start`]: Event::Start
|
153 | /// [`End`]: Event::End
|
154 | /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
|
155 | /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
|
156 | /// [`check_end_names`]: crate::reader::Config::check_end_names
|
157 | /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
|
158 | pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
|
159 | Ok(read_to_end!(self, end, (), read_event_impl, {}))
|
160 | }
|
161 |
|
162 | /// Reads content between start and end tags, including any markup. This
|
163 | /// function is supposed to be called after you already read a [`Start`] event.
|
164 | ///
|
165 | /// Manages nested cases where parent and child elements have the _literally_
|
166 | /// same name.
|
167 | ///
|
168 | /// This method does not unescape read data, instead it returns content
|
169 | /// "as is" of the XML document. This is because it has no idea what text
|
170 | /// it reads, and if, for example, it contains CDATA section, attempt to
|
171 | /// unescape it content will spoil data.
|
172 | ///
|
173 | /// Any text will be decoded using the XML current [`decoder()`].
|
174 | ///
|
175 | /// Actually, this method perform the following code:
|
176 | ///
|
177 | /// ```ignore
|
178 | /// let span = reader.read_to_end(end)?;
|
179 | /// let text = reader.decoder().decode(&reader.inner_slice[span]);
|
180 | /// ```
|
181 | ///
|
182 | /// # Examples
|
183 | ///
|
184 | /// This example shows, how you can read a HTML content from your XML document.
|
185 | ///
|
186 | /// ```
|
187 | /// # use pretty_assertions::assert_eq;
|
188 | /// # use std::borrow::Cow;
|
189 | /// use quick_xml::events::{BytesStart, Event};
|
190 | /// use quick_xml::reader::Reader;
|
191 | ///
|
192 | /// let mut reader = Reader::from_str("
|
193 | /// <html>
|
194 | /// <title>This is a HTML text</title>
|
195 | /// <p>Usual XML rules does not apply inside it
|
196 | /// <p>For example, elements not needed to be "closed"
|
197 | /// </html>
|
198 | /// " );
|
199 | /// reader.config_mut().trim_text(true);
|
200 | ///
|
201 | /// let start = BytesStart::new("html" );
|
202 | /// let end = start.to_end().into_owned();
|
203 | ///
|
204 | /// // First, we read a start event...
|
205 | /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
|
206 | /// // ...and disable checking of end names because we expect HTML further...
|
207 | /// reader.config_mut().check_end_names = false;
|
208 | ///
|
209 | /// // ...then, we could read text content until close tag.
|
210 | /// // This call will correctly handle nested <html> elements.
|
211 | /// let text = reader.read_text(end.name()).unwrap();
|
212 | /// assert_eq!(text, Cow::Borrowed(r#"
|
213 | /// <title>This is a HTML text</title>
|
214 | /// <p>Usual XML rules does not apply inside it
|
215 | /// <p>For example, elements not needed to be "closed"
|
216 | /// "# ));
|
217 | /// assert!(matches!(text, Cow::Borrowed(_)));
|
218 | ///
|
219 | /// // Now we can enable checks again
|
220 | /// reader.config_mut().check_end_names = true;
|
221 | ///
|
222 | /// // At the end we should get an Eof event, because we ate the whole XML
|
223 | /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
|
224 | /// ```
|
225 | ///
|
226 | /// [`Start`]: Event::Start
|
227 | /// [`decoder()`]: Self::decoder()
|
228 | pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
|
229 | // self.reader will be changed, so store original reference
|
230 | let buffer = self.reader;
|
231 | let span = self.read_to_end(end)?;
|
232 |
|
233 | let len = span.end - span.start;
|
234 | // SAFETY: `span` can only contain indexes up to usize::MAX because it
|
235 | // was created from offsets from a single &[u8] slice
|
236 | Ok(self.decoder().decode(&buffer[0..len as usize])?)
|
237 | }
|
238 | }
|
239 |
|
240 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
241 |
|
242 | /// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
|
243 | /// that will be borrowed by events. This implementation provides a zero-copy deserialization
|
244 | impl<'a> XmlSource<'a, ()> for &'a [u8] {
|
245 | #[cfg (not(feature = "encoding" ))]
|
246 | #[inline ]
|
247 | fn remove_utf8_bom(&mut self) -> io::Result<()> {
|
248 | if self.starts_with(crate::encoding::UTF8_BOM) {
|
249 | *self = &self[crate::encoding::UTF8_BOM.len()..];
|
250 | }
|
251 | Ok(())
|
252 | }
|
253 |
|
254 | #[cfg (feature = "encoding" )]
|
255 | #[inline ]
|
256 | fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>> {
|
257 | if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
|
258 | *self = &self[bom_len..];
|
259 | return Ok(Some(enc));
|
260 | }
|
261 | Ok(None)
|
262 | }
|
263 |
|
264 | #[inline ]
|
265 | fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> {
|
266 | match memchr::memchr(b'<' , self) {
|
267 | Some(0) => {
|
268 | *position += 1;
|
269 | *self = &self[1..];
|
270 | ReadTextResult::Markup(())
|
271 | }
|
272 | Some(i) => {
|
273 | *position += i as u64 + 1;
|
274 | let bytes = &self[..i];
|
275 | *self = &self[i + 1..];
|
276 | ReadTextResult::UpToMarkup(bytes)
|
277 | }
|
278 | None => {
|
279 | *position += self.len() as u64;
|
280 | let bytes = &self[..];
|
281 | *self = &[];
|
282 | ReadTextResult::UpToEof(bytes)
|
283 | }
|
284 | }
|
285 | }
|
286 |
|
287 | #[inline ]
|
288 | fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
|
289 | where
|
290 | P: Parser,
|
291 | {
|
292 | if let Some(i) = parser.feed(self) {
|
293 | // +1 for `>` which we do not include
|
294 | *position += i as u64 + 1;
|
295 | let bytes = &self[..i];
|
296 | *self = &self[i + 1..];
|
297 | return Ok(bytes);
|
298 | }
|
299 |
|
300 | *position += self.len() as u64;
|
301 | Err(Error::Syntax(P::eof_error()))
|
302 | }
|
303 |
|
304 | #[inline ]
|
305 | fn read_bang_element(&mut self, _buf: (), position: &mut u64) -> Result<(BangType, &'a [u8])> {
|
306 | // Peeked one bang ('!') before being called, so it's guaranteed to
|
307 | // start with it.
|
308 | debug_assert_eq!(self[0], b'!' );
|
309 |
|
310 | let mut bang_type = BangType::new(self[1..].first().copied())?;
|
311 |
|
312 | if let Some((bytes, i)) = bang_type.parse(&[], self) {
|
313 | *position += i as u64;
|
314 | *self = &self[i..];
|
315 | return Ok((bang_type, bytes));
|
316 | }
|
317 |
|
318 | *position += self.len() as u64;
|
319 | Err(bang_type.to_err().into())
|
320 | }
|
321 |
|
322 | #[inline ]
|
323 | fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
|
324 | let whitespaces = self
|
325 | .iter()
|
326 | .position(|b| !is_whitespace(*b))
|
327 | .unwrap_or(self.len());
|
328 | *position += whitespaces as u64;
|
329 | *self = &self[whitespaces..];
|
330 | Ok(())
|
331 | }
|
332 |
|
333 | #[inline ]
|
334 | fn peek_one(&mut self) -> io::Result<Option<u8>> {
|
335 | Ok(self.first().copied())
|
336 | }
|
337 | }
|
338 |
|
#[cfg(test)]
mod test {
    use crate::reader::{test::check, XmlSource};

    /// Default buffer constructor: hands the byte array from the test back
    /// unchanged.
    fn identity<T>(source: T) -> T {
        source
    }

    // Instantiates the reader tests from `crate::reader::test` for the slice
    // reader, which uses `()` in place of a buffer.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );
}
|
357 | |