1 | // Copyright 2014-2017 The html5ever Project Developers. See the |
2 | // COPYRIGHT file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! The `BufferQueue` struct and helper types. |
11 | //! |
12 | //! This type is designed for the efficient parsing of string data, especially where many |
13 | //! significant characters are from the ascii range 0-63. This includes, for example, important |
14 | //! characters in xml/html parsing. |
15 | //! |
16 | //! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero |
17 | //! copy). |
18 | //! |
19 | //! [`BufferQueue`]: struct.BufferQueue.html |
20 | |
21 | use std::collections::VecDeque; |
22 | |
23 | use tendril::StrTendril; |
24 | |
25 | pub use self::SetResult::{FromSet, NotFromSet}; |
26 | use crate::util::smallcharset::SmallCharSet; |
27 | |
28 | /// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a |
29 | /// string buffer of characters not from the set. |
30 | /// |
31 | /// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from |
32 | /// [`SmallCharSet`]: ../struct.SmallCharSet.html |
33 | #[derive (PartialEq, Eq, Debug)] |
34 | pub enum SetResult { |
35 | /// A character from the `SmallCharSet`. |
36 | FromSet(char), |
37 | /// A string buffer containing no characters from the `SmallCharSet`. |
38 | NotFromSet(StrTendril), |
39 | } |
40 | |
41 | /// A queue of owned string buffers, which supports incrementally consuming characters. |
42 | /// |
43 | /// Internally it uses [`VecDeque`] and has the same complexity properties. |
44 | /// |
45 | /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html |
46 | #[derive (Debug)] |
47 | pub struct BufferQueue { |
48 | /// Buffers to process. |
49 | buffers: VecDeque<StrTendril>, |
50 | } |
51 | |
52 | impl BufferQueue { |
53 | /// Create an empty BufferQueue. |
54 | #[inline ] |
55 | pub fn new() -> BufferQueue { |
56 | BufferQueue { |
57 | buffers: VecDeque::with_capacity(16), |
58 | } |
59 | } |
60 | |
61 | /// Returns whether the queue is empty. |
62 | #[inline ] |
63 | pub fn is_empty(&self) -> bool { |
64 | self.buffers.is_empty() |
65 | } |
66 | |
67 | /// Get the buffer at the beginning of the queue. |
68 | #[inline ] |
69 | pub fn pop_front(&mut self) -> Option<StrTendril> { |
70 | self.buffers.pop_front() |
71 | } |
72 | |
73 | /// Add a buffer to the beginning of the queue. |
74 | /// |
75 | /// If the buffer is empty, it will be skipped. |
76 | pub fn push_front(&mut self, buf: StrTendril) { |
77 | if buf.len32() == 0 { |
78 | return; |
79 | } |
80 | self.buffers.push_front(buf); |
81 | } |
82 | |
83 | /// Add a buffer to the end of the queue. |
84 | /// |
85 | /// If the buffer is empty, it will be skipped. |
86 | pub fn push_back(&mut self, buf: StrTendril) { |
87 | if buf.len32() == 0 { |
88 | return; |
89 | } |
90 | self.buffers.push_back(buf); |
91 | } |
92 | |
93 | /// Look at the next available character without removing it, if the queue is not empty. |
94 | pub fn peek(&self) -> Option<char> { |
95 | debug_assert!( |
96 | self.buffers |
97 | .iter() |
98 | .find(|el| el.len32() == 0) |
99 | .is_none(), |
100 | "invariant \"all buffers in the queue are non-empty \" failed" |
101 | ); |
102 | self.buffers.front().map(|b| b.chars().next().unwrap()) |
103 | } |
104 | |
105 | /// Get the next character if one is available, removing it from the queue. |
106 | /// |
107 | /// This function manages the buffers, removing them as they become empty. |
108 | pub fn next(&mut self) -> Option<char> { |
109 | let (result, now_empty) = match self.buffers.front_mut() { |
110 | None => (None, false), |
111 | Some(buf) => { |
112 | let c = buf.pop_front_char().expect("empty buffer in queue" ); |
113 | (Some(c), buf.is_empty()) |
114 | }, |
115 | }; |
116 | |
117 | if now_empty { |
118 | self.buffers.pop_front(); |
119 | } |
120 | |
121 | result |
122 | } |
123 | |
124 | /// Pops and returns either a single character from the given set, or |
125 | /// a buffer of characters none of which are in the set. |
126 | /// |
127 | /// # Examples |
128 | /// |
129 | /// ``` |
130 | /// # #[macro_use ] extern crate markup5ever; |
131 | /// # #[macro_use ] extern crate tendril; |
132 | /// # fn main() { |
133 | /// use markup5ever::buffer_queue::{BufferQueue, SetResult}; |
134 | /// |
135 | /// let mut queue = BufferQueue::new(); |
136 | /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"# )); |
137 | /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/' ); |
138 | /// let tag = format_tendril!("some_tag" ); |
139 | /// let attr = format_tendril!("attr" ); |
140 | /// let attr_val = format_tendril!("text" ); |
141 | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<' ))); |
142 | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag))); |
143 | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ' ))); |
144 | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr))); |
145 | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=' ))); |
146 | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"' ))); |
147 | /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val))); |
148 | /// // ... |
149 | /// # } |
150 | /// ``` |
151 | pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> { |
152 | let (result, now_empty) = match self.buffers.front_mut() { |
153 | None => (None, false), |
154 | Some(buf) => { |
155 | let n = set.nonmember_prefix_len(&buf); |
156 | if n > 0 { |
157 | let out; |
158 | unsafe { |
159 | out = buf.unsafe_subtendril(0, n); |
160 | buf.unsafe_pop_front(n); |
161 | } |
162 | (Some(NotFromSet(out)), buf.is_empty()) |
163 | } else { |
164 | let c = buf.pop_front_char().expect("empty buffer in queue" ); |
165 | (Some(FromSet(c)), buf.is_empty()) |
166 | } |
167 | }, |
168 | }; |
169 | |
170 | // Unborrow self for this part. |
171 | if now_empty { |
172 | self.buffers.pop_front(); |
173 | } |
174 | |
175 | result |
176 | } |
177 | |
178 | /// Consume bytes matching the pattern, using a custom comparison function `eq`. |
179 | /// |
180 | /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if |
181 | /// it wasn't possible to know (more data is needed). |
182 | /// |
183 | /// The custom comparison function is used elsewhere to compare ascii-case-insensitively. |
184 | /// |
185 | /// # Examples |
186 | /// |
187 | /// ``` |
188 | /// # extern crate markup5ever; |
189 | /// # #[macro_use ] extern crate tendril; |
190 | /// # fn main() { |
191 | /// use markup5ever::buffer_queue::{BufferQueue}; |
192 | /// |
193 | /// let mut queue = BufferQueue::new(); |
194 | /// queue.push_back(format_tendril!("testtext" )); |
195 | /// let test_str = "test" ; |
196 | /// assert_eq!(queue.eat("test" , |&a, &b| a == b), Some(true)); |
197 | /// assert_eq!(queue.eat("text" , |&a, &b| a == b), Some(true)); |
198 | /// assert!(queue.is_empty()); |
199 | /// # } |
200 | /// ``` |
201 | pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> { |
202 | let mut buffers_exhausted = 0; |
203 | let mut consumed_from_last = 0; |
204 | |
205 | self.buffers.front()?; |
206 | |
207 | for pattern_byte in pat.bytes() { |
208 | if buffers_exhausted >= self.buffers.len() { |
209 | return None; |
210 | } |
211 | let buf = &self.buffers[buffers_exhausted]; |
212 | |
213 | if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) { |
214 | return Some(false); |
215 | } |
216 | |
217 | consumed_from_last += 1; |
218 | if consumed_from_last >= buf.len() { |
219 | buffers_exhausted += 1; |
220 | consumed_from_last = 0; |
221 | } |
222 | } |
223 | |
224 | // We have a match. Commit changes to the BufferQueue. |
225 | for _ in 0..buffers_exhausted { |
226 | self.buffers.pop_front(); |
227 | } |
228 | |
229 | match self.buffers.front_mut() { |
230 | None => assert_eq!(consumed_from_last, 0), |
231 | Some(ref mut buf) => buf.pop_front(consumed_from_last as u32), |
232 | } |
233 | |
234 | Some(true) |
235 | } |
236 | } |
237 | |
238 | #[cfg (test)] |
239 | #[allow (non_snake_case)] |
240 | mod test { |
241 | use tendril::SliceExt; |
242 | |
243 | use super::BufferQueue; |
244 | use super::SetResult::{FromSet, NotFromSet}; |
245 | |
246 | #[test ] |
247 | fn smoke_test() { |
248 | let mut bq = BufferQueue::new(); |
249 | assert_eq!(bq.peek(), None); |
250 | assert_eq!(bq.next(), None); |
251 | |
252 | bq.push_back("abc" .to_tendril()); |
253 | assert_eq!(bq.peek(), Some('a' )); |
254 | assert_eq!(bq.next(), Some('a' )); |
255 | assert_eq!(bq.peek(), Some('b' )); |
256 | assert_eq!(bq.peek(), Some('b' )); |
257 | assert_eq!(bq.next(), Some('b' )); |
258 | assert_eq!(bq.peek(), Some('c' )); |
259 | assert_eq!(bq.next(), Some('c' )); |
260 | assert_eq!(bq.peek(), None); |
261 | assert_eq!(bq.next(), None); |
262 | } |
263 | |
264 | #[test ] |
265 | fn can_unconsume() { |
266 | let mut bq = BufferQueue::new(); |
267 | bq.push_back("abc" .to_tendril()); |
268 | assert_eq!(bq.next(), Some('a' )); |
269 | |
270 | bq.push_front("xy" .to_tendril()); |
271 | assert_eq!(bq.next(), Some('x' )); |
272 | assert_eq!(bq.next(), Some('y' )); |
273 | assert_eq!(bq.next(), Some('b' )); |
274 | assert_eq!(bq.next(), Some('c' )); |
275 | assert_eq!(bq.next(), None); |
276 | } |
277 | |
278 | #[test ] |
279 | fn can_pop_except_set() { |
280 | let mut bq = BufferQueue::new(); |
281 | bq.push_back("abc&def" .to_tendril()); |
282 | let mut pop = || bq.pop_except_from(small_char_set!('&' )); |
283 | assert_eq!(pop(), Some(NotFromSet("abc" .to_tendril()))); |
284 | assert_eq!(pop(), Some(FromSet('&' ))); |
285 | assert_eq!(pop(), Some(NotFromSet("def" .to_tendril()))); |
286 | assert_eq!(pop(), None); |
287 | } |
288 | |
289 | #[test ] |
290 | fn can_eat() { |
291 | // This is not very comprehensive. We rely on the tokenizer |
292 | // integration tests for more thorough testing with many |
293 | // different input buffer splits. |
294 | let mut bq = BufferQueue::new(); |
295 | bq.push_back("a" .to_tendril()); |
296 | bq.push_back("bc" .to_tendril()); |
297 | assert_eq!(bq.eat("abcd" , u8::eq_ignore_ascii_case), None); |
298 | assert_eq!(bq.eat("ax" , u8::eq_ignore_ascii_case), Some(false)); |
299 | assert_eq!(bq.eat("ab" , u8::eq_ignore_ascii_case), Some(true)); |
300 | assert_eq!(bq.next(), Some('c' )); |
301 | assert_eq!(bq.next(), None); |
302 | } |
303 | } |
304 | |