1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The `BufferQueue` struct and helper types.
11//!
12//! This type is designed for the efficient parsing of string data, especially where many
13//! significant characters are from the ascii range 0-63. This includes, for example, important
14//! characters in xml/html parsing.
15//!
16//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
17//! copy).
18//!
19//! [`BufferQueue`]: struct.BufferQueue.html
20
21use std::collections::VecDeque;
22
23use tendril::StrTendril;
24
25pub use self::SetResult::{FromSet, NotFromSet};
26use crate::util::smallcharset::SmallCharSet;
27
28/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
29/// string buffer of characters not from the set.
30///
31/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
32/// [`SmallCharSet`]: ../struct.SmallCharSet.html
33#[derive(PartialEq, Eq, Debug)]
34pub enum SetResult {
35 /// A character from the `SmallCharSet`.
36 FromSet(char),
37 /// A string buffer containing no characters from the `SmallCharSet`.
38 NotFromSet(StrTendril),
39}
40
41/// A queue of owned string buffers, which supports incrementally consuming characters.
42///
43/// Internally it uses [`VecDeque`] and has the same complexity properties.
44///
45/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
46#[derive(Debug)]
47pub struct BufferQueue {
48 /// Buffers to process.
49 buffers: VecDeque<StrTendril>,
50}
51
52impl BufferQueue {
53 /// Create an empty BufferQueue.
54 #[inline]
55 pub fn new() -> BufferQueue {
56 BufferQueue {
57 buffers: VecDeque::with_capacity(16),
58 }
59 }
60
61 /// Returns whether the queue is empty.
62 #[inline]
63 pub fn is_empty(&self) -> bool {
64 self.buffers.is_empty()
65 }
66
67 /// Get the buffer at the beginning of the queue.
68 #[inline]
69 pub fn pop_front(&mut self) -> Option<StrTendril> {
70 self.buffers.pop_front()
71 }
72
73 /// Add a buffer to the beginning of the queue.
74 ///
75 /// If the buffer is empty, it will be skipped.
76 pub fn push_front(&mut self, buf: StrTendril) {
77 if buf.len32() == 0 {
78 return;
79 }
80 self.buffers.push_front(buf);
81 }
82
83 /// Add a buffer to the end of the queue.
84 ///
85 /// If the buffer is empty, it will be skipped.
86 pub fn push_back(&mut self, buf: StrTendril) {
87 if buf.len32() == 0 {
88 return;
89 }
90 self.buffers.push_back(buf);
91 }
92
93 /// Look at the next available character without removing it, if the queue is not empty.
94 pub fn peek(&self) -> Option<char> {
95 debug_assert!(
96 self.buffers
97 .iter()
98 .find(|el| el.len32() == 0)
99 .is_none(),
100 "invariant \"all buffers in the queue are non-empty\" failed"
101 );
102 self.buffers.front().map(|b| b.chars().next().unwrap())
103 }
104
105 /// Get the next character if one is available, removing it from the queue.
106 ///
107 /// This function manages the buffers, removing them as they become empty.
108 pub fn next(&mut self) -> Option<char> {
109 let (result, now_empty) = match self.buffers.front_mut() {
110 None => (None, false),
111 Some(buf) => {
112 let c = buf.pop_front_char().expect("empty buffer in queue");
113 (Some(c), buf.is_empty())
114 },
115 };
116
117 if now_empty {
118 self.buffers.pop_front();
119 }
120
121 result
122 }
123
124 /// Pops and returns either a single character from the given set, or
125 /// a buffer of characters none of which are in the set.
126 ///
127 /// # Examples
128 ///
129 /// ```
130 /// # #[macro_use] extern crate markup5ever;
131 /// # #[macro_use] extern crate tendril;
132 /// # fn main() {
133 /// use markup5ever::buffer_queue::{BufferQueue, SetResult};
134 ///
135 /// let mut queue = BufferQueue::new();
136 /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
137 /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
138 /// let tag = format_tendril!("some_tag");
139 /// let attr = format_tendril!("attr");
140 /// let attr_val = format_tendril!("text");
141 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
142 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
143 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
144 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
145 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
146 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
147 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
148 /// // ...
149 /// # }
150 /// ```
151 pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> {
152 let (result, now_empty) = match self.buffers.front_mut() {
153 None => (None, false),
154 Some(buf) => {
155 let n = set.nonmember_prefix_len(&buf);
156 if n > 0 {
157 let out;
158 unsafe {
159 out = buf.unsafe_subtendril(0, n);
160 buf.unsafe_pop_front(n);
161 }
162 (Some(NotFromSet(out)), buf.is_empty())
163 } else {
164 let c = buf.pop_front_char().expect("empty buffer in queue");
165 (Some(FromSet(c)), buf.is_empty())
166 }
167 },
168 };
169
170 // Unborrow self for this part.
171 if now_empty {
172 self.buffers.pop_front();
173 }
174
175 result
176 }
177
178 /// Consume bytes matching the pattern, using a custom comparison function `eq`.
179 ///
180 /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
181 /// it wasn't possible to know (more data is needed).
182 ///
183 /// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
184 ///
185 /// # Examples
186 ///
187 /// ```
188 /// # extern crate markup5ever;
189 /// # #[macro_use] extern crate tendril;
190 /// # fn main() {
191 /// use markup5ever::buffer_queue::{BufferQueue};
192 ///
193 /// let mut queue = BufferQueue::new();
194 /// queue.push_back(format_tendril!("testtext"));
195 /// let test_str = "test";
196 /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
197 /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
198 /// assert!(queue.is_empty());
199 /// # }
200 /// ```
201 pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> {
202 let mut buffers_exhausted = 0;
203 let mut consumed_from_last = 0;
204
205 self.buffers.front()?;
206
207 for pattern_byte in pat.bytes() {
208 if buffers_exhausted >= self.buffers.len() {
209 return None;
210 }
211 let buf = &self.buffers[buffers_exhausted];
212
213 if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
214 return Some(false);
215 }
216
217 consumed_from_last += 1;
218 if consumed_from_last >= buf.len() {
219 buffers_exhausted += 1;
220 consumed_from_last = 0;
221 }
222 }
223
224 // We have a match. Commit changes to the BufferQueue.
225 for _ in 0..buffers_exhausted {
226 self.buffers.pop_front();
227 }
228
229 match self.buffers.front_mut() {
230 None => assert_eq!(consumed_from_last, 0),
231 Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
232 }
233
234 Some(true)
235 }
236}
237
238#[cfg(test)]
239#[allow(non_snake_case)]
240mod test {
241 use tendril::SliceExt;
242
243 use super::BufferQueue;
244 use super::SetResult::{FromSet, NotFromSet};
245
246 #[test]
247 fn smoke_test() {
248 let mut bq = BufferQueue::new();
249 assert_eq!(bq.peek(), None);
250 assert_eq!(bq.next(), None);
251
252 bq.push_back("abc".to_tendril());
253 assert_eq!(bq.peek(), Some('a'));
254 assert_eq!(bq.next(), Some('a'));
255 assert_eq!(bq.peek(), Some('b'));
256 assert_eq!(bq.peek(), Some('b'));
257 assert_eq!(bq.next(), Some('b'));
258 assert_eq!(bq.peek(), Some('c'));
259 assert_eq!(bq.next(), Some('c'));
260 assert_eq!(bq.peek(), None);
261 assert_eq!(bq.next(), None);
262 }
263
264 #[test]
265 fn can_unconsume() {
266 let mut bq = BufferQueue::new();
267 bq.push_back("abc".to_tendril());
268 assert_eq!(bq.next(), Some('a'));
269
270 bq.push_front("xy".to_tendril());
271 assert_eq!(bq.next(), Some('x'));
272 assert_eq!(bq.next(), Some('y'));
273 assert_eq!(bq.next(), Some('b'));
274 assert_eq!(bq.next(), Some('c'));
275 assert_eq!(bq.next(), None);
276 }
277
278 #[test]
279 fn can_pop_except_set() {
280 let mut bq = BufferQueue::new();
281 bq.push_back("abc&def".to_tendril());
282 let mut pop = || bq.pop_except_from(small_char_set!('&'));
283 assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
284 assert_eq!(pop(), Some(FromSet('&')));
285 assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
286 assert_eq!(pop(), None);
287 }
288
289 #[test]
290 fn can_eat() {
291 // This is not very comprehensive. We rely on the tokenizer
292 // integration tests for more thorough testing with many
293 // different input buffer splits.
294 let mut bq = BufferQueue::new();
295 bq.push_back("a".to_tendril());
296 bq.push_back("bc".to_tendril());
297 assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
298 assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
299 assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
300 assert_eq!(bq.next(), Some('c'));
301 assert_eq!(bq.next(), None);
302 }
303}
304