1// Copyright 2014-2017 The html5ever Project Developers. See the
2// COPYRIGHT file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! The `BufferQueue` struct and helper types.
11//!
12//! This type is designed for the efficient parsing of string data, especially where many
13//! significant characters are from the ascii range 0-63. This includes, for example, important
14//! characters in xml/html parsing.
15//!
16//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
17//! copy).
18//!
19//! [`BufferQueue`]: struct.BufferQueue.html
20
21use std::{cell::RefCell, collections::VecDeque, mem};
22
23use tendril::StrTendril;
24
25pub use self::SetResult::{FromSet, NotFromSet};
26use crate::util::smallcharset::SmallCharSet;
27
28/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
29/// string buffer of characters not from the set.
30///
31/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
32/// [`SmallCharSet`]: ../struct.SmallCharSet.html
33#[derive(PartialEq, Eq, Debug)]
34pub enum SetResult {
35 /// A character from the `SmallCharSet`.
36 FromSet(char),
37 /// A string buffer containing no characters from the `SmallCharSet`.
38 NotFromSet(StrTendril),
39}
40
41/// A queue of owned string buffers, which supports incrementally consuming characters.
42///
43/// Internally it uses [`VecDeque`] and has the same complexity properties.
44///
45/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
46#[derive(Debug)]
47pub struct BufferQueue {
48 /// Buffers to process.
49 buffers: RefCell<VecDeque<StrTendril>>,
50}
51
52impl Default for BufferQueue {
53 /// Create an empty BufferQueue.
54 #[inline]
55 fn default() -> Self {
56 Self {
57 buffers: RefCell::new(VecDeque::with_capacity(16)),
58 }
59 }
60}
61
62impl BufferQueue {
63 /// Returns whether the queue is empty.
64 #[inline]
65 pub fn is_empty(&self) -> bool {
66 self.buffers.borrow().is_empty()
67 }
68
69 /// Get the buffer at the beginning of the queue.
70 #[inline]
71 pub fn pop_front(&self) -> Option<StrTendril> {
72 self.buffers.borrow_mut().pop_front()
73 }
74
75 /// Add a buffer to the beginning of the queue.
76 ///
77 /// If the buffer is empty, it will be skipped.
78 pub fn push_front(&self, buf: StrTendril) {
79 if buf.len32() == 0 {
80 return;
81 }
82 self.buffers.borrow_mut().push_front(buf);
83 }
84
85 /// Add a buffer to the end of the queue.
86 ///
87 /// If the buffer is empty, it will be skipped.
88 pub fn push_back(&self, buf: StrTendril) {
89 if buf.len32() == 0 {
90 return;
91 }
92 self.buffers.borrow_mut().push_back(buf);
93 }
94
95 /// Look at the next available character without removing it, if the queue is not empty.
96 pub fn peek(&self) -> Option<char> {
97 debug_assert!(
98 !self.buffers.borrow().iter().any(|el| el.len32() == 0),
99 "invariant \"all buffers in the queue are non-empty\" failed"
100 );
101 self.buffers
102 .borrow()
103 .front()
104 .map(|b| b.chars().next().unwrap())
105 }
106
107 /// Pops and returns either a single character from the given set, or
108 /// a buffer of characters none of which are in the set.
109 ///
110 /// # Examples
111 ///
112 /// ```
113 /// # #[macro_use] extern crate markup5ever;
114 /// # #[macro_use] extern crate tendril;
115 /// # fn main() {
116 /// use markup5ever::buffer_queue::{BufferQueue, SetResult};
117 ///
118 /// let mut queue = BufferQueue::default();
119 /// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
120 /// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
121 /// let tag = format_tendril!("some_tag");
122 /// let attr = format_tendril!("attr");
123 /// let attr_val = format_tendril!("text");
124 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
125 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
126 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
127 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
128 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
129 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
130 /// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
131 /// // ...
132 /// # }
133 /// ```
134 pub fn pop_except_from(&self, set: SmallCharSet) -> Option<SetResult> {
135 let (result, now_empty) = match self.buffers.borrow_mut().front_mut() {
136 None => (None, false),
137 Some(buf) => {
138 let n = set.nonmember_prefix_len(buf);
139 if n > 0 {
140 let out;
141 unsafe {
142 out = buf.unsafe_subtendril(0, n);
143 buf.unsafe_pop_front(n);
144 }
145 (Some(NotFromSet(out)), buf.is_empty())
146 } else {
147 let c = buf.pop_front_char().expect("empty buffer in queue");
148 (Some(FromSet(c)), buf.is_empty())
149 }
150 },
151 };
152
153 // Unborrow self for this part.
154 if now_empty {
155 self.buffers.borrow_mut().pop_front();
156 }
157
158 result
159 }
160
161 /// Consume bytes matching the pattern, using a custom comparison function `eq`.
162 ///
163 /// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
164 /// it wasn't possible to know (more data is needed).
165 ///
166 /// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
167 ///
168 /// # Examples
169 ///
170 /// ```
171 /// # extern crate markup5ever;
172 /// # #[macro_use] extern crate tendril;
173 /// # fn main() {
174 /// use markup5ever::buffer_queue::BufferQueue;
175 ///
176 /// let mut queue = BufferQueue::default();
177 /// queue.push_back(format_tendril!("testtext"));
178 /// let test_str = "test";
179 /// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
180 /// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
181 /// assert!(queue.is_empty());
182 /// # }
183 /// ```
184 pub fn eat<F: Fn(&u8, &u8) -> bool>(&self, pat: &str, eq: F) -> Option<bool> {
185 let mut buffers_exhausted = 0;
186 let mut consumed_from_last = 0;
187
188 self.buffers.borrow().front()?;
189
190 for pattern_byte in pat.bytes() {
191 if buffers_exhausted >= self.buffers.borrow().len() {
192 return None;
193 }
194 let buf = &self.buffers.borrow()[buffers_exhausted];
195
196 if !eq(&buf.as_bytes()[consumed_from_last], &pattern_byte) {
197 return Some(false);
198 }
199
200 consumed_from_last += 1;
201 if consumed_from_last >= buf.len() {
202 buffers_exhausted += 1;
203 consumed_from_last = 0;
204 }
205 }
206
207 // We have a match. Commit changes to the BufferQueue.
208 for _ in 0..buffers_exhausted {
209 self.buffers.borrow_mut().pop_front();
210 }
211
212 match self.buffers.borrow_mut().front_mut() {
213 None => assert_eq!(consumed_from_last, 0),
214 Some(ref mut buf) => buf.pop_front(consumed_from_last as u32),
215 }
216
217 Some(true)
218 }
219
220 /// Get the next character if one is available, removing it from the queue.
221 ///
222 /// This function manages the buffers, removing them as they become empty.
223 pub fn next(&self) -> Option<char> {
224 let (result, now_empty) = match self.buffers.borrow_mut().front_mut() {
225 None => (None, false),
226 Some(buf) => {
227 let c = buf.pop_front_char().expect("empty buffer in queue");
228 (Some(c), buf.is_empty())
229 },
230 };
231
232 if now_empty {
233 self.buffers.borrow_mut().pop_front();
234 }
235
236 result
237 }
238
239 pub fn replace_with(&self, other: BufferQueue) {
240 let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take());
241 }
242
243 pub fn swap_with(&self, other: &BufferQueue) {
244 mem::swap(
245 &mut *self.buffers.borrow_mut(),
246 &mut *other.buffers.borrow_mut(),
247 );
248 }
249}
250
251#[cfg(test)]
252#[allow(non_snake_case)]
253mod test {
254 use tendril::SliceExt;
255
256 use super::BufferQueue;
257 use super::SetResult::{FromSet, NotFromSet};
258
259 #[test]
260 fn smoke_test() {
261 let bq = BufferQueue::default();
262 assert_eq!(bq.peek(), None);
263 assert_eq!(bq.next(), None);
264
265 bq.push_back("abc".to_tendril());
266 assert_eq!(bq.peek(), Some('a'));
267 assert_eq!(bq.next(), Some('a'));
268 assert_eq!(bq.peek(), Some('b'));
269 assert_eq!(bq.peek(), Some('b'));
270 assert_eq!(bq.next(), Some('b'));
271 assert_eq!(bq.peek(), Some('c'));
272 assert_eq!(bq.next(), Some('c'));
273 assert_eq!(bq.peek(), None);
274 assert_eq!(bq.next(), None);
275 }
276
277 #[test]
278 fn can_unconsume() {
279 let bq = BufferQueue::default();
280 bq.push_back("abc".to_tendril());
281 assert_eq!(bq.next(), Some('a'));
282
283 bq.push_front("xy".to_tendril());
284 assert_eq!(bq.next(), Some('x'));
285 assert_eq!(bq.next(), Some('y'));
286 assert_eq!(bq.next(), Some('b'));
287 assert_eq!(bq.next(), Some('c'));
288 assert_eq!(bq.next(), None);
289 }
290
291 #[test]
292 fn can_pop_except_set() {
293 let bq = BufferQueue::default();
294 bq.push_back("abc&def".to_tendril());
295 let pop = || bq.pop_except_from(small_char_set!('&'));
296 assert_eq!(pop(), Some(NotFromSet("abc".to_tendril())));
297 assert_eq!(pop(), Some(FromSet('&')));
298 assert_eq!(pop(), Some(NotFromSet("def".to_tendril())));
299 assert_eq!(pop(), None);
300 }
301
302 #[test]
303 fn can_eat() {
304 // This is not very comprehensive. We rely on the tokenizer
305 // integration tests for more thorough testing with many
306 // different input buffer splits.
307 let bq = BufferQueue::default();
308 bq.push_back("a".to_tendril());
309 bq.push_back("bc".to_tendril());
310 assert_eq!(bq.eat("abcd", u8::eq_ignore_ascii_case), None);
311 assert_eq!(bq.eat("ax", u8::eq_ignore_ascii_case), Some(false));
312 assert_eq!(bq.eat("ab", u8::eq_ignore_ascii_case), Some(true));
313 assert_eq!(bq.next(), Some('c'));
314 assert_eq!(bq.next(), None);
315 }
316}
317

Provided by KDAB

Privacy Policy
Learn Rust with the experts
Find out more