1 | // This module defines some core types for dealing with accelerated DFA states. |
2 | // Briefly, a DFA state can be "accelerated" if all of its transitions except |
3 | // for a few loop back to itself. This directly implies that the only way out |
4 | // of such a state is if a byte corresponding to one of those non-loopback |
5 | // transitions is found. Such states are often found in simple repetitions in |
6 | // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its |
7 | // DFA with regex-cli: |
8 | // |
9 | // $ regex-cli debug dfa dense '(?-u)[^a]+a' -BbC |
10 | // dense::DFA( |
11 | // D 000000: |
12 | // Q 000001: |
13 | // *000002: |
14 | // A 000003: \x00-` => 3, a => 5, b-\xFF => 3 |
15 | // >000004: \x00-` => 3, a => 4, b-\xFF => 3 |
16 | // 000005: \x00-\xFF => 2, EOI => 2 |
17 | // ) |
18 | // |
19 | // In particular, state 3 is accelerated (shown via the 'A' indicator) since |
20 | // the only way to leave that state once entered is to see an 'a' byte. If |
21 | // there is a long run of non-'a' bytes, then using something like 'memchr' |
22 | // to find the next 'a' byte can be significantly faster than just using the |
23 | // standard byte-at-a-time state machine. |
24 | // |
25 | // Unfortunately, this optimization rarely applies when Unicode is enabled. |
26 | // For example, patterns like '[^a]' don't actually match any byte that isn't |
27 | // 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't |
28 | // 'a'. This makes the state machine much more complex---far beyond a single |
29 | // state---and removes the ability to easily accelerate it. (Because if the |
30 | // machine sees a non-UTF-8 sequence, then the machine won't match through it.) |
31 | // |
// In practice, we only consider accelerating states that have 3 or fewer
// non-loop transitions. Beyond that point the returns diminish, and 3 is also
// the largest number of needle bytes that the memchr crate supports. The
// structures below hard-code this assumption and provide (de)serialization
// APIs for use inside a DFA.
37 | // |
38 | // And finally, note that there is some trickery involved in making it very |
39 | // fast to not only check whether a state is accelerated at search time, but |
40 | // also to access the bytes to search for to implement the acceleration itself. |
41 | // dfa/special.rs provides more detail, but the short story is that all |
42 | // accelerated states appear contiguously in a DFA. This means we can represent |
43 | // the ID space of all accelerated DFA states with a single range. So given |
44 | // a state ID, we can determine whether it's accelerated via |
45 | // |
46 | // min_accel_id <= id <= max_accel_id |
47 | // |
48 | // And find its corresponding accelerator with: |
49 | // |
50 | // accels.get((id - min_accel_id) / dfa_stride) |
51 | |
52 | use core::convert::{TryFrom, TryInto}; |
53 | |
54 | #[cfg (feature = "alloc" )] |
55 | use alloc::{vec, vec::Vec}; |
56 | |
57 | use crate::util::bytes::{self, DeserializeError, Endian, SerializeError}; |
58 | |
/// The base type used to represent a collection of accelerators.
///
/// While an `Accel` is represented as a fixed size array of bytes, a
/// *collection* of `Accel`s (called `Accels`) is represented internally as a
/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
/// fairly low-risk unsafe code, it lets us remove the need for a second type
/// parameter in the definition of dense::DFA. (Which really wants everything
/// to be a slice of u32.)
type AccelTy = u32;

/// The size, in bytes, of the unit of representation for accelerators.
///
/// ACCEL_CAP *must* be a multiple of this size.
const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();

/// The maximum length in bytes that a single Accel can be. This is distinct
/// from the capacity of an accelerator in that the length represents only the
/// bytes that should be read: one length byte followed by at most three
/// needle bytes.
const ACCEL_LEN: usize = 4;

/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
/// multiple of 4 (our ID size) and because it gives us a little wiggle room
/// if we want to support more accel bytes in the future without a breaking
/// change.
///
/// This MUST be a multiple of ACCEL_TY_SIZE.
const ACCEL_CAP: usize = 8;
86 | |
87 | /// Search for between 1 and 3 needle bytes in the given haystack, starting the |
88 | /// search at the given position. If `needles` has a length other than 1-3, |
89 | /// then this panics. |
90 | #[inline (always)] |
91 | pub(crate) fn find_fwd( |
92 | needles: &[u8], |
93 | haystack: &[u8], |
94 | at: usize, |
95 | ) -> Option<usize> { |
96 | let bs: &[u8] = needles; |
97 | let i: usize = match needles.len() { |
98 | 1 => memchr::memchr(needle:bs[0], &haystack[at..])?, |
99 | 2 => memchr::memchr2(needle1:bs[0], needle2:bs[1], &haystack[at..])?, |
100 | 3 => memchr::memchr3(needle1:bs[0], needle2:bs[1], needle3:bs[2], &haystack[at..])?, |
101 | 0 => panic!("cannot find with empty needles" ), |
102 | n: usize => panic!("invalid needles length: {}" , n), |
103 | }; |
104 | Some(at + i) |
105 | } |
106 | |
107 | /// Search for between 1 and 3 needle bytes in the given haystack in reverse, |
108 | /// starting the search at the given position. If `needles` has a length other |
109 | /// than 1-3, then this panics. |
110 | #[inline (always)] |
111 | pub(crate) fn find_rev( |
112 | needles: &[u8], |
113 | haystack: &[u8], |
114 | at: usize, |
115 | ) -> Option<usize> { |
116 | let bs: &[u8] = needles; |
117 | match needles.len() { |
118 | 1 => memchr::memrchr(needle:bs[0], &haystack[..at]), |
119 | 2 => memchr::memrchr2(needle1:bs[0], needle2:bs[1], &haystack[..at]), |
120 | 3 => memchr::memrchr3(needle1:bs[0], needle2:bs[1], needle3:bs[2], &haystack[..at]), |
121 | 0 => panic!("cannot find with empty needles" ), |
122 | n: usize => panic!("invalid needles length: {}" , n), |
123 | } |
124 | } |
125 | |
/// Represents the accelerators for all accelerated states in a dense DFA.
///
/// The `A` type parameter represents the type of the underlying storage.
/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
#[derive(Clone)]
pub(crate) struct Accels<A> {
    /// A length prefixed slice of contiguous accelerators. See the top comment
    /// in this module for more details on how we can jump from a DFA's state
    /// ID to an accelerator in this list.
    ///
    /// The first element (i.e., the first 4 bytes) always corresponds to the
    /// number of accelerators that follow. Each accelerator after it occupies
    /// ACCEL_CAP bytes, which is two `AccelTy` elements.
    accels: A,
}
140 | |
#[cfg(feature = "alloc")]
impl Accels<Vec<AccelTy>> {
    /// Build a sequence that holds no accelerators.
    ///
    /// The backing vector still contains a single element: the length prefix,
    /// which starts out as zero.
    pub fn empty() -> Accels<Vec<AccelTy>> {
        let accels = vec![0];
        Accels { accels }
    }

    /// Append an accelerator to the end of this sequence.
    ///
    /// Because an accelerator is looked up by its position, accelerators
    /// should be added in correspondence with the order of their states in
    /// the DFA.
    ///
    /// This panics if this results in more accelerators than AccelTy::MAX.
    pub fn add(&mut self, accel: Accel) {
        let units = accel.as_accel_tys();
        self.accels.extend_from_slice(&units);
        // The length prefix lives in element 0 and is untouched by the
        // append above, so it still holds the old count here.
        self.set_len(self.len() + 1);
    }

    /// Record the number of accelerators in this sequence by overwriting the
    /// length prefix stored in the first element of the underlying storage.
    fn set_len(&mut self, new_len: usize) {
        // An accelerator is only ever added for a state that exists, and the
        // index of any existing state is guaranteed to be representable by
        // an AccelTy by virtue of the guarantees provided by StateID. So this
        // conversion is not expected to fail.
        self.accels[0] = AccelTy::try_from(new_len).unwrap();
    }
}
171 | |
impl<'a> Accels<&'a [AccelTy]> {
    /// Deserialize a sequence of accelerators from the given bytes. If there
    /// was a problem deserializing, then an error is returned.
    ///
    /// On success, this returns the accelerators (borrowing from `slice`)
    /// along with the number of bytes read from the beginning of `slice`.
    ///
    /// This is guaranteed to run in constant time. This does not guarantee
    /// that every accelerator in the returned collection is valid. Thus,
    /// accessing one may panic, or unsafe code that relies on accelerators
    /// being correct may result in UB.
    ///
    /// Callers may check the validity of every accelerator with the `validate`
    /// method.
    ///
    /// # Safety
    ///
    /// The length and alignment of `slice` are checked here, but the contents
    /// of the individual accelerators are not. Callers must not rely on the
    /// accelerators being well formed unless `validate` has succeeded.
    ///
    /// # Errors
    ///
    /// An error is returned if the count cannot be read, if computing the
    /// total size fails, or if the slice length or alignment check fails.
    pub unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
        // Remember where we started so that the number of bytes consumed can
        // be computed at the end.
        let slice_start = slice.as_ptr() as usize;

        let (count, _) =
            bytes::try_read_u32_as_usize(slice, "accelerators count")?;
        // The accelerator count is part of the accel_tys slice that
        // we deserialize. This is perhaps a bit idiosyncratic. It would
        // probably be better to split out the count into a real field.

        // Every accelerator takes up 2 AccelTy units, plus 1 more unit for
        // the count prefix itself.
        let accel_tys_count = bytes::add(
            bytes::mul(count, 2, "total number of accelerator accel_tys")?,
            1,
            "total number of accel_tys",
        )?;
        let accel_tys_len = bytes::mul(
            ACCEL_TY_SIZE,
            accel_tys_count,
            "total number of bytes in accelerators",
        )?;
        bytes::check_slice_len(slice, accel_tys_len, "accelerators")?;
        bytes::check_alignment::<AccelTy>(slice)?;
        let accel_tys = &slice[..accel_tys_len];
        slice = &slice[accel_tys_len..];
        // SAFETY: We've checked the length and alignment above, and since
        // slice is just bytes, we can safely cast to a slice of &[AccelTy].
        #[allow(unused_unsafe)]
        let accels = unsafe {
            core::slice::from_raw_parts(
                accel_tys.as_ptr() as *const AccelTy,
                accel_tys_count,
            )
        };
        // `slice` now begins just past the accelerators, so the pointer
        // difference is the number of bytes read.
        Ok((Accels { accels }, slice.as_ptr() as usize - slice_start))
    }
}
220 | |
221 | impl<A: AsRef<[AccelTy]>> Accels<A> { |
222 | /// Return an owned version of the accelerators. |
223 | #[cfg (feature = "alloc" )] |
224 | pub fn to_owned(&self) -> Accels<Vec<AccelTy>> { |
225 | Accels { accels: self.accels.as_ref().to_vec() } |
226 | } |
227 | |
228 | /// Return a borrowed version of the accelerators. |
229 | pub fn as_ref(&self) -> Accels<&[AccelTy]> { |
230 | Accels { accels: self.accels.as_ref() } |
231 | } |
232 | |
233 | /// Return the bytes representing the serialization of the accelerators. |
234 | pub fn as_bytes(&self) -> &[u8] { |
235 | let accels = self.accels.as_ref(); |
236 | // SAFETY: This is safe because accels is a just a slice of AccelTy, |
237 | // and u8 always has a smaller alignment. |
238 | unsafe { |
239 | core::slice::from_raw_parts( |
240 | accels.as_ptr() as *const u8, |
241 | accels.len() * ACCEL_TY_SIZE, |
242 | ) |
243 | } |
244 | } |
245 | |
246 | /// Returns the memory usage, in bytes, of these accelerators. |
247 | /// |
248 | /// The memory usage is computed based on the number of bytes used to |
249 | /// represent all of the accelerators. |
250 | /// |
251 | /// This does **not** include the stack size used by this value. |
252 | pub fn memory_usage(&self) -> usize { |
253 | self.as_bytes().len() |
254 | } |
255 | |
256 | /// Return the bytes to search for corresponding to the accelerator in this |
257 | /// sequence at index `i`. If no such accelerator exists, then this panics. |
258 | /// |
259 | /// The significance of the index is that it should be in correspondence |
260 | /// with the index of the corresponding DFA. That is, accelerated DFA |
261 | /// states are stored contiguously in the DFA and have an ordering implied |
262 | /// by their respective state IDs. The state's index in that sequence |
263 | /// corresponds to the index of its corresponding accelerator. |
264 | #[inline (always)] |
265 | pub fn needles(&self, i: usize) -> &[u8] { |
266 | if i >= self.len() { |
267 | panic!("invalid accelerator index {}" , i); |
268 | } |
269 | let bytes = self.as_bytes(); |
270 | let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; |
271 | let len = bytes[offset] as usize; |
272 | &bytes[offset + 1..offset + 1 + len] |
273 | } |
274 | |
275 | /// Return the total number of accelerators in this sequence. |
276 | pub fn len(&self) -> usize { |
277 | // This should never panic since deserialization checks that the |
278 | // length can fit into a usize. |
279 | usize::try_from(self.accels.as_ref()[0]).unwrap() |
280 | } |
281 | |
282 | /// Return the accelerator in this sequence at index `i`. If no such |
283 | /// accelerator exists, then this returns None. |
284 | /// |
285 | /// See the docs for `needles` on the significance of the index. |
286 | fn get(&self, i: usize) -> Option<Accel> { |
287 | if i >= self.len() { |
288 | return None; |
289 | } |
290 | let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; |
291 | let accel = Accel::from_slice(&self.as_bytes()[offset..]) |
292 | .expect("Accels must contain valid accelerators" ); |
293 | Some(accel) |
294 | } |
295 | |
296 | /// Returns an iterator of accelerators in this sequence. |
297 | fn iter(&self) -> IterAccels<'_, A> { |
298 | IterAccels { accels: self, i: 0 } |
299 | } |
300 | |
301 | /// Writes these accelerators to the given byte buffer using the indicated |
302 | /// endianness. If the given buffer is too small, then an error is |
303 | /// returned. Upon success, the total number of bytes written is returned. |
304 | /// The number of bytes written is guaranteed to be a multiple of 8. |
305 | pub fn write_to<E: Endian>( |
306 | &self, |
307 | dst: &mut [u8], |
308 | ) -> Result<usize, SerializeError> { |
309 | let nwrite = self.write_to_len(); |
310 | assert_eq!( |
311 | nwrite % ACCEL_TY_SIZE, |
312 | 0, |
313 | "expected accelerator bytes written to be a multiple of {}" , |
314 | ACCEL_TY_SIZE, |
315 | ); |
316 | if dst.len() < nwrite { |
317 | return Err(SerializeError::buffer_too_small("accelerators" )); |
318 | } |
319 | |
320 | // The number of accelerators can never exceed AccelTy::MAX. |
321 | E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst); |
322 | // The actual accelerators are just raw bytes and thus their endianness |
323 | // is irrelevant. So we can copy them as bytes. |
324 | dst[ACCEL_TY_SIZE..nwrite] |
325 | .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]); |
326 | Ok(nwrite) |
327 | } |
328 | |
329 | /// Validates that every accelerator in this collection can be successfully |
330 | /// deserialized as a valid accelerator. |
331 | pub fn validate(&self) -> Result<(), DeserializeError> { |
332 | for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) { |
333 | let _ = Accel::from_slice(chunk)?; |
334 | } |
335 | Ok(()) |
336 | } |
337 | |
338 | /// Returns the total number of bytes written by `write_to`. |
339 | pub fn write_to_len(&self) -> usize { |
340 | self.as_bytes().len() |
341 | } |
342 | } |
343 | |
344 | impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> { |
345 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
346 | write!(f, "Accels(" )?; |
347 | let mut list: DebugList<'_, '_> = f.debug_list(); |
348 | for a: Accel in self.iter() { |
349 | list.entry(&a); |
350 | } |
351 | list.finish()?; |
352 | write!(f, ")" ) |
353 | } |
354 | } |
355 | |
/// An iterator over the accelerators in an `Accels` collection.
///
/// Accelerators are yielded in index order.
#[derive(Debug)]
struct IterAccels<'a, A: AsRef<[AccelTy]>> {
    /// The collection being iterated over.
    accels: &'a Accels<A>,
    /// The index of the next accelerator to yield.
    i: usize,
}
361 | |
362 | impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> { |
363 | type Item = Accel; |
364 | |
365 | fn next(&mut self) -> Option<Accel> { |
366 | let accel: Accel = self.accels.get(self.i)?; |
367 | self.i += 1; |
368 | Some(accel) |
369 | } |
370 | } |
371 | |
/// Accel represents a structure for determining how to "accelerate" a DFA
/// state.
///
/// Namely, it contains zero or more bytes that must be seen in order for the
/// DFA to leave the state it is associated with. In practice, the actual range
/// is 1 to 3 bytes.
///
/// The purpose of acceleration is to identify states whose vast majority
/// of transitions are just loops back to the same state. For example,
/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
/// looking for the next occurrence of either `a` or `b` instead of explicitly
/// following transitions. (In this case, `b` transitions to the next state
/// whereas `a` would transition to the dead state.)
#[derive(Clone)]
pub(crate) struct Accel {
    /// The first byte is the length, i.e., the number of accelerated bytes.
    /// Subsequent bytes are the accelerated bytes themselves. Any remaining
    /// bytes are unused padding.
    ///
    /// Note that we make every accelerator 8 bytes as a slightly wasteful
    /// way of making sure alignment is always correct for state ID sizes of
    /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
    /// particularly common, especially when Unicode is enabled.
    bytes: [u8; ACCEL_CAP],
}
398 | |
399 | impl Accel { |
400 | /// Returns an empty accel, where no bytes are accelerated. |
401 | #[cfg (feature = "alloc" )] |
402 | pub fn new() -> Accel { |
403 | Accel { bytes: [0; ACCEL_CAP] } |
404 | } |
405 | |
406 | /// Returns a verified accelerator derived from the beginning of the given |
407 | /// slice. |
408 | /// |
409 | /// If the slice is not long enough or contains invalid bytes for an |
410 | /// accelerator, then this returns an error. |
411 | pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> { |
412 | slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())]; |
413 | let bytes = slice |
414 | .try_into() |
415 | .map_err(|_| DeserializeError::buffer_too_small("accelerator" ))?; |
416 | Accel::from_bytes(bytes) |
417 | } |
418 | |
419 | /// Returns a verified accelerator derived from raw bytes. |
420 | /// |
421 | /// If the given bytes are invalid, then this returns an error. |
422 | fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> { |
423 | if bytes[0] as usize >= ACCEL_LEN { |
424 | return Err(DeserializeError::generic( |
425 | "accelerator bytes cannot have length more than 3" , |
426 | )); |
427 | } |
428 | Ok(Accel::from_bytes_unchecked(bytes)) |
429 | } |
430 | |
431 | /// Returns an accelerator derived from raw bytes. |
432 | /// |
433 | /// This does not check whether the given bytes are valid. Invalid bytes |
434 | /// cannot sacrifice memory safety, but may result in panics or silent |
435 | /// logic bugs. |
436 | fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel { |
437 | Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] } |
438 | } |
439 | |
440 | /// Attempts to add the given byte to this accelerator. If the accelerator |
441 | /// is already full then this returns false. Otherwise, returns true. |
442 | /// |
443 | /// If the given byte is already in this accelerator, then it panics. |
444 | #[cfg (feature = "alloc" )] |
445 | pub fn add(&mut self, byte: u8) -> bool { |
446 | if self.len() >= 3 { |
447 | return false; |
448 | } |
449 | assert!( |
450 | !self.contains(byte), |
451 | "accelerator already contains {:?}" , |
452 | crate::util::DebugByte(byte) |
453 | ); |
454 | self.bytes[self.len() + 1] = byte; |
455 | self.bytes[0] += 1; |
456 | true |
457 | } |
458 | |
459 | /// Return the number of bytes in this accelerator. |
460 | pub fn len(&self) -> usize { |
461 | self.bytes[0] as usize |
462 | } |
463 | |
464 | /// Returns true if and only if there are no bytes in this accelerator. |
465 | #[cfg (feature = "alloc" )] |
466 | pub fn is_empty(&self) -> bool { |
467 | self.len() == 0 |
468 | } |
469 | |
470 | /// Returns the slice of bytes to accelerate. |
471 | /// |
472 | /// If this accelerator is empty, then this returns an empty slice. |
473 | fn needles(&self) -> &[u8] { |
474 | &self.bytes[1..1 + self.len()] |
475 | } |
476 | |
477 | /// Returns true if and only if this accelerator will accelerate the given |
478 | /// byte. |
479 | #[cfg (feature = "alloc" )] |
480 | fn contains(&self, byte: u8) -> bool { |
481 | self.needles().iter().position(|&b| b == byte).is_some() |
482 | } |
483 | |
484 | /// Returns the accelerator bytes as an array of AccelTys. |
485 | #[cfg (feature = "alloc" )] |
486 | fn as_accel_tys(&self) -> [AccelTy; 2] { |
487 | assert_eq!(ACCEL_CAP, 8); |
488 | // These unwraps are OK since ACCEL_CAP is set to 8. |
489 | let first = |
490 | AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap()); |
491 | let second = |
492 | AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap()); |
493 | [first, second] |
494 | } |
495 | } |
496 | |
497 | impl core::fmt::Debug for Accel { |
498 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
499 | write!(f, "Accel(" )?; |
500 | let mut set: DebugSet<'_, '_> = f.debug_set(); |
501 | for &b: u8 in self.needles() { |
502 | set.entry(&crate::util::DebugByte(b)); |
503 | } |
504 | set.finish()?; |
505 | write!(f, ")" ) |
506 | } |
507 | } |
508 | |