1 | // This module defines some core types for dealing with accelerated DFA states. |
2 | // Briefly, a DFA state can be "accelerated" if all of its transitions except |
3 | // for a few loop back to itself. This directly implies that the only way out |
4 | // of such a state is if a byte corresponding to one of those non-loopback |
5 | // transitions is found. Such states are often found in simple repetitions in |
6 | // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its |
7 | // DFA with regex-cli: |
8 | // |
9 | // $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table |
10 | // D 000000: |
11 | // Q 000001: |
12 | // *000002: |
13 | // A 000003: \x00-` => 3, a => 8, b-\xFF => 3 |
14 | // A 000004: \x00-` => 4, a => 7, b-\xFF => 4 |
15 | // 000005: \x00-` => 4, b-\xFF => 4 |
16 | // 000006: \x00-` => 3, a => 6, b-\xFF => 3 |
17 | // 000007: \x00-\xFF => 2, EOI => 2 |
18 | // 000008: \x00-\xFF => 2, EOI => 2 |
19 | // |
20 | // In particular, state 3 is accelerated (shown via the 'A' indicator) since |
21 | // the only way to leave that state once entered is to see an 'a' byte. If |
22 | // there is a long run of non-'a' bytes, then using something like 'memchr' |
23 | // to find the next 'a' byte can be significantly faster than just using the |
24 | // standard byte-at-a-time state machine. |
25 | // |
26 | // Unfortunately, this optimization rarely applies when Unicode is enabled. |
27 | // For example, patterns like '[^a]' don't actually match any byte that isn't |
28 | // 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't |
29 | // 'a'. This makes the state machine much more complex---far beyond a single |
30 | // state---and removes the ability to easily accelerate it. (Because if the |
31 | // machine sees a non-UTF-8 sequence, then the machine won't match through it.) |
32 | // |
33 | // In practice, we only consider accelerating states that have 3 or fewer |
34 | // non-loop transitions. At a certain point, you get diminishing returns, but |
35 | // also because that's what the memchr crate supports. The structures below |
36 | // hard-code this assumption and provide (de)serialization APIs for use inside |
37 | // a DFA. |
38 | // |
39 | // And finally, note that there is some trickery involved in making it very |
40 | // fast to not only check whether a state is accelerated at search time, but |
41 | // also to access the bytes to search for to implement the acceleration itself. |
42 | // dfa/special.rs provides more detail, but the short story is that all |
43 | // accelerated states appear contiguously in a DFA. This means we can represent |
44 | // the ID space of all accelerated DFA states with a single range. So given |
45 | // a state ID, we can determine whether it's accelerated via |
46 | // |
47 | // min_accel_id <= id <= max_accel_id |
48 | // |
49 | // And find its corresponding accelerator with: |
50 | // |
51 | // accels.get((id - min_accel_id) / dfa_stride) |
52 | |
53 | #[cfg (feature = "dfa-build" )] |
54 | use alloc::{vec, vec::Vec}; |
55 | |
56 | use crate::util::{ |
57 | int::Pointer, |
58 | memchr, |
59 | wire::{self, DeserializeError, Endian, SerializeError}, |
60 | }; |
61 | |
62 | /// The base type used to represent a collection of accelerators. |
63 | /// |
64 | /// While an `Accel` is represented as a fixed size array of bytes, a |
65 | /// *collection* of `Accel`s (called `Accels`) is represented internally as a |
66 | /// slice of u32. While it's a bit unnatural to do this and costs us a bit of |
67 | /// fairly low-risk not-safe code, it lets us remove the need for a second type |
68 | /// parameter in the definition of dense::DFA. (Which really wants everything |
69 | /// to be a slice of u32.) |
70 | type AccelTy = u32; |
71 | |
72 | /// The size of the unit of representation for accelerators. |
73 | /// |
74 | /// ACCEL_CAP *must* be a multiple of this size. |
75 | const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>(); |
76 | |
77 | /// The maximum length in bytes that a single Accel can be. This is distinct |
78 | /// from the capacity of an accelerator in that the length represents only the |
79 | /// bytes that should be read. |
80 | const ACCEL_LEN: usize = 4; |
81 | |
82 | /// The capacity of each accelerator, in bytes. We set this to 8 since it's a |
83 | /// multiple of 4 (our ID size) and because it gives us a little wiggle room |
84 | /// if we want to support more accel bytes in the future without a breaking |
85 | /// change. |
86 | /// |
87 | /// This MUST be a multiple of ACCEL_TY_SIZE. |
88 | const ACCEL_CAP: usize = 8; |
89 | |
90 | /// Search for between 1 and 3 needle bytes in the given haystack, starting the |
91 | /// search at the given position. If `needles` has a length other than 1-3, |
92 | /// then this panics. |
93 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
94 | pub(crate) fn find_fwd( |
95 | needles: &[u8], |
96 | haystack: &[u8], |
97 | at: usize, |
98 | ) -> Option<usize> { |
99 | let bs = needles; |
100 | let i = match needles.len() { |
101 | 1 => memchr::memchr(bs[0], &haystack[at..])?, |
102 | 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, |
103 | 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, |
104 | 0 => panic!("cannot find with empty needles" ), |
105 | n => panic!("invalid needles length: {}" , n), |
106 | }; |
107 | Some(at + i) |
108 | } |
109 | |
110 | /// Search for between 1 and 3 needle bytes in the given haystack in reverse, |
111 | /// starting the search at the given position. If `needles` has a length other |
112 | /// than 1-3, then this panics. |
113 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
114 | pub(crate) fn find_rev( |
115 | needles: &[u8], |
116 | haystack: &[u8], |
117 | at: usize, |
118 | ) -> Option<usize> { |
119 | let bs = needles; |
120 | match needles.len() { |
121 | 1 => memchr::memrchr(bs[0], &haystack[..at]), |
122 | 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), |
123 | 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), |
124 | 0 => panic!("cannot find with empty needles" ), |
125 | n => panic!("invalid needles length: {}" , n), |
126 | } |
127 | } |
128 | |
129 | /// Represents the accelerators for all accelerated states in a dense DFA. |
130 | /// |
131 | /// The `A` type parameter represents the type of the underlying bytes. |
132 | /// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`. |
133 | #[derive(Clone)] |
134 | pub(crate) struct Accels<A> { |
135 | /// A length prefixed slice of contiguous accelerators. See the top comment |
136 | /// in this module for more details on how we can jump from a DFA's state |
137 | /// ID to an accelerator in this list. |
138 | /// |
139 | /// The first 4 bytes always correspond to the number of accelerators |
140 | /// that follow. |
141 | accels: A, |
142 | } |
143 | |
144 | #[cfg (feature = "dfa-build" )] |
145 | impl Accels<Vec<AccelTy>> { |
146 | /// Create an empty sequence of accelerators for a DFA. |
147 | pub fn empty() -> Accels<Vec<AccelTy>> { |
148 | Accels { accels: vec![0] } |
149 | } |
150 | |
151 | /// Add an accelerator to this sequence. |
152 | /// |
153 | /// This adds to the accelerator to the end of the sequence and therefore |
154 | /// should be done in correspondence with its state in the DFA. |
155 | /// |
156 | /// This panics if this results in more accelerators than AccelTy::MAX. |
157 | pub fn add(&mut self, accel: Accel) { |
158 | self.accels.extend_from_slice(&accel.as_accel_tys()); |
159 | let len = self.len(); |
160 | self.set_len(len + 1); |
161 | } |
162 | |
163 | /// Set the number of accelerators in this sequence, which is encoded in |
164 | /// the first 4 bytes of the underlying bytes. |
165 | fn set_len(&mut self, new_len: usize) { |
166 | // The only way an accelerator gets added is if a state exists for |
167 | // it, and if a state exists, then its index is guaranteed to be |
168 | // representable by a AccelTy by virtue of the guarantees provided by |
169 | // StateID. |
170 | let new_len = AccelTy::try_from(new_len).unwrap(); |
171 | self.accels[0] = new_len; |
172 | } |
173 | } |
174 | |
175 | impl<'a> Accels<&'a [AccelTy]> { |
176 | /// Deserialize a sequence of accelerators from the given bytes. If there |
177 | /// was a problem deserializing, then an error is returned. |
178 | /// |
179 | /// This is guaranteed to run in constant time. This does not guarantee |
180 | /// that every accelerator in the returned collection is valid. Thus, |
181 | /// accessing one may panic, or not-safe code that relies on accelerators |
182 | /// being correct my result in UB. |
183 | /// |
184 | /// Callers may check the validity of every accelerator with the `validate` |
185 | /// method. |
186 | pub fn from_bytes_unchecked( |
187 | mut slice: &'a [u8], |
188 | ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> { |
189 | let slice_start = slice.as_ptr().as_usize(); |
190 | |
191 | let (accel_len, _) = |
192 | wire::try_read_u32_as_usize(slice, "accelerators length" )?; |
193 | // The accelerator length is part of the accel_tys slice that |
194 | // we deserialize. This is perhaps a bit idiosyncratic. It would |
195 | // probably be better to split out the length into a real field. |
196 | |
197 | let accel_tys_len = wire::add( |
198 | wire::mul(accel_len, 2, "total number of accelerator accel_tys" )?, |
199 | 1, |
200 | "total number of accel_tys" , |
201 | )?; |
202 | let accel_tys_bytes_len = wire::mul( |
203 | ACCEL_TY_SIZE, |
204 | accel_tys_len, |
205 | "total number of bytes in accelerators" , |
206 | )?; |
207 | wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators" )?; |
208 | wire::check_alignment::<AccelTy>(slice)?; |
209 | let accel_tys = &slice[..accel_tys_bytes_len]; |
210 | slice = &slice[accel_tys_bytes_len..]; |
211 | // SAFETY: We've checked the length and alignment above, and since |
212 | // slice is just bytes and AccelTy is just a u32, we can safely cast to |
213 | // a slice of &[AccelTy]. |
214 | let accels = unsafe { |
215 | core::slice::from_raw_parts( |
216 | accel_tys.as_ptr().cast::<AccelTy>(), |
217 | accel_tys_len, |
218 | ) |
219 | }; |
220 | Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start)) |
221 | } |
222 | } |
223 | |
224 | impl<A: AsRef<[AccelTy]>> Accels<A> { |
225 | /// Return an owned version of the accelerators. |
226 | #[cfg (feature = "alloc" )] |
227 | pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> { |
228 | Accels { accels: self.accels.as_ref().to_vec() } |
229 | } |
230 | |
231 | /// Return a borrowed version of the accelerators. |
232 | pub fn as_ref(&self) -> Accels<&[AccelTy]> { |
233 | Accels { accels: self.accels.as_ref() } |
234 | } |
235 | |
236 | /// Return the bytes representing the serialization of the accelerators. |
237 | pub fn as_bytes(&self) -> &[u8] { |
238 | let accels = self.accels.as_ref(); |
239 | // SAFETY: This is safe because accels is a just a slice of AccelTy, |
240 | // and u8 always has a smaller alignment. |
241 | unsafe { |
242 | core::slice::from_raw_parts( |
243 | accels.as_ptr().cast::<u8>(), |
244 | accels.len() * ACCEL_TY_SIZE, |
245 | ) |
246 | } |
247 | } |
248 | |
249 | /// Returns the memory usage, in bytes, of these accelerators. |
250 | /// |
251 | /// The memory usage is computed based on the number of bytes used to |
252 | /// represent all of the accelerators. |
253 | /// |
254 | /// This does **not** include the stack size used by this value. |
255 | pub fn memory_usage(&self) -> usize { |
256 | self.as_bytes().len() |
257 | } |
258 | |
259 | /// Return the bytes to search for corresponding to the accelerator in this |
260 | /// sequence at index `i`. If no such accelerator exists, then this panics. |
261 | /// |
262 | /// The significance of the index is that it should be in correspondence |
263 | /// with the index of the corresponding DFA. That is, accelerated DFA |
264 | /// states are stored contiguously in the DFA and have an ordering implied |
265 | /// by their respective state IDs. The state's index in that sequence |
266 | /// corresponds to the index of its corresponding accelerator. |
267 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
268 | pub fn needles(&self, i: usize) -> &[u8] { |
269 | if i >= self.len() { |
270 | panic!("invalid accelerator index {}" , i); |
271 | } |
272 | let bytes = self.as_bytes(); |
273 | let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; |
274 | let len = usize::from(bytes[offset]); |
275 | &bytes[offset + 1..offset + 1 + len] |
276 | } |
277 | |
278 | /// Return the total number of accelerators in this sequence. |
279 | pub fn len(&self) -> usize { |
280 | // This should never panic since deserialization checks that the |
281 | // length can fit into a usize. |
282 | usize::try_from(self.accels.as_ref()[0]).unwrap() |
283 | } |
284 | |
285 | /// Return the accelerator in this sequence at index `i`. If no such |
286 | /// accelerator exists, then this returns None. |
287 | /// |
288 | /// See the docs for `needles` on the significance of the index. |
289 | fn get(&self, i: usize) -> Option<Accel> { |
290 | if i >= self.len() { |
291 | return None; |
292 | } |
293 | let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; |
294 | let accel = Accel::from_slice(&self.as_bytes()[offset..]) |
295 | .expect("Accels must contain valid accelerators" ); |
296 | Some(accel) |
297 | } |
298 | |
299 | /// Returns an iterator of accelerators in this sequence. |
300 | fn iter(&self) -> IterAccels<'_, A> { |
301 | IterAccels { accels: self, i: 0 } |
302 | } |
303 | |
304 | /// Writes these accelerators to the given byte buffer using the indicated |
305 | /// endianness. If the given buffer is too small, then an error is |
306 | /// returned. Upon success, the total number of bytes written is returned. |
307 | /// The number of bytes written is guaranteed to be a multiple of 8. |
308 | pub fn write_to<E: Endian>( |
309 | &self, |
310 | dst: &mut [u8], |
311 | ) -> Result<usize, SerializeError> { |
312 | let nwrite = self.write_to_len(); |
313 | assert_eq!( |
314 | nwrite % ACCEL_TY_SIZE, |
315 | 0, |
316 | "expected accelerator bytes written to be a multiple of {}" , |
317 | ACCEL_TY_SIZE, |
318 | ); |
319 | if dst.len() < nwrite { |
320 | return Err(SerializeError::buffer_too_small("accelerators" )); |
321 | } |
322 | |
323 | // The number of accelerators can never exceed AccelTy::MAX. |
324 | E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst); |
325 | // The actual accelerators are just raw bytes and thus their endianness |
326 | // is irrelevant. So we can copy them as bytes. |
327 | dst[ACCEL_TY_SIZE..nwrite] |
328 | .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]); |
329 | Ok(nwrite) |
330 | } |
331 | |
332 | /// Validates that every accelerator in this collection can be successfully |
333 | /// deserialized as a valid accelerator. |
334 | pub fn validate(&self) -> Result<(), DeserializeError> { |
335 | for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) { |
336 | let _ = Accel::from_slice(chunk)?; |
337 | } |
338 | Ok(()) |
339 | } |
340 | |
341 | /// Returns the total number of bytes written by `write_to`. |
342 | pub fn write_to_len(&self) -> usize { |
343 | self.as_bytes().len() |
344 | } |
345 | } |
346 | |
347 | impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> { |
348 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
349 | write!(f, "Accels(" )?; |
350 | let mut list = f.debug_list(); |
351 | for a in self.iter() { |
352 | list.entry(&a); |
353 | } |
354 | list.finish()?; |
355 | write!(f, ")" ) |
356 | } |
357 | } |
358 | |
359 | #[derive(Debug)] |
360 | struct IterAccels<'a, A: AsRef<[AccelTy]>> { |
361 | accels: &'a Accels<A>, |
362 | i: usize, |
363 | } |
364 | |
365 | impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> { |
366 | type Item = Accel; |
367 | |
368 | fn next(&mut self) -> Option<Accel> { |
369 | let accel = self.accels.get(self.i)?; |
370 | self.i += 1; |
371 | Some(accel) |
372 | } |
373 | } |
374 | |
375 | /// Accel represents a structure for determining how to "accelerate" a DFA |
376 | /// state. |
377 | /// |
378 | /// Namely, it contains zero or more bytes that must be seen in order for the |
379 | /// DFA to leave the state it is associated with. In practice, the actual range |
380 | /// is 1 to 3 bytes. |
381 | /// |
382 | /// The purpose of acceleration is to identify states whose vast majority |
383 | /// of transitions are just loops back to the same state. For example, |
384 | /// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state |
385 | /// (corresponding to `[^a]+`) where all transitions *except* for `a` and |
386 | /// `b` loop back to itself. Thus, this state can be "accelerated" by simply |
387 | /// looking for the next occurrence of either `a` or `b` instead of explicitly |
388 | /// following transitions. (In this case, `b` transitions to the next state |
389 | /// where as `a` would transition to the dead state.) |
390 | #[derive(Clone)] |
391 | pub(crate) struct Accel { |
392 | /// The first byte is the length. Subsequent bytes are the accelerated |
393 | /// bytes. |
394 | /// |
395 | /// Note that we make every accelerator 8 bytes as a slightly wasteful |
396 | /// way of making sure alignment is always correct for state ID sizes of |
397 | /// 1, 2, 4 and 8. This should be okay since accelerated states aren't |
398 | /// particularly common, especially when Unicode is enabled. |
399 | bytes: [u8; ACCEL_CAP], |
400 | } |
401 | |
402 | impl Accel { |
403 | /// Returns an empty accel, where no bytes are accelerated. |
404 | #[cfg (feature = "dfa-build" )] |
405 | pub fn new() -> Accel { |
406 | Accel { bytes: [0; ACCEL_CAP] } |
407 | } |
408 | |
409 | /// Returns a verified accelerator derived from the beginning of the given |
410 | /// slice. |
411 | /// |
412 | /// If the slice is not long enough or contains invalid bytes for an |
413 | /// accelerator, then this returns an error. |
414 | pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> { |
415 | slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())]; |
416 | let bytes = slice |
417 | .try_into() |
418 | .map_err(|_| DeserializeError::buffer_too_small("accelerator" ))?; |
419 | Accel::from_bytes(bytes) |
420 | } |
421 | |
422 | /// Returns a verified accelerator derived from raw bytes. |
423 | /// |
424 | /// If the given bytes are invalid, then this returns an error. |
425 | fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> { |
426 | if usize::from(bytes[0]) >= ACCEL_LEN { |
427 | return Err(DeserializeError::generic( |
428 | "accelerator bytes cannot have length more than 3" , |
429 | )); |
430 | } |
431 | Ok(Accel::from_bytes_unchecked(bytes)) |
432 | } |
433 | |
434 | /// Returns an accelerator derived from raw bytes. |
435 | /// |
436 | /// This does not check whether the given bytes are valid. Invalid bytes |
437 | /// cannot sacrifice memory safety, but may result in panics or silent |
438 | /// logic bugs. |
439 | fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel { |
440 | Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] } |
441 | } |
442 | |
443 | /// Attempts to add the given byte to this accelerator. If the accelerator |
444 | /// is already full or thinks the byte is a poor accelerator, then this |
445 | /// returns false. Otherwise, returns true. |
446 | /// |
447 | /// If the given byte is already in this accelerator, then it panics. |
448 | #[cfg (feature = "dfa-build" )] |
449 | pub fn add(&mut self, byte: u8) -> bool { |
450 | if self.len() >= 3 { |
451 | return false; |
452 | } |
453 | // As a special case, we totally reject trying to accelerate a state |
454 | // with an ASCII space. In most cases, it occurs very frequently, and |
455 | // tends to result in worse overall performance. |
456 | if byte == b' ' { |
457 | return false; |
458 | } |
459 | assert!( |
460 | !self.contains(byte), |
461 | "accelerator already contains {:?}" , |
462 | crate::util::escape::DebugByte(byte) |
463 | ); |
464 | self.bytes[self.len() + 1] = byte; |
465 | self.bytes[0] += 1; |
466 | true |
467 | } |
468 | |
469 | /// Return the number of bytes in this accelerator. |
470 | pub fn len(&self) -> usize { |
471 | usize::from(self.bytes[0]) |
472 | } |
473 | |
474 | /// Returns true if and only if there are no bytes in this accelerator. |
475 | #[cfg (feature = "dfa-build" )] |
476 | pub fn is_empty(&self) -> bool { |
477 | self.len() == 0 |
478 | } |
479 | |
480 | /// Returns the slice of bytes to accelerate. |
481 | /// |
482 | /// If this accelerator is empty, then this returns an empty slice. |
483 | fn needles(&self) -> &[u8] { |
484 | &self.bytes[1..1 + self.len()] |
485 | } |
486 | |
487 | /// Returns true if and only if this accelerator will accelerate the given |
488 | /// byte. |
489 | #[cfg (feature = "dfa-build" )] |
490 | fn contains(&self, byte: u8) -> bool { |
491 | self.needles().iter().position(|&b| b == byte).is_some() |
492 | } |
493 | |
494 | /// Returns the accelerator bytes as an array of AccelTys. |
495 | #[cfg (feature = "dfa-build" )] |
496 | fn as_accel_tys(&self) -> [AccelTy; 2] { |
497 | assert_eq!(ACCEL_CAP, 8); |
498 | // These unwraps are OK since ACCEL_CAP is set to 8. |
499 | let first = |
500 | AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap()); |
501 | let second = |
502 | AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap()); |
503 | [first, second] |
504 | } |
505 | } |
506 | |
507 | impl core::fmt::Debug for Accel { |
508 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
509 | write!(f, "Accel(" )?; |
510 | let mut set = f.debug_set(); |
511 | for &b in self.needles() { |
512 | set.entry(&crate::util::escape::DebugByte(b)); |
513 | } |
514 | set.finish()?; |
515 | write!(f, ")" ) |
516 | } |
517 | } |
518 | |