accel.rs - Codebrowser

// This module defines some core types for dealing with accelerated DFA states.
// Briefly, a DFA state can be "accelerated" if all of its transitions except
// for a few loop back to itself. This directly implies that the only way out
// of such a state is if a byte corresponding to one of those non-loopback
// transitions is found. Such states are often found in simple repetitions in
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
// DFA with regex-cli:
//
//     $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
//     D 000000:
//     Q 000001:
//      *000002:
//     A 000003: \x00-` => 3, a => 8, b-\xFF => 3
//     A 000004: \x00-` => 4, a => 7, b-\xFF => 4
//       000005: \x00-` => 4, b-\xFF => 4
//       000006: \x00-` => 3, a => 6, b-\xFF => 3
//       000007: \x00-\xFF => 2, EOI => 2
//       000008: \x00-\xFF => 2, EOI => 2
//
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
// the only way to leave that state once entered is to see an 'a' byte. If
// there is a long run of non-'a' bytes, then using something like 'memchr'
// to find the next 'a' byte can be significantly faster than just using the
// standard byte-at-a-time state machine.
//
// Unfortunately, this optimization rarely applies when Unicode is enabled.
// For example, patterns like '[^a]' don't actually match any byte that isn't
// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
// 'a'. This makes the state machine much more complex---far beyond a single
// state---and removes the ability to easily accelerate it. (Because if the
// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
//
// In practice, we only consider accelerating states that have 3 or fewer
// non-loop transitions. At a certain point, you get diminishing returns, but
// also because that's what the memchr crate supports. The structures below
// hard-code this assumption and provide (de)serialization APIs for use inside
// a DFA.
//
// And finally, note that there is some trickery involved in making it very
// fast to not only check whether a state is accelerated at search time, but
// also to access the bytes to search for to implement the acceleration itself.
// dfa/special.rs provides more detail, but the short story is that all
// accelerated states appear contiguously in a DFA. This means we can represent
// the ID space of all accelerated DFA states with a single range. So given
// a state ID, we can determine whether it's accelerated via
//
//     min_accel_id <= id <= max_accel_id
//
// And find its corresponding accelerator with:
//
//     accels.get((id - min_accel_id) / dfa_stride)
52
53	#[cfg(feature = "dfa-build")]
54	use alloc::{vec, vec::Vec};
55
56	use crate::util::{
57	int::Pointer,
58	memchr,
59	wire::{self, DeserializeError, Endian, SerializeError},
60	};
61
/// The base type used to represent a collection of accelerators.
///
/// While a single `Accel` is represented as a fixed size array of bytes, a
/// *collection* of `Accel`s (called `Accels`) is represented internally as a
/// slice of `u32`. While it's a bit unnatural to do this and costs us a bit
/// of fairly low-risk `unsafe` code, it lets us remove the need for a second
/// type parameter in the definition of `dense::DFA`. (Which really wants
/// everything to be a slice of `u32`.)
type AccelTy = u32;

/// The size, in bytes, of the unit of representation for accelerators.
///
/// `ACCEL_CAP` *must* be a multiple of this size.
const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();

/// The maximum length in bytes that a single `Accel` can be: one length byte
/// followed by up to three needle bytes.
///
/// This is distinct from the capacity of an accelerator (`ACCEL_CAP`) in that
/// the length represents only the bytes that should be read.
const ACCEL_LEN: usize = 4;

/// The capacity of each accelerator, in bytes.
///
/// We set this to 8 since it's a multiple of 4 (our ID size) and because it
/// gives us a little wiggle room if we want to support more accel bytes in
/// the future without a breaking change.
///
/// This MUST be a multiple of `ACCEL_TY_SIZE`.
const ACCEL_CAP: usize = 8;
89
90	/// Search for between 1 and 3 needle bytes in the given haystack, starting the
91	/// search at the given position. If `needles` has a length other than 1-3,
92	/// then this panics.
93	#[cfg_attr(feature = "perf-inline", inline(always))]
94	pub(crate) fn find_fwd(
95	needles: &[u8],
96	haystack: &[u8],
97	at: usize,
98	) -> Option<usize> {
99	let bs = needles;
100	let i = match needles.len() {
101	`1` => memchr::memchr(bs[`0`], &haystack[at..])?,
102	`2` => memchr::memchr2(bs[`0`], bs[`1`], &haystack[at..])?,
103	`3` => memchr::memchr3(bs[`0`], bs[`1`], bs[`2`], &haystack[at..])?,
104	`0` => panic!("cannot find with empty needles"),
105	n => panic!("invalid needles length: {}", n),
106	};
107	Some(at + i)
108	}
109
110	/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
111	/// starting the search at the given position. If `needles` has a length other
112	/// than 1-3, then this panics.
113	#[cfg_attr(feature = "perf-inline", inline(always))]
114	pub(crate) fn find_rev(
115	needles: &[u8],
116	haystack: &[u8],
117	at: usize,
118	) -> Option<usize> {
119	let bs = needles;
120	match needles.len() {
121	`1` => memchr::memrchr(bs[`0`], &haystack[..at]),
122	`2` => memchr::memrchr2(bs[`0`], bs[`1`], &haystack[..at]),
123	`3` => memchr::memrchr3(bs[`0`], bs[`1`], bs[`2`], &haystack[..at]),
124	`0` => panic!("cannot find with empty needles"),
125	n => panic!("invalid needles length: {}", n),
126	}
127	}
128
/// Represents the accelerators for all accelerated states in a dense DFA.
///
/// The `A` type parameter represents the type of the underlying storage.
/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
#[derive(Clone)]
pub(crate) struct Accels<A> {
    /// A length prefixed slice of contiguous accelerators. See the top
    /// comment in this module for more details on how we can jump from a
    /// DFA's state ID to an accelerator in this list.
    ///
    /// The first element (i.e., the first 4 bytes) always holds the number
    /// of accelerators that follow.
    accels: A,
}
143
#[cfg(feature = "dfa-build")]
impl Accels<Vec<AccelTy>> {
    /// Create an empty sequence of accelerators for a DFA.
    ///
    /// The storage consists solely of the length prefix, set to zero.
    pub fn empty() -> Accels<Vec<AccelTy>> {
        Accels { accels: vec![0] }
    }

    /// Add an accelerator to this sequence.
    ///
    /// This adds the accelerator to the end of the sequence and therefore
    /// should be done in correspondence with its state in the DFA.
    ///
    /// # Panics
    ///
    /// This panics if this results in more accelerators than `AccelTy::MAX`.
    pub fn add(&mut self, accel: Accel) {
        // Read the current count before growing the storage; the count lives
        // in the first element and is unaffected by appending.
        let next_len = self.len() + 1;
        self.accels.extend_from_slice(&accel.as_accel_tys());
        self.set_len(next_len);
    }

    /// Set the number of accelerators in this sequence, which is encoded in
    /// the first element (the first 4 bytes) of the underlying storage.
    fn set_len(&mut self, new_len: usize) {
        // The only way an accelerator gets added is if a state exists for
        // it, and if a state exists, then its index is guaranteed to be
        // representable by an AccelTy by virtue of the guarantees provided
        // by StateID.
        let new_len = AccelTy::try_from(new_len).unwrap();
        self.accels[0] = new_len;
    }
}
174
175	impl<'a> Accels<&'a [AccelTy]> {
176	/// Deserialize a sequence of accelerators from the given bytes. If there
177	/// was a problem deserializing, then an error is returned.
178	///
179	/// This is guaranteed to run in constant time. This does not guarantee
180	/// that every accelerator in the returned collection is valid. Thus,
181	/// accessing one may panic, or not-safe code that relies on accelerators
182	/// being correct my result in UB.
183	///
184	/// Callers may check the validity of every accelerator with the `validate`
185	/// method.
186	pub fn from_bytes_unchecked(
187	mut slice: &'a [u8],
188	) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
189	let slice_start = slice.as_ptr().as_usize();
190
191	let (accel_len, _) =
192	wire::try_read_u32_as_usize(slice, "accelerators length")?;
193	// The accelerator length is part of the accel_tys slice that
194	// we deserialize. This is perhaps a bit idiosyncratic. It would
195	// probably be better to split out the length into a real field.
196
197	let accel_tys_len = wire::add(
198	wire::mul(accel_len, `2`, "total number of accelerator accel_tys")?,
199	`1`,
200	"total number of accel_tys",
201	)?;
202	let accel_tys_bytes_len = wire::mul(
203	ACCEL_TY_SIZE,
204	accel_tys_len,
205	"total number of bytes in accelerators",
206	)?;
207	wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
208	wire::check_alignment::<AccelTy>(slice)?;
209	let accel_tys = &slice[..accel_tys_bytes_len];
210	slice = &slice[accel_tys_bytes_len..];
211	// SAFETY: We've checked the length and alignment above, and since
212	// slice is just bytes and AccelTy is just a u32, we can safely cast to
213	// a slice of &[AccelTy].
214	let accels = unsafe {
215	core::slice::from_raw_parts(
216	accel_tys.as_ptr().cast::<AccelTy>(),
217	accel_tys_len,
218	)
219	};
220	Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
221	}
222	}
223
224	impl<A: AsRef<[AccelTy]>> Accels<A> {
225	/// Return an owned version of the accelerators.
226	#[cfg(feature = "alloc")]
227	pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
228	Accels { accels: self.accels.as_ref().to_vec() }
229	}
230
231	/// Return a borrowed version of the accelerators.
232	pub fn as_ref(&self) -> Accels<&[AccelTy]> {
233	Accels { accels: self.accels.as_ref() }
234	}
235
236	/// Return the bytes representing the serialization of the accelerators.
237	pub fn as_bytes(&self) -> &[u8] {
238	let accels = self.accels.as_ref();
239	// SAFETY: This is safe because accels is a just a slice of AccelTy,
240	// and u8 always has a smaller alignment.
241	unsafe {
242	core::slice::from_raw_parts(
243	accels.as_ptr().cast::<u8>(),
244	accels.len() * ACCEL_TY_SIZE,
245	)
246	}
247	}
248
249	/// Returns the memory usage, in bytes, of these accelerators.
250	///
251	/// The memory usage is computed based on the number of bytes used to
252	/// represent all of the accelerators.
253	///
254	/// This does not* include the stack size used by this value.*
255	pub fn memory_usage(&self) -> usize {
256	self.as_bytes().len()
257	}
258
259	/// Return the bytes to search for corresponding to the accelerator in this
260	/// sequence at index `i`. If no such accelerator exists, then this panics.
261	///
262	/// The significance of the index is that it should be in correspondence
263	/// with the index of the corresponding DFA. That is, accelerated DFA
264	/// states are stored contiguously in the DFA and have an ordering implied
265	/// by their respective state IDs. The state's index in that sequence
266	/// corresponds to the index of its corresponding accelerator.
267	#[cfg_attr(feature = "perf-inline", inline(always))]
268	pub fn needles(&self, i: usize) -> &[u8] {
269	if i >= self.len() {
270	panic!("invalid accelerator index {}", i);
271	}
272	let bytes = self.as_bytes();
273	let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
274	let len = usize::from(bytes[offset]);
275	&bytes[offset + `1`..offset + `1` + len]
276	}
277
278	/// Return the total number of accelerators in this sequence.
279	pub fn len(&self) -> usize {
280	// This should never panic since deserialization checks that the
281	// length can fit into a usize.
282	usize::try_from(self.accels.as_ref()[`0`]).unwrap()
283	}
284
285	/// Return the accelerator in this sequence at index `i`. If no such
286	/// accelerator exists, then this returns None.
287	///
288	/// See the docs for `needles` on the significance of the index.
289	fn get(&self, i: usize) -> Option<Accel> {
290	if i >= self.len() {
291	return None;
292	}
293	let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
294	let accel = Accel::from_slice(&self.as_bytes()[offset..])
295	.expect("Accels must contain valid accelerators");
296	Some(accel)
297	}
298
299	/// Returns an iterator of accelerators in this sequence.
300	fn iter(&self) -> IterAccels<'_, A> {
301	IterAccels { accels: self, i: `0` }
302	}
303
304	/// Writes these accelerators to the given byte buffer using the indicated
305	/// endianness. If the given buffer is too small, then an error is
306	/// returned. Upon success, the total number of bytes written is returned.
307	/// The number of bytes written is guaranteed to be a multiple of 8.
308	pub fn write_to<E: Endian>(
309	&self,
310	dst: &mut [u8],
311	) -> Result<usize, SerializeError> {
312	let nwrite = self.write_to_len();
313	assert_eq!(
314	nwrite % ACCEL_TY_SIZE,
315	`0`,
316	"expected accelerator bytes written to be a multiple of {}",
317	ACCEL_TY_SIZE,
318	);
319	if dst.len() < nwrite {
320	return Err(SerializeError::buffer_too_small("accelerators"));
321	}
322
323	// The number of accelerators can never exceed AccelTy::MAX.
324	E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
325	// The actual accelerators are just raw bytes and thus their endianness
326	// is irrelevant. So we can copy them as bytes.
327	dst[ACCEL_TY_SIZE..nwrite]
328	.copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
329	Ok(nwrite)
330	}
331
332	/// Validates that every accelerator in this collection can be successfully
333	/// deserialized as a valid accelerator.
334	pub fn validate(&self) -> Result<(), DeserializeError> {
335	for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
336	let _ = Accel::from_slice(chunk)?;
337	}
338	Ok(())
339	}
340
341	/// Returns the total number of bytes written by `write_to`.
342	pub fn write_to_len(&self) -> usize {
343	self.as_bytes().len()
344	}
345	}
346
347	impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
348	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
349	write!(f, "Accels(")?;
350	let mut list = f.debug_list();
351	for a in self.iter() {
352	list.entry(&a);
353	}
354	list.finish()?;
355	write!(f, ")")
356	}
357	}
358
359	#[derive(Debug)]
360	struct IterAccels<'a, A: AsRef<[AccelTy]>> {
361	accels: &'a Accels<A>,
362	i: usize,
363	}
364
365	impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
366	type Item = Accel;
367
368	fn next(&mut self) -> Option<Accel> {
369	let accel = self.accels.get(self.i)?;
370	self.i += `1`;
371	Some(accel)
372	}
373	}
374
375	/// Accel represents a structure for determining how to "accelerate" a DFA
376	/// state.
377	///
378	/// Namely, it contains zero or more bytes that must be seen in order for the
379	/// DFA to leave the state it is associated with. In practice, the actual range
380	/// is 1 to 3 bytes.
381	///
382	/// The purpose of acceleration is to identify states whose vast majority
383	/// of transitions are just loops back to the same state. For example,
384	/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
385	/// (corresponding to `[^a]+`) where all transitions except* for `a` and*
386	/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
387	/// looking for the next occurrence of either `a` or `b` instead of explicitly
388	/// following transitions. (In this case, `b` transitions to the next state
389	/// where as `a` would transition to the dead state.)
390	#[derive(Clone)]
391	pub(crate) struct Accel {
392	/// The first byte is the length. Subsequent bytes are the accelerated
393	/// bytes.
394	///
395	/// Note that we make every accelerator 8 bytes as a slightly wasteful
396	/// way of making sure alignment is always correct for state ID sizes of
397	/// 1, 2, 4 and 8. This should be okay since accelerated states aren't
398	/// particularly common, especially when Unicode is enabled.
399	bytes: [u8; ACCEL_CAP],
400	}
401
402	impl Accel {
403	/// Returns an empty accel, where no bytes are accelerated.
404	#[cfg(feature = "dfa-build")]
405	pub fn new() -> Accel {
406	Accel { bytes: [`0`; ACCEL_CAP] }
407	}
408
409	/// Returns a verified accelerator derived from the beginning of the given
410	/// slice.
411	///
412	/// If the slice is not long enough or contains invalid bytes for an
413	/// accelerator, then this returns an error.
414	pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
415	slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
416	let bytes = slice
417	.try_into()
418	.map_err(\|_\| DeserializeError::buffer_too_small("accelerator"))?;
419	Accel::from_bytes(bytes)
420	}
421
422	/// Returns a verified accelerator derived from raw bytes.
423	///
424	/// If the given bytes are invalid, then this returns an error.
425	fn from_bytes(bytes: [u8; `4`]) -> Result<Accel, DeserializeError> {
426	if usize::from(bytes[`0`]) >= ACCEL_LEN {
427	return Err(DeserializeError::generic(
428	"accelerator bytes cannot have length more than 3",
429	));
430	}
431	Ok(Accel::from_bytes_unchecked(bytes))
432	}
433
434	/// Returns an accelerator derived from raw bytes.
435	///
436	/// This does not check whether the given bytes are valid. Invalid bytes
437	/// cannot sacrifice memory safety, but may result in panics or silent
438	/// logic bugs.
439	fn from_bytes_unchecked(bytes: [u8; `4`]) -> Accel {
440	Accel { bytes: [bytes[`0`], bytes[`1`], bytes[`2`], bytes[`3`], `0`, `0`, `0`, `0`] }
441	}
442
443	/// Attempts to add the given byte to this accelerator. If the accelerator
444	/// is already full or thinks the byte is a poor accelerator, then this
445	/// returns false. Otherwise, returns true.
446	///
447	/// If the given byte is already in this accelerator, then it panics.
448	#[cfg(feature = "dfa-build")]
449	pub fn add(&mut self, byte: u8) -> bool {
450	if self.len() >= `3` {
451	return `false`;
452	}
453	// As a special case, we totally reject trying to accelerate a state
454	// with an ASCII space. In most cases, it occurs very frequently, and
455	// tends to result in worse overall performance.
456	if byte == b' ' {
457	return `false`;
458	}
459	assert!(
460	!self.contains(byte),
461	"accelerator already contains {:?}",
462	crate::util::escape::DebugByte(byte)
463	);
464	self.bytes[self.len() + `1`] = byte;
465	self.bytes[`0`] += `1`;
466	`true`
467	}
468
469	/// Return the number of bytes in this accelerator.
470	pub fn len(&self) -> usize {
471	usize::from(self.bytes[`0`])
472	}
473
474	/// Returns true if and only if there are no bytes in this accelerator.
475	#[cfg(feature = "dfa-build")]
476	pub fn is_empty(&self) -> bool {
477	self.len() == `0`
478	}
479
480	/// Returns the slice of bytes to accelerate.
481	///
482	/// If this accelerator is empty, then this returns an empty slice.
483	fn needles(&self) -> &[u8] {
484	&self.bytes[`1`..`1` + self.len()]
485	}
486
487	/// Returns true if and only if this accelerator will accelerate the given
488	/// byte.
489	#[cfg(feature = "dfa-build")]
490	fn contains(&self, byte: u8) -> bool {
491	self.needles().iter().position(\|&b\| b == byte).is_some()
492	}
493
494	/// Returns the accelerator bytes as an array of AccelTys.
495	#[cfg(feature = "dfa-build")]
496	fn as_accel_tys(&self) -> [AccelTy; `2`] {
497	assert_eq!(ACCEL_CAP, `8`);
498	// These unwraps are OK since ACCEL_CAP is set to 8.
499	let first =
500	AccelTy::from_ne_bytes(self.bytes[`0`..`4`].try_into().unwrap());
501	let second =
502	AccelTy::from_ne_bytes(self.bytes[`4`..`8`].try_into().unwrap());
503	[first, second]
504	}
505	}
506
507	impl core::fmt::Debug for Accel {
508	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
509	write!(f, "Accel(")?;
510	let mut set = f.debug_set();
511	for &b in self.needles() {
512	set.entry(&crate::util::escape::DebugByte(b));
513	}
514	set.finish()?;
515	write!(f, ")")
516	}
517	}
518