special.rs source code [crates/regex-automata-0.2.0/src/dfa/special.rs]

1	use crate::{
2	dfa::DEAD,
3	util::{
4	bytes::{self, DeserializeError, Endian, SerializeError},
5	id::StateID,
6	},
7	};
8
9	macro_rules! err {
10	($msg:expr) => {
11	return Err(DeserializeError::generic($msg));
12	};
13	}
14
15	// Special represents the identifiers in a DFA that correspond to "special"
16	// states. If a state is one or more of the following, then it is considered
17	// special:
18	//
19	// dead - A non-matching state where all outgoing transitions lead back to*
20	// itself. There is only one of these, regardless of whether minimization
21	// has run. The dead state always has an ID of 0. i.e., It is always the
22	// first state in a DFA.
23	// quit - A state that is entered whenever a byte is seen that should cause*
24	// a DFA to give up and stop searching. This results in a MatchError::Quit
25	// error being returned at search time. The default configuration for a DFA
26	// has no quit bytes, which means this state is unreachable by default,
27	// although it is always present for reasons of implementation simplicity.
28	// This state is only reachable when the caller configures the DFA to quit
29	// on certain bytes. There is always exactly one of these states and it
30	// is always the second state. (Its actual ID depends on the size of the
31	// alphabet in dense DFAs, since state IDs are premultiplied in order to
32	// allow them to be used directly as indices into the transition table.)
33	// match - An accepting state, i.e., indicative of a match. There may be*
34	// zero or more of these states.
35	// accelerated - A state where all of its outgoing transitions, except a*
36	// few, loop back to itself. These states are candidates for acceleration
37	// via memchr during search. There may be zero or more of these states.
38	// start - A non-matching state that indicates where the automaton should*
39	// start during a search. There is always at least one starting state and
40	// all are guaranteed to be non-match states. (A start state cannot be a
41	// match state because the DFAs in this crate delay all matches by one byte.
42	// So every search that finds a match must move through one transition to
43	// some other match state, even when searching an empty string.)
44	//
45	// These are not mutually exclusive categories. Namely, the following
46	// overlappings can occur:
47	//
48	// {dead, start} - If a DFA can never lead to a match and it is minimized,*
49	// then it will typically compile to something where all starting IDs point
50	// to the DFA's dead state.
51	// {match, accelerated} - It is possible for a match state to have the*
52	// majority of its transitions loop back to itself, which means it's
53	// possible for a match state to be accelerated.
54	// {start, accelerated} - Similarly, it is possible for a start state to be*
55	// accelerated. Note that it is possible for an accelerated state to be
56	// neither a match or a start state. Also note that just because both match
57	// and start states overlap with accelerated states does not mean that
58	// match and start states overlap with each other. In fact, they are
59	// guaranteed not to overlap.
60	//
61	// As a special mention, every DFA always has a dead and a quit state, even
62	// though from the perspective of the DFA, they are equivalent. (Indeed,
63	// minimization special cases them to ensure they don't get merged.) The
64	// purpose of keeping them distinct is to use the quit state as a sentinel to
65	// distguish between whether a search finished successfully without finding
66	// anything or whether it gave up before finishing.
67	//
68	// So the main problem we want to solve here is the fast* detection of whether*
69	// a state is special or not. And we also want to do this while storing as
70	// little extra data as possible. AND we want to be able to quickly determine
71	// which categories a state falls into above if it is special.
72	//
73	// We achieve this by essentially shuffling all special states to the beginning
74	// of a DFA. That is, all special states appear before every other non-special
75	// state. By representing special states this way, we can determine whether a
76	// state is special or not by a single comparison, where special.max is the
77	// identifier of the last special state in the DFA:
78	//
79	// if current_state <= special.max:
80	// ... do something with special state
81	//
82	// The only thing left to do is to determine what kind of special state
83	// it is. Because what we do next depends on that. Since special states
84	// are typically rare, we can afford to do a bit more extra work, but we'd
85	// still like this to be as fast as possible. The trick we employ here is to
86	// continue shuffling states even within the special state range. Such that
87	// one contiguous region corresponds to match states, another for start states
88	// and then an overlapping range for accelerated states. At a high level, our
89	// special state detection might look like this (for leftmost searching, where
90	// we continue searching even after seeing a match):
91	//
92	// byte = input[offset]
93	// current_state = next_state(current_state, byte)
94	// offset += 1
95	// if current_state <= special.max:
96	// if current_state == 0:
97	// # We can never leave a dead state, so this always marks the
98	// # end of our search.
99	// return last_match
100	// if current_state == special.quit_id:
101	// # A quit state means we give up. If he DFA has no quit state,
102	// # then special.quit_id == 0 == dead, which is handled by the
103	// # conditional above.
104	// return Err(MatchError::Quit { byte, offset: offset - 1 })
105	// if special.min_match <= current_state <= special.max_match:
106	// last_match = Some(offset)
107	// if special.min_accel <= current_state <= special.max_accel:
108	// offset = accelerate(input, offset)
109	// last_match = Some(offset)
110	// elif special.min_start <= current_state <= special.max_start:
111	// offset = prefilter.find(input, offset)
112	// if special.min_accel <= current_state <= special.max_accel:
113	// offset = accelerate(input, offset)
114	// elif special.min_accel <= current_state <= special.max_accel:
115	// offset = accelerate(input, offset)
116	//
117	// There are some small details left out of the logic above. For example,
118	// in order to accelerate a state, we need to know which bytes to search for.
119	// This in turn implies some extra data we need to store in the DFA. To keep
120	// things compact, we would ideally only store
121	//
122	// N = special.max_accel - special.min_accel + 1
123	//
124	// items. But state IDs are premultiplied, which means they are not contiguous.
125	// So in order to take a state ID and index an array of accelerated structures,
126	// we need to do:
127	//
128	// i = (state_id - special.min_accel) / stride
129	//
130	// (N.B. 'stride' is always a power of 2, so the above can be implemented via
131	// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
132	// 2^x=stride.)
133	//
134	// Moreover, some of these specialty categories may be empty. For example,
135	// DFAs are not required to have any match states or any accelerated states.
136	// In that case, the lower and upper bounds are both set to 0 (the dead state
137	// ID) and the first `current_state == 0` check subsumes cases where the
138	// ranges are empty.
139	//
140	// Loop unrolling, if applicable, has also been left out of the logic above.
141	//
142	// Graphically, the ranges look like this, where asterisks indicate ranges
143	// that can be empty. Each 'x' is a state.
144	//
145	// quit
146	// dead\|
147	// \|\|
148	// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
149	// \| \| \| \| start \| \|
150	// \| \|-------------\| \|-------\| \|
151	// \| match \| \| \| \|*
152	// \| \| \| \| \|
153	// \| \|----------\| \| \|
154	// \| accel \| \|*
155	// \| \| \|
156	// \| \| \|
157	// \|----------------------------\|------------------------
158	// special non-special*
159	#[derive(Clone, Copy, Debug)]
160	pub struct Special {
161	/// The identifier of the last special state in a DFA. A state is special
162	/// if and only if its identifier is less than or equal to `max`.
163	pub max: StateID,
164	/// The identifier of the quit state in a DFA. (There is no analogous field
165	/// for the dead state since the dead state's ID is always zero, regardless
166	/// of state ID size.)
167	pub quit_id: StateID,
168	/// The identifier of the first match state.
169	pub min_match: StateID,
170	/// The identifier of the last match state.
171	pub max_match: StateID,
172	/// The identifier of the first accelerated state.
173	pub min_accel: StateID,
174	/// The identifier of the last accelerated state.
175	pub max_accel: StateID,
176	/// The identifier of the first start state.
177	pub min_start: StateID,
178	/// The identifier of the last start state.
179	pub max_start: StateID,
180	}
181
182	impl Special {
183	/// Creates a new set of special ranges for a DFA. All ranges are initially
184	/// set to only contain the dead state. This is interpreted as an empty
185	/// range.
186	#[cfg(feature = "alloc")]
187	pub fn new() -> Special {
188	Special {
189	max: DEAD,
190	quit_id: DEAD,
191	min_match: DEAD,
192	max_match: DEAD,
193	min_accel: DEAD,
194	max_accel: DEAD,
195	min_start: DEAD,
196	max_start: DEAD,
197	}
198	}
199
200	/// Remaps all of the special state identifiers using the function given.
201	#[cfg(feature = "alloc")]
202	pub fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
203	Special {
204	max: map(self.max),
205	quit_id: map(self.quit_id),
206	min_match: map(self.min_match),
207	max_match: map(self.max_match),
208	min_accel: map(self.min_accel),
209	max_accel: map(self.max_accel),
210	min_start: map(self.min_start),
211	max_start: map(self.max_start),
212	}
213	}
214
215	/// Deserialize the given bytes into special state ranges. If the slice
216	/// given is not big enough, then this returns an error. Similarly, if
217	/// any of the expected invariants around special state ranges aren't
218	/// upheld, an error is returned. Note that this does not guarantee that
219	/// the information returned is correct.
220	///
221	/// Upon success, this returns the number of bytes read in addition to the
222	/// special state IDs themselves.
223	pub fn from_bytes(
224	mut slice: &[u8],
225	) -> Result<(Special, usize), DeserializeError> {
226	bytes::check_slice_len(slice, `8` * StateID::SIZE, "special states")?;
227
228	let mut nread = `0`;
229	let mut read_id = \|what\| -> Result<StateID, DeserializeError> {
230	let (id, nr) = bytes::try_read_state_id(slice, what)?;
231	nread += nr;
232	slice = &slice[StateID::SIZE..];
233	Ok(id)
234	};
235
236	let max = read_id("special max id")?;
237	let quit_id = read_id("special quit id")?;
238	let min_match = read_id("special min match id")?;
239	let max_match = read_id("special max match id")?;
240	let min_accel = read_id("special min accel id")?;
241	let max_accel = read_id("special max accel id")?;
242	let min_start = read_id("special min start id")?;
243	let max_start = read_id("special max start id")?;
244
245	let special = Special {
246	max,
247	quit_id,
248	min_match,
249	max_match,
250	min_accel,
251	max_accel,
252	min_start,
253	max_start,
254	};
255	special.validate()?;
256	assert_eq!(nread, special.write_to_len());
257	Ok((special, nread))
258	}
259
260	/// Validate that the information describing special states satisfies
261	/// all known invariants.
262	pub fn validate(&self) -> Result<(), DeserializeError> {
263	// Check that both ends of the range are DEAD or neither are.
264	if self.min_match == DEAD && self.max_match != DEAD {
265	err!("min_match is DEAD, but max_match is not");
266	}
267	if self.min_match != DEAD && self.max_match == DEAD {
268	err!("max_match is DEAD, but min_match is not");
269	}
270	if self.min_accel == DEAD && self.max_accel != DEAD {
271	err!("min_accel is DEAD, but max_accel is not");
272	}
273	if self.min_accel != DEAD && self.max_accel == DEAD {
274	err!("max_accel is DEAD, but min_accel is not");
275	}
276	if self.min_start == DEAD && self.max_start != DEAD {
277	err!("min_start is DEAD, but max_start is not");
278	}
279	if self.min_start != DEAD && self.max_start == DEAD {
280	err!("max_start is DEAD, but min_start is not");
281	}
282
283	// Check that ranges are well formed.
284	if self.min_match > self.max_match {
285	err!("min_match should not be greater than max_match");
286	}
287	if self.min_accel > self.max_accel {
288	err!("min_accel should not be greater than max_accel");
289	}
290	if self.min_start > self.max_start {
291	err!("min_start should not be greater than max_start");
292	}
293
294	// Check that ranges are ordered with respect to one another.
295	if self.matches() && self.quit_id >= self.min_match {
296	err!("quit_id should not be greater than min_match");
297	}
298	if self.accels() && self.quit_id >= self.min_accel {
299	err!("quit_id should not be greater than min_accel");
300	}
301	if self.starts() && self.quit_id >= self.min_start {
302	err!("quit_id should not be greater than min_start");
303	}
304	if self.matches() && self.accels() && self.min_accel < self.min_match {
305	err!("min_match should not be greater than min_accel");
306	}
307	if self.matches() && self.starts() && self.min_start < self.min_match {
308	err!("min_match should not be greater than min_start");
309	}
310	if self.accels() && self.starts() && self.min_start < self.min_accel {
311	err!("min_accel should not be greater than min_start");
312	}
313
314	// Check that max is at least as big as everything else.
315	if self.max < self.quit_id {
316	err!("quit_id should not be greater than max");
317	}
318	if self.max < self.max_match {
319	err!("max_match should not be greater than max");
320	}
321	if self.max < self.max_accel {
322	err!("max_accel should not be greater than max");
323	}
324	if self.max < self.max_start {
325	err!("max_start should not be greater than max");
326	}
327
328	Ok(())
329	}
330
331	/// Validate that the special state information is compatible with the
332	/// given state count.
333	pub fn validate_state_count(
334	&self,
335	count: usize,
336	stride2: usize,
337	) -> Result<(), DeserializeError> {
338	// We assume that 'validate' has already passed, so we know that 'max'
339	// is truly the max. So all we need to check is that the max state
340	// ID is less than the state ID count. The max legal value here is
341	// count-1, which occurs when there are no non-special states.
342	if (self.max.as_usize() >> stride2) >= count {
343	err!("max should not be greater than or equal to state count");
344	}
345	Ok(())
346	}
347
348	/// Write the IDs and ranges for special states to the given byte buffer.
349	/// The buffer given must have enough room to store all data, otherwise
350	/// this will return an error. The number of bytes written is returned
351	/// on success. The number of bytes written is guaranteed to be a multiple
352	/// of 8.
353	pub fn write_to<E: Endian>(
354	&self,
355	dst: &mut [u8],
356	) -> Result<usize, SerializeError> {
357	use crate::util::bytes::write_state_id as write;
358
359	if dst.len() < self.write_to_len() {
360	return Err(SerializeError::buffer_too_small("special state ids"));
361	}
362
363	let mut nwrite = `0`;
364	nwrite += write::<E>(self.max, &mut dst[nwrite..]);
365	nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
366	nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
367	nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
368	nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
369	nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
370	nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
371	nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
372
373	assert_eq!(
374	self.write_to_len(),
375	nwrite,
376	"expected to write certain number of bytes",
377	);
378	assert_eq!(
379	nwrite % `8`,
380	`0`,
381	"expected to write multiple of 8 bytes for special states",
382	);
383	Ok(nwrite)
384	}
385
386	/// Returns the total number of bytes written by `write_to`.
387	pub fn write_to_len(&self) -> usize {
388	`8` * StateID::SIZE
389	}
390
391	/// Sets the maximum special state ID based on the current values. This
392	/// should be used once all possible state IDs are set.
393	#[cfg(feature = "alloc")]
394	pub fn set_max(&mut self) {
395	use core::cmp::max;
396	self.max = max(
397	self.quit_id,
398	max(self.max_match, max(self.max_accel, self.max_start)),
399	);
400	}
401
402	/// Returns true if and only if the given state ID is a special state.
403	#[inline]
404	pub fn is_special_state(&self, id: StateID) -> bool {
405	id <= self.max
406	}
407
408	/// Returns true if and only if the given state ID is a dead state.
409	#[inline]
410	pub fn is_dead_state(&self, id: StateID) -> bool {
411	id == DEAD
412	}
413
414	/// Returns true if and only if the given state ID is a quit state.
415	#[inline]
416	pub fn is_quit_state(&self, id: StateID) -> bool {
417	!self.is_dead_state(id) && self.quit_id == id
418	}
419
420	/// Returns true if and only if the given state ID is a match state.
421	#[inline]
422	pub fn is_match_state(&self, id: StateID) -> bool {
423	!self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
424	}
425
426	/// Returns true if and only if the given state ID is an accel state.
427	#[inline]
428	pub fn is_accel_state(&self, id: StateID) -> bool {
429	!self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
430	}
431
432	/// Returns true if and only if the given state ID is a start state.
433	#[inline]
434	pub fn is_start_state(&self, id: StateID) -> bool {
435	!self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
436	}
437
438	/// Returns the total number of match states for a dense table based DFA.
439	#[inline]
440	pub fn match_len(&self, stride: usize) -> usize {
441	if self.matches() {
442	(self.max_match.as_usize() - self.min_match.as_usize() + stride)
443	/ stride
444	} else {
445	`0`
446	}
447	}
448
449	/// Returns true if and only if there is at least one match state.
450	#[inline]
451	pub fn matches(&self) -> bool {
452	self.min_match != DEAD
453	}
454
455	/// Returns the total number of accel states.
456	#[cfg(feature = "alloc")]
457	pub fn accel_len(&self, stride: usize) -> usize {
458	if self.accels() {
459	(self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
460	/ stride
461	} else {
462	`0`
463	}
464	}
465
466	/// Returns true if and only if there is at least one accel state.
467	#[inline]
468	pub fn accels(&self) -> bool {
469	self.min_accel != DEAD
470	}
471
472	/// Returns true if and only if there is at least one start state.
473	#[inline]
474	pub fn starts(&self) -> bool {
475	self.min_start != DEAD
476	}
477	}
478