1 | /*! |
2 | Provides direct access to a DFA implementation of Aho-Corasick. |
3 | |
4 | This is a low-level API that generally only needs to be used in niche |
5 | circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) |
instead of a DFA directly. Using a `DFA` directly is typically only necessary
7 | when one needs access to the [`Automaton`] trait implementation. |
8 | */ |
9 | |
10 | use alloc::{vec, vec::Vec}; |
11 | |
12 | use crate::{ |
13 | automaton::Automaton, |
14 | nfa::noncontiguous, |
15 | util::{ |
16 | alphabet::ByteClasses, |
17 | error::{BuildError, MatchError}, |
18 | int::{Usize, U32}, |
19 | prefilter::Prefilter, |
20 | primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, |
21 | search::{Anchored, MatchKind, StartKind}, |
22 | special::Special, |
23 | }, |
24 | }; |
25 | |
26 | /// A DFA implementation of Aho-Corasick. |
27 | /// |
28 | /// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of |
29 | /// this type directly. Using a `DFA` directly is typically only necessary when |
30 | /// one needs access to the [`Automaton`] trait implementation. |
31 | /// |
32 | /// This DFA can only be built by first constructing a [`noncontiguous::NFA`]. |
33 | /// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but |
34 | /// [`Builder::build_from_noncontiguous`] permits doing it explicitly. |
35 | /// |
36 | /// A DFA provides the best possible search performance (in this crate) via two |
37 | /// mechanisms: |
38 | /// |
39 | /// * All states use a dense representation for their transitions. |
40 | /// * All failure transitions are pre-computed such that they are never |
41 | /// explicitly handled at search time. |
42 | /// |
43 | /// These two facts combined mean that every state transition is performed |
44 | /// using a constant number of instructions. However, this comes at |
45 | /// great cost. The memory usage of a DFA can be quite exorbitant. |
46 | /// It is potentially multiple orders of magnitude greater than a |
47 | /// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange, |
48 | /// a DFA will typically have better search speed than a `contiguous::NFA`, but |
49 | /// not by orders of magnitude. |
50 | /// |
51 | /// Unless you have a small number of patterns or memory usage is not a concern |
52 | /// and search performance is critical, a DFA is usually not the best choice. |
53 | /// |
54 | /// Moreover, unlike the NFAs in this crate, it is costly for a DFA to |
/// support both anchored and unanchored search configurations. Namely,
56 | /// since failure transitions are pre-computed, supporting both anchored |
57 | /// and unanchored searches requires a duplication of the transition table, |
/// making the memory usage of such a DFA even bigger. (The NFAs in this crate
59 | /// unconditionally support both anchored and unanchored searches because there |
60 | /// is essentially no added cost for doing so.) It is for this reason that |
61 | /// a DFA's support for anchored and unanchored searches can be configured |
62 | /// via [`Builder::start_kind`]. By default, a DFA only supports unanchored |
63 | /// searches. |
64 | /// |
65 | /// # Example |
66 | /// |
/// This example shows how to build a `DFA` directly and use it to execute
68 | /// [`Automaton::try_find`]: |
69 | /// |
70 | /// ``` |
71 | /// use aho_corasick::{ |
72 | /// automaton::Automaton, |
73 | /// dfa::DFA, |
74 | /// Input, Match, |
75 | /// }; |
76 | /// |
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
79 | /// |
/// let dfa = DFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     dfa.try_find(&Input::new(haystack))?,
84 | /// ); |
85 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
86 | /// ``` |
87 | /// |
88 | /// It is also possible to implement your own version of `try_find`. See the |
89 | /// [`Automaton`] documentation for an example. |
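///
/// # Example: anchored and unanchored searches
///
/// A DFA only supports unanchored searches by default, so supporting both
/// kinds requires opting in via [`Builder::start_kind`]. The following is a
/// minimal sketch of that configuration; it assumes the crate-root re-exports
/// of `Anchored`, `Input`, `Match` and `StartKind`:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     dfa::DFA,
///     Anchored, Input, Match, StartKind,
/// };
///
/// let dfa = DFA::builder()
///     .start_kind(StartKind::Both)
///     .build(&["b", "abc", "abcd"])
///     .unwrap();
///
/// // An unanchored search may find a match anywhere in the haystack...
/// let input = Input::new("foo abcd");
/// assert_eq!(Some(Match::must(0, 5..6)), dfa.try_find(&input)?);
/// // ...while an anchored search must begin matching at the first byte.
/// let input = Input::new("foo abcd").anchored(Anchored::Yes);
/// assert_eq!(None, dfa.try_find(&input)?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```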
#[derive(Clone)]
91 | pub struct DFA { |
92 | /// The DFA transition table. IDs in this table are pre-multiplied. So |
93 | /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride, |
94 | /// 2*stride, 3*stride, ... |
95 | trans: Vec<StateID>, |
96 | /// The matches for every match state in this DFA. This is indexed by order |
97 | /// of match states in the DFA. Namely, as constructed, match states are |
98 | /// always laid out sequentially and contiguously in memory. Thus, after |
99 | /// converting a match state ID to a match state index, the indices are |
100 | /// all adjacent. |
101 | /// |
102 | /// More concretely, when a search enters a match state with id 'sid', then |
103 | /// the matching patterns are at 'matches[(sid >> stride2) - 2]'. The '- 2' |
104 | /// is to offset the first two states of a DFA: the dead and fail states. |
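    ///
    /// For example, if stride2 is 3 (so stride = 8), then the match state
    /// with id 24 has state index 24 >> 3 = 3, and its pattern IDs live at
    /// 'matches[3 - 2] = matches[1]'.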
105 | matches: Vec<Vec<PatternID>>, |
106 | /// The amount of heap memory used, in bytes, by the inner Vecs of |
107 | /// 'matches'. |
108 | matches_memory_usage: usize, |
109 | /// The length of each pattern. This is used to compute the start offset |
110 | /// of a match. |
111 | pattern_lens: Vec<SmallIndex>, |
112 | /// A prefilter for accelerating searches, if one exists. |
113 | prefilter: Option<Prefilter>, |
114 | /// The match semantics built into this DFA. |
115 | match_kind: MatchKind, |
116 | /// The total number of states in this DFA. |
117 | state_len: usize, |
118 | /// The alphabet size, or total number of equivalence classes, for this |
119 | /// DFA. Note that the actual number of transitions in each state is |
120 | /// stride=2^stride2, where stride is the smallest power of 2 greater than |
121 | /// or equal to alphabet_len. We do things this way so that we can use |
122 | /// bitshifting to go from a state ID to an index into 'matches'. |
123 | alphabet_len: usize, |
124 | /// The exponent with a base 2, such that stride=2^stride2. Given a state |
125 | /// index 'i', its state identifier is 'i << stride2'. Given a state |
126 | /// identifier 'sid', its state index is 'sid >> stride2'. |
127 | stride2: usize, |
128 | /// The equivalence classes for this DFA. All transitions are defined on |
129 | /// equivalence classes and not on the 256 distinct byte values. |
130 | byte_classes: ByteClasses, |
131 | /// The length of the shortest pattern in this automaton. |
132 | min_pattern_len: usize, |
133 | /// The length of the longest pattern in this automaton. |
134 | max_pattern_len: usize, |
135 | /// The information required to deduce which states are "special" in this |
136 | /// DFA. |
137 | special: Special, |
138 | } |
139 | |
140 | impl DFA { |
141 | /// Create a new Aho-Corasick DFA using the default configuration. |
142 | /// |
143 | /// Use a [`Builder`] if you want to change the configuration. |
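    ///
    /// # Example
    ///
    /// A minimal sketch of building and searching with the default
    /// configuration; it assumes the crate-root re-exports of `Input` and
    /// `Match`. The default (standard) match semantics report the first
    /// match found, which is why `sam` wins here:
    ///
    /// ```
    /// use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match};
    ///
    /// let dfa = DFA::new(&["samwise", "sam"]).unwrap();
    /// assert_eq!(
    ///     Some(Match::must(1, 0..3)),
    ///     dfa.try_find(&Input::new("samwise"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```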
144 | pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError> |
145 | where |
146 | I: IntoIterator<Item = P>, |
147 | P: AsRef<[u8]>, |
148 | { |
149 | DFA::builder().build(patterns) |
150 | } |
151 | |
152 | /// A convenience method for returning a new Aho-Corasick DFA builder. |
153 | /// |
154 | /// This usually permits one to just import the `DFA` type. |
155 | pub fn builder() -> Builder { |
156 | Builder::new() |
157 | } |
158 | } |
159 | |
160 | impl DFA { |
161 | /// A sentinel state ID indicating that a search should stop once it has |
162 | /// entered this state. When a search stops, it returns a match if one has |
163 | /// been found, otherwise no match. A DFA always has an actual dead state |
164 | /// at this ID. |
165 | /// |
166 | /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state. |
167 | /// Namely, the whole point of a DFA is that the FAIL state is completely |
168 | /// compiled away. That is, DFA construction involves pre-computing the |
169 | /// failure transitions everywhere, such that failure transitions are no |
    /// longer used at search time. This, combined with its uniformly dense
    /// representation, is the main reason why it's faster than the NFAs in
    /// this crate.
173 | const DEAD: StateID = StateID::new_unchecked(0); |
174 | |
175 | /// Adds the given pattern IDs as matches to the given state and also |
176 | /// records the added memory usage. |
177 | fn set_matches(&mut self, sid: StateID, pids: &[PatternID]) { |
178 | use core::mem::size_of; |
179 | |
        assert!(!pids.is_empty(), "match state must have non-empty pids");
181 | let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap(); |
182 | self.matches[index].extend_from_slice(pids); |
183 | self.matches_memory_usage += size_of::<PatternID>() * pids.len(); |
184 | } |
185 | } |
186 | |
187 | // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always |
188 | // returns a valid state ID given a valid state ID. We otherwise claim that |
189 | // all other methods are correct as well. |
190 | unsafe impl Automaton for DFA { |
    #[inline(always)]
192 | fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { |
193 | // Either of the start state IDs can be DEAD, in which case, support |
194 | // for that type of search is not provided by this DFA. Which start |
195 | // state IDs are inactive depends on the 'StartKind' configuration at |
196 | // DFA construction time. |
197 | match anchored { |
198 | Anchored::No => { |
199 | let start = self.special.start_unanchored_id; |
200 | if start == DFA::DEAD { |
201 | Err(MatchError::invalid_input_unanchored()) |
202 | } else { |
203 | Ok(start) |
204 | } |
205 | } |
206 | Anchored::Yes => { |
207 | let start = self.special.start_anchored_id; |
208 | if start == DFA::DEAD { |
209 | Err(MatchError::invalid_input_anchored()) |
210 | } else { |
211 | Ok(start) |
212 | } |
213 | } |
214 | } |
215 | } |
216 | |
    #[inline(always)]
218 | fn next_state( |
219 | &self, |
220 | _anchored: Anchored, |
221 | sid: StateID, |
222 | byte: u8, |
223 | ) -> StateID { |
224 | let class = self.byte_classes.get(byte); |
225 | self.trans[(sid.as_u32() + u32::from(class)).as_usize()] |
226 | } |
227 | |
    #[inline(always)]
229 | fn is_special(&self, sid: StateID) -> bool { |
230 | sid <= self.special.max_special_id |
231 | } |
232 | |
    #[inline(always)]
234 | fn is_dead(&self, sid: StateID) -> bool { |
235 | sid == DFA::DEAD |
236 | } |
237 | |
    #[inline(always)]
239 | fn is_match(&self, sid: StateID) -> bool { |
240 | !self.is_dead(sid) && sid <= self.special.max_match_id |
241 | } |
242 | |
    #[inline(always)]
244 | fn is_start(&self, sid: StateID) -> bool { |
245 | sid == self.special.start_unanchored_id |
246 | || sid == self.special.start_anchored_id |
247 | } |
248 | |
    #[inline(always)]
250 | fn match_kind(&self) -> MatchKind { |
251 | self.match_kind |
252 | } |
253 | |
    #[inline(always)]
255 | fn patterns_len(&self) -> usize { |
256 | self.pattern_lens.len() |
257 | } |
258 | |
    #[inline(always)]
260 | fn pattern_len(&self, pid: PatternID) -> usize { |
261 | self.pattern_lens[pid].as_usize() |
262 | } |
263 | |
    #[inline(always)]
265 | fn min_pattern_len(&self) -> usize { |
266 | self.min_pattern_len |
267 | } |
268 | |
    #[inline(always)]
270 | fn max_pattern_len(&self) -> usize { |
271 | self.max_pattern_len |
272 | } |
273 | |
    #[inline(always)]
275 | fn match_len(&self, sid: StateID) -> usize { |
276 | debug_assert!(self.is_match(sid)); |
277 | let offset = (sid.as_usize() >> self.stride2) - 2; |
278 | self.matches[offset].len() |
279 | } |
280 | |
    #[inline(always)]
282 | fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { |
283 | debug_assert!(self.is_match(sid)); |
284 | let offset = (sid.as_usize() >> self.stride2) - 2; |
285 | self.matches[offset][index] |
286 | } |
287 | |
    #[inline(always)]
289 | fn memory_usage(&self) -> usize { |
290 | use core::mem::size_of; |
291 | |
292 | (self.trans.len() * size_of::<u32>()) |
293 | + (self.matches.len() * size_of::<Vec<PatternID>>()) |
294 | + self.matches_memory_usage |
295 | + (self.pattern_lens.len() * size_of::<SmallIndex>()) |
296 | + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) |
297 | } |
298 | |
    #[inline(always)]
300 | fn prefilter(&self) -> Option<&Prefilter> { |
301 | self.prefilter.as_ref() |
302 | } |
303 | } |
304 | |
305 | impl core::fmt::Debug for DFA { |
306 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
307 | use crate::{ |
308 | automaton::{fmt_state_indicator, sparse_transitions}, |
309 | util::debug::DebugByte, |
310 | }; |
311 | |
312 | writeln!(f, "dfa::DFA(" )?; |
313 | for index in 0..self.state_len { |
314 | let sid = StateID::new_unchecked(index << self.stride2); |
315 | // While we do currently include the FAIL state in the transition |
316 | // table (to simplify construction), it is never actually used. It |
317 | // poses problems with the code below because it gets treated as |
318 | // a match state incidentally when it is, of course, not. So we |
319 | // special case it. The fail state is always the first state after |
320 | // the dead state. |
321 | // |
322 | // If the construction is changed to remove the fail state (it |
323 | // probably should be), then this special case should be updated. |
324 | if index == 1 { |
325 | writeln!(f, "F {:06}:" , sid.as_usize())?; |
326 | continue; |
327 | } |
328 | fmt_state_indicator(f, self, sid)?; |
            write!(f, "{:06}: ", sid.as_usize())?;
330 | |
331 | let it = (0..self.byte_classes.alphabet_len()).map(|class| { |
332 | (class.as_u8(), self.trans[sid.as_usize() + class]) |
333 | }); |
334 | for (i, (start, end, next)) in sparse_transitions(it).enumerate() { |
335 | if i > 0 { |
                    write!(f, ", ")?;
337 | } |
338 | if start == end { |
339 | write!( |
340 | f, |
341 | " {:?} => {:?}" , |
342 | DebugByte(start), |
343 | next.as_usize() |
344 | )?; |
345 | } else { |
346 | write!( |
347 | f, |
348 | " {:?}- {:?} => {:?}" , |
349 | DebugByte(start), |
350 | DebugByte(end), |
351 | next.as_usize() |
352 | )?; |
353 | } |
354 | } |
            write!(f, "\n")?;
356 | if self.is_match(sid) { |
                write!(f, " matches: ")?;
358 | for i in 0..self.match_len(sid) { |
359 | if i > 0 { |
                        write!(f, ", ")?;
361 | } |
362 | let pid = self.match_pattern(sid, i); |
                write!(f, "{}", pid.as_usize())?;
364 | } |
                write!(f, "\n")?;
366 | } |
367 | } |
368 | writeln!(f, "match kind: {:?}" , self.match_kind)?; |
369 | writeln!(f, "prefilter: {:?}" , self.prefilter.is_some())?; |
370 | writeln!(f, "state length: {:?}" , self.state_len)?; |
371 | writeln!(f, "pattern length: {:?}" , self.patterns_len())?; |
372 | writeln!(f, "shortest pattern length: {:?}" , self.min_pattern_len)?; |
373 | writeln!(f, "longest pattern length: {:?}" , self.max_pattern_len)?; |
374 | writeln!(f, "alphabet length: {:?}" , self.alphabet_len)?; |
375 | writeln!(f, "stride: {:?}" , 1 << self.stride2)?; |
376 | writeln!(f, "byte classes: {:?}" , self.byte_classes)?; |
377 | writeln!(f, "memory usage: {:?}" , self.memory_usage())?; |
378 | writeln!(f, ")" )?; |
379 | Ok(()) |
380 | } |
381 | } |
382 | |
383 | /// A builder for configuring an Aho-Corasick DFA. |
384 | /// |
385 | /// This builder has a subset of the options available to a |
386 | /// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options, |
387 | /// their behavior is identical. |
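///
/// # Example
///
/// A minimal sketch of configuring a few of the shared options; it assumes
/// the crate-root re-exports of `Input`, `Match` and `MatchKind`:
///
/// ```
/// use aho_corasick::{automaton::Automaton, dfa::DFA, Input, Match, MatchKind};
///
/// let dfa = DFA::builder()
///     .match_kind(MatchKind::LeftmostLongest)
///     .ascii_case_insensitive(true)
///     .build(&["abc", "abcd"])
///     .unwrap();
/// assert_eq!(
///     Some(Match::must(1, 0..4)),
///     dfa.try_find(&Input::new("ABCD"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```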
#[derive(Clone, Debug)]
389 | pub struct Builder { |
390 | noncontiguous: noncontiguous::Builder, |
391 | start_kind: StartKind, |
392 | byte_classes: bool, |
393 | } |
394 | |
395 | impl Default for Builder { |
396 | fn default() -> Builder { |
397 | Builder { |
398 | noncontiguous: noncontiguous::Builder::new(), |
399 | start_kind: StartKind::Unanchored, |
400 | byte_classes: true, |
401 | } |
402 | } |
403 | } |
404 | |
405 | impl Builder { |
406 | /// Create a new builder for configuring an Aho-Corasick DFA. |
407 | pub fn new() -> Builder { |
408 | Builder::default() |
409 | } |
410 | |
411 | /// Build an Aho-Corasick DFA from the given iterator of patterns. |
412 | /// |
413 | /// A builder may be reused to create more DFAs. |
414 | pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError> |
415 | where |
416 | I: IntoIterator<Item = P>, |
417 | P: AsRef<[u8]>, |
418 | { |
419 | let nnfa = self.noncontiguous.build(patterns)?; |
420 | self.build_from_noncontiguous(&nnfa) |
421 | } |
422 | |
423 | /// Build an Aho-Corasick DFA from the given noncontiguous NFA. |
424 | /// |
425 | /// Note that when this method is used, only the `start_kind` and |
426 | /// `byte_classes` settings on this builder are respected. The other |
427 | /// settings only apply to the initial construction of the Aho-Corasick |
428 | /// automaton. Since using this method requires that initial construction |
429 | /// has already completed, all settings impacting only initial construction |
430 | /// are no longer relevant. |
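    ///
    /// # Example
    ///
    /// A minimal sketch of building the noncontiguous NFA explicitly and then
    /// converting it to a DFA; it assumes the public `noncontiguous::NFA::new`
    /// constructor and the crate-root re-exports of `Input` and `Match`:
    ///
    /// ```
    /// use aho_corasick::{
    ///     automaton::Automaton,
    ///     dfa::DFA,
    ///     nfa::noncontiguous,
    ///     Input, Match,
    /// };
    ///
    /// let nnfa = noncontiguous::NFA::new(&["b", "abc", "abcd"]).unwrap();
    /// let dfa = DFA::builder().build_from_noncontiguous(&nnfa).unwrap();
    /// assert_eq!(
    ///     Some(Match::must(0, 1..2)),
    ///     dfa.try_find(&Input::new("abcd"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```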
431 | pub fn build_from_noncontiguous( |
432 | &self, |
433 | nnfa: &noncontiguous::NFA, |
434 | ) -> Result<DFA, BuildError> { |
435 | debug!("building DFA" ); |
436 | let byte_classes = if self.byte_classes { |
437 | nnfa.byte_classes().clone() |
438 | } else { |
439 | ByteClasses::singletons() |
440 | }; |
441 | let state_len = match self.start_kind { |
442 | StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(), |
443 | StartKind::Both => { |
444 | // These unwraps are OK because we know that the number of |
445 | // NFA states is < StateID::LIMIT which is in turn less than |
446 | // i32::MAX. Thus, there is always room to multiply by 2. |
447 | // Finally, the number of states is always at least 4 in the |
448 | // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the |
449 | // subtraction of 4 is okay. |
450 | // |
451 | // Note that we subtract 4 because the "anchored" part of |
452 | // the DFA duplicates the unanchored part (without failure |
453 | // transitions), but reuses the DEAD, FAIL and START states. |
454 | nnfa.states() |
455 | .len() |
456 | .checked_mul(2) |
457 | .unwrap() |
458 | .checked_sub(4) |
459 | .unwrap() |
460 | } |
461 | }; |
462 | let trans_len = |
463 | match state_len.checked_shl(byte_classes.stride2().as_u32()) { |
464 | Some(trans_len) => trans_len, |
465 | None => { |
466 | return Err(BuildError::state_id_overflow( |
467 | StateID::MAX.as_u64(), |
468 | usize::MAX.as_u64(), |
469 | )) |
470 | } |
471 | }; |
472 | StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap()) |
473 | .map_err(|e| { |
474 | BuildError::state_id_overflow( |
475 | StateID::MAX.as_u64(), |
476 | e.attempted(), |
477 | ) |
478 | })?; |
479 | let num_match_states = match self.start_kind { |
480 | StartKind::Unanchored | StartKind::Anchored => { |
481 | nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap() |
482 | } |
483 | StartKind::Both => nnfa |
484 | .special() |
485 | .max_match_id |
486 | .as_usize() |
487 | .checked_sub(1) |
488 | .unwrap() |
489 | .checked_mul(2) |
490 | .unwrap(), |
491 | }; |
492 | let mut dfa = DFA { |
493 | trans: vec![DFA::DEAD; trans_len], |
494 | matches: vec![vec![]; num_match_states], |
495 | matches_memory_usage: 0, |
496 | pattern_lens: nnfa.pattern_lens_raw().to_vec(), |
497 | prefilter: nnfa.prefilter().map(|p| p.clone()), |
498 | match_kind: nnfa.match_kind(), |
499 | state_len, |
500 | alphabet_len: byte_classes.alphabet_len(), |
501 | stride2: byte_classes.stride2(), |
502 | byte_classes, |
503 | min_pattern_len: nnfa.min_pattern_len(), |
504 | max_pattern_len: nnfa.max_pattern_len(), |
505 | // The special state IDs are set later. |
506 | special: Special::zero(), |
507 | }; |
508 | match self.start_kind { |
509 | StartKind::Both => { |
510 | self.finish_build_both_starts(nnfa, &mut dfa); |
511 | } |
512 | StartKind::Unanchored => { |
513 | self.finish_build_one_start(Anchored::No, nnfa, &mut dfa); |
514 | } |
515 | StartKind::Anchored => { |
516 | self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa) |
517 | } |
518 | } |
519 | debug!( |
520 | "DFA built, <states: {:?}, size: {:?}, \ |
             alphabet len: {:?}, stride: {:?}>",
522 | dfa.state_len, |
523 | dfa.memory_usage(), |
524 | dfa.byte_classes.alphabet_len(), |
525 | dfa.byte_classes.stride(), |
526 | ); |
527 | Ok(dfa) |
528 | } |
529 | |
530 | /// Finishes building a DFA for either unanchored or anchored searches, |
531 | /// but NOT both. |
532 | fn finish_build_one_start( |
533 | &self, |
534 | anchored: Anchored, |
535 | nnfa: &noncontiguous::NFA, |
536 | dfa: &mut DFA, |
537 | ) { |
538 | // This function always succeeds because we check above that all of the |
539 | // states in the NFA can be mapped to DFA state IDs. |
540 | let stride2 = dfa.stride2; |
541 | let old2new = |oldsid: StateID| { |
542 | StateID::new_unchecked(oldsid.as_usize() << stride2) |
543 | }; |
544 | for (oldsid, state) in nnfa.states().iter().with_state_ids() { |
545 | let newsid = old2new(oldsid); |
546 | if !state.matches.is_empty() { |
547 | dfa.set_matches(newsid, &state.matches); |
548 | } |
549 | sparse_iter( |
550 | state, |
551 | &dfa.byte_classes, |
552 | |byte, class, mut oldnextsid| { |
553 | if oldnextsid == noncontiguous::NFA::FAIL { |
554 | if anchored.is_anchored() { |
555 | oldnextsid = noncontiguous::NFA::DEAD; |
556 | } else { |
557 | oldnextsid = nnfa.next_state( |
558 | Anchored::No, |
559 | state.fail, |
560 | byte, |
561 | ); |
562 | } |
563 | } |
564 | dfa.trans[newsid.as_usize() + usize::from(class)] = |
565 | old2new(oldnextsid); |
566 | }, |
567 | ); |
568 | } |
569 | // Now that we've remapped all the IDs in our states, all that's left |
570 | // is remapping the special state IDs. |
571 | let old = nnfa.special(); |
572 | let new = &mut dfa.special; |
573 | new.max_special_id = old2new(old.max_special_id); |
574 | new.max_match_id = old2new(old.max_match_id); |
575 | if anchored.is_anchored() { |
576 | new.start_unanchored_id = DFA::DEAD; |
577 | new.start_anchored_id = old2new(old.start_anchored_id); |
578 | } else { |
579 | new.start_unanchored_id = old2new(old.start_unanchored_id); |
580 | new.start_anchored_id = DFA::DEAD; |
581 | } |
582 | } |
583 | |
584 | /// Finishes building a DFA that supports BOTH unanchored and anchored |
    /// searches. It works by interleaving unanchored states with anchored
586 | /// states in the same transition table. This way, we avoid needing to |
587 | /// re-shuffle states afterward to ensure that our states still look like |
588 | /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ... |
589 | /// |
590 | /// Honestly this is pretty inscrutable... Simplifications are most |
591 | /// welcome. |
592 | fn finish_build_both_starts( |
593 | &self, |
594 | nnfa: &noncontiguous::NFA, |
595 | dfa: &mut DFA, |
596 | ) { |
597 | let stride2 = dfa.stride2; |
598 | let stride = 1 << stride2; |
599 | let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()]; |
600 | let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()]; |
601 | let mut is_anchored = vec![false; dfa.state_len]; |
602 | let mut newsid = DFA::DEAD; |
603 | let next_dfa_id = |
604 | |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride); |
605 | for (oldsid, state) in nnfa.states().iter().with_state_ids() { |
606 | if oldsid == noncontiguous::NFA::DEAD |
607 | || oldsid == noncontiguous::NFA::FAIL |
608 | { |
609 | remap_unanchored[oldsid] = newsid; |
610 | remap_anchored[oldsid] = newsid; |
611 | newsid = next_dfa_id(newsid); |
612 | } else if oldsid == nnfa.special().start_unanchored_id |
613 | || oldsid == nnfa.special().start_anchored_id |
614 | { |
615 | if oldsid == nnfa.special().start_unanchored_id { |
616 | remap_unanchored[oldsid] = newsid; |
617 | remap_anchored[oldsid] = DFA::DEAD; |
618 | } else { |
619 | remap_unanchored[oldsid] = DFA::DEAD; |
620 | remap_anchored[oldsid] = newsid; |
621 | is_anchored[newsid.as_usize() >> stride2] = true; |
622 | } |
623 | if !state.matches.is_empty() { |
624 | dfa.set_matches(newsid, &state.matches); |
625 | } |
626 | sparse_iter( |
627 | state, |
628 | &dfa.byte_classes, |
629 | |_, class, oldnextsid| { |
630 | let class = usize::from(class); |
631 | if oldnextsid == noncontiguous::NFA::FAIL { |
632 | dfa.trans[newsid.as_usize() + class] = DFA::DEAD; |
633 | } else { |
634 | dfa.trans[newsid.as_usize() + class] = oldnextsid; |
635 | } |
636 | }, |
637 | ); |
638 | newsid = next_dfa_id(newsid); |
639 | } else { |
640 | let unewsid = newsid; |
641 | newsid = next_dfa_id(newsid); |
642 | let anewsid = newsid; |
643 | newsid = next_dfa_id(newsid); |
644 | |
645 | remap_unanchored[oldsid] = unewsid; |
646 | remap_anchored[oldsid] = anewsid; |
647 | is_anchored[anewsid.as_usize() >> stride2] = true; |
648 | if !state.matches.is_empty() { |
649 | dfa.set_matches(unewsid, &state.matches); |
650 | dfa.set_matches(anewsid, &state.matches); |
651 | } |
652 | sparse_iter( |
653 | state, |
654 | &dfa.byte_classes, |
655 | |byte, class, oldnextsid| { |
656 | let class = usize::from(class); |
657 | if oldnextsid == noncontiguous::NFA::FAIL { |
658 | dfa.trans[unewsid.as_usize() + class] = nnfa |
659 | .next_state(Anchored::No, state.fail, byte); |
660 | } else { |
661 | dfa.trans[unewsid.as_usize() + class] = oldnextsid; |
662 | dfa.trans[anewsid.as_usize() + class] = oldnextsid; |
663 | } |
664 | }, |
665 | ); |
666 | } |
667 | } |
668 | for i in 0..dfa.state_len { |
669 | let sid = i << stride2; |
670 | if is_anchored[i] { |
671 | for next in dfa.trans[sid..][..stride].iter_mut() { |
672 | *next = remap_anchored[*next]; |
673 | } |
674 | } else { |
675 | for next in dfa.trans[sid..][..stride].iter_mut() { |
676 | *next = remap_unanchored[*next]; |
677 | } |
678 | } |
679 | } |
680 | // Now that we've remapped all the IDs in our states, all that's left |
681 | // is remapping the special state IDs. |
682 | let old = nnfa.special(); |
683 | let new = &mut dfa.special; |
684 | new.max_special_id = remap_anchored[old.max_special_id]; |
685 | new.max_match_id = remap_anchored[old.max_match_id]; |
686 | new.start_unanchored_id = remap_unanchored[old.start_unanchored_id]; |
687 | new.start_anchored_id = remap_anchored[old.start_anchored_id]; |
688 | } |
689 | |
690 | /// Set the desired match semantics. |
691 | /// |
692 | /// This only applies when using [`Builder::build`] and not |
693 | /// [`Builder::build_from_noncontiguous`]. |
694 | /// |
695 | /// See |
696 | /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) |
697 | /// for more documentation and examples. |
698 | pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { |
699 | self.noncontiguous.match_kind(kind); |
700 | self |
701 | } |
702 | |
703 | /// Enable ASCII-aware case insensitive matching. |
704 | /// |
705 | /// This only applies when using [`Builder::build`] and not |
706 | /// [`Builder::build_from_noncontiguous`]. |
707 | /// |
708 | /// See |
709 | /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) |
710 | /// for more documentation and examples. |
711 | pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { |
712 | self.noncontiguous.ascii_case_insensitive(yes); |
713 | self |
714 | } |
715 | |
716 | /// Enable heuristic prefilter optimizations. |
717 | /// |
718 | /// This only applies when using [`Builder::build`] and not |
719 | /// [`Builder::build_from_noncontiguous`]. |
720 | /// |
721 | /// See |
722 | /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) |
723 | /// for more documentation and examples. |
724 | pub fn prefilter(&mut self, yes: bool) -> &mut Builder { |
725 | self.noncontiguous.prefilter(yes); |
726 | self |
727 | } |
728 | |
729 | /// Sets the starting state configuration for the automaton. |
730 | /// |
731 | /// See |
732 | /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) |
733 | /// for more documentation and examples. |
734 | pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder { |
735 | self.start_kind = kind; |
736 | self |
737 | } |
738 | |
739 | /// A debug setting for whether to attempt to shrink the size of the |
740 | /// automaton's alphabet or not. |
741 | /// |
742 | /// This should never be enabled unless you're debugging an automaton. |
743 | /// Namely, disabling byte classes makes transitions easier to reason |
744 | /// about, since they use the actual bytes instead of equivalence classes. |
745 | /// Disabling this confers no performance benefit at search time. |
746 | /// |
747 | /// See |
748 | /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes) |
749 | /// for more documentation and examples. |
750 | pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { |
751 | self.byte_classes = yes; |
752 | self |
753 | } |
754 | } |
755 | |
756 | /// Iterate over all possible equivalence class transitions in this state. |
757 | /// The closure is called for all transitions with a distinct equivalence |
758 | /// class, even those not explicitly represented in this sparse state. For |
759 | /// any implicitly defined transitions, the given closure is called with |
760 | /// the fail state ID. |
761 | /// |
762 | /// The closure is guaranteed to be called precisely |
763 | /// `byte_classes.alphabet_len()` times, once for every possible class in |
764 | /// ascending order. |
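///
/// For example, suppose the byte classes are {[\x00-a] => 0, [b] => 1,
/// [c] => 2, [d-\xFF] => 3} and the given state only has explicit transitions
/// for 'b' and 'c'. Then the closure is called exactly four times: with
/// (0x00, 0, FAIL), (b'b', 1, next_b), (b'c', 2, next_c) and (b'd', 3, FAIL),
/// where 'next_b' and 'next_c' are the state's transition targets for 'b'
/// and 'c'.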
765 | fn sparse_iter<F: FnMut(u8, u8, StateID)>( |
766 | state: &noncontiguous::State, |
767 | classes: &ByteClasses, |
768 | mut f: F, |
769 | ) { |
770 | let mut prev_class = None; |
771 | let mut byte = 0usize; |
772 | for &(b, id) in state.trans.iter() { |
773 | while byte < usize::from(b) { |
774 | let rep = byte.as_u8(); |
775 | let class = classes.get(rep); |
776 | byte += 1; |
777 | if prev_class != Some(class) { |
778 | f(rep, class, noncontiguous::NFA::FAIL); |
779 | prev_class = Some(class); |
780 | } |
781 | } |
782 | let rep = b; |
783 | let class = classes.get(rep); |
784 | byte += 1; |
785 | if prev_class != Some(class) { |
786 | f(rep, class, id); |
787 | prev_class = Some(class); |
788 | } |
789 | } |
790 | for b in byte..=255 { |
791 | let rep = b.as_u8(); |
792 | let class = classes.get(rep); |
793 | if prev_class != Some(class) { |
794 | f(rep, class, noncontiguous::NFA::FAIL); |
795 | prev_class = Some(class); |
796 | } |
797 | } |
798 | } |
799 | |