dfa.rs - Codebrowser

1	/!*
2	Provides direct access to a DFA implementation of Aho-Corasick.
3
4	This is a low-level API that generally only needs to be used in niche
5	circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6	instead of a DFA directly. Using an `DFA` directly is typically only necessary
7	when one needs access to the [`Automaton`] trait implementation.
8	*/
9
10	use alloc::{vec, vec::Vec};
11
12	use crate::{
13	automaton::Automaton,
14	nfa::noncontiguous,
15	util::{
16	alphabet::ByteClasses,
17	error::{BuildError, MatchError},
18	int::{Usize, U32},
19	prefilter::Prefilter,
20	primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
21	search::{Anchored, MatchKind, StartKind},
22	special::Special,
23	},
24	};
25
26	/// A DFA implementation of Aho-Corasick.
27	///
28	/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
29	/// this type directly. Using a `DFA` directly is typically only necessary when
30	/// one needs access to the [`Automaton`] trait implementation.
31	///
32	/// This DFA can only be built by first constructing a [`noncontiguous::NFA`].
33	/// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but
34	/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
35	///
36	/// A DFA provides the best possible search performance (in this crate) via two
37	/// mechanisms:
38	///
39	/// All states use a dense representation for their transitions.*
40	/// All failure transitions are pre-computed such that they are never*
41	/// explicitly handled at search time.
42	///
43	/// These two facts combined mean that every state transition is performed
44	/// using a constant number of instructions. However, this comes at
45	/// great cost. The memory usage of a DFA can be quite exorbitant.
46	/// It is potentially multiple orders of magnitude greater than a
47	/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange,
48	/// a DFA will typically have better search speed than a `contiguous::NFA`, but
49	/// not by orders of magnitude.
50	///
51	/// Unless you have a small number of patterns or memory usage is not a concern
52	/// and search performance is critical, a DFA is usually not the best choice.
53	///
54	/// Moreover, unlike the NFAs in this crate, it is costly for a DFA to
55	/// support for anchored and unanchored search configurations. Namely,
56	/// since failure transitions are pre-computed, supporting both anchored
57	/// and unanchored searches requires a duplication of the transition table,
58	/// making the memory usage of such a DFA ever bigger. (The NFAs in this crate
59	/// unconditionally support both anchored and unanchored searches because there
60	/// is essentially no added cost for doing so.) It is for this reason that
61	/// a DFA's support for anchored and unanchored searches can be configured
62	/// via [`Builder::start_kind`]. By default, a DFA only supports unanchored
63	/// searches.
64	///
65	/// # Example
66	///
67	/// This example shows how to build an `DFA` directly and use it to execute
68	/// [`Automaton::try_find`]:
69	///
70	/// ```
71	/// use aho_corasick::{
72	/// automaton::Automaton,
73	/// dfa::DFA,
74	/// Input, Match,
75	/// };
76	///
77	/// let patterns = &["b", "abc", "abcd"];
78	/// let haystack = "abcd";
79	///
80	/// let nfa = DFA::new(patterns).unwrap();
81	/// assert_eq!(
82	/// Some(Match::must(`0`, `1`..`2`)),
83	/// nfa.try_find(&Input::new(haystack))?,
84	/// );
85	/// # Ok::<(), Box<dyn std::error::Error>>(())
86	/// ```
87	///
88	/// It is also possible to implement your own version of `try_find`. See the
89	/// [`Automaton`] documentation for an example.
90	#[derive(Clone)]
91	pub struct DFA {
92	/// The DFA transition table. IDs in this table are pre-multiplied. So
93	/// instead of the IDs being 0, 1, 2, 3, ..., they are 0stride, 1stride,
94	/// 2stride, 3stride, ...
95	trans: Vec<StateID>,
96	/// The matches for every match state in this DFA. This is first indexed by
97	/// state index (so that's `sid >> stride2`) and then by order in which the
98	/// matches are meant to occur.
99	matches: Vec<Vec<PatternID>>,
100	/// The amount of heap memory used, in bytes, by the inner Vecs of
101	/// 'matches'.
102	matches_memory_usage: usize,
103	/// The length of each pattern. This is used to compute the start offset
104	/// of a match.
105	pattern_lens: Vec<SmallIndex>,
106	/// A prefilter for accelerating searches, if one exists.
107	prefilter: Option<Prefilter>,
108	/// The match semantics built into this DFA.
109	match_kind: MatchKind,
110	/// The total number of states in this DFA.
111	state_len: usize,
112	/// The alphabet size, or total number of equivalence classes, for this
113	/// DFA. Note that the actual number of transitions in each state is
114	/// stride=2^stride2, where stride is the smallest power of 2 greater than
115	/// or equal to alphabet_len. We do things this way so that we can use
116	/// bitshifting to go from a state ID to an index into 'matches'.
117	alphabet_len: usize,
118	/// The exponent with a base 2, such that stride=2^stride2. Given a state
119	/// index 'i', its state identifier is 'i << stride2'. Given a state
120	/// identifier 'sid', its state index is 'sid >> stride2'.
121	stride2: usize,
122	/// The equivalence classes for this DFA. All transitions are defined on
123	/// equivalence classes and not on the 256 distinct byte values.
124	byte_classes: ByteClasses,
125	/// The length of the shortest pattern in this automaton.
126	min_pattern_len: usize,
127	/// The length of the longest pattern in this automaton.
128	max_pattern_len: usize,
129	/// The information required to deduce which states are "special" in this
130	/// DFA.
131	special: Special,
132	}
133
134	impl DFA {
135	/// Create a new Aho-Corasick DFA using the default configuration.
136	///
137	/// Use a [`Builder`] if you want to change the configuration.
138	pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError>
139	where
140	I: IntoIterator<Item = P>,
141	P: AsRef<[u8]>,
142	{
143	DFA::builder().build(patterns)
144	}
145
146	/// A convenience method for returning a new Aho-Corasick DFA builder.
147	///
148	/// This usually permits one to just import the `DFA` type.
149	pub fn builder() -> Builder {
150	Builder::new()
151	}
152	}
153
154	impl DFA {
155	/// A sentinel state ID indicating that a search should stop once it has
156	/// entered this state. When a search stops, it returns a match if one has
157	/// been found, otherwise no match. A DFA always has an actual dead state
158	/// at this ID.
159	///
160	/// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state.
161	/// Namely, the whole point of a DFA is that the FAIL state is completely
162	/// compiled away. That is, DFA construction involves pre-computing the
163	/// failure transitions everywhere, such that failure transitions are no
164	/// longer used at search time. This, combined with its uniformly dense
165	/// representation, are the two most important factors in why it's faster
166	/// than the NFAs in this crate.
167	const DEAD: StateID = StateID::new_unchecked(`0`);
168
169	/// Adds the given pattern IDs as matches to the given state and also
170	/// records the added memory usage.
171	fn set_matches(
172	&mut self,
173	sid: StateID,
174	pids: impl Iterator<Item = PatternID>,
175	) {
176	let index = (sid.as_usize() >> self.stride2).checked_sub(`2`).unwrap();
177	let mut at_least_one = `false`;
178	for pid in pids {
179	self.matches[index].push(pid);
180	self.matches_memory_usage += PatternID::SIZE;
181	at_least_one = `true`;
182	}
183	assert!(at_least_one, "match state must have non-empty pids");
184	}
185	}
186
187	// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
188	// returns a valid state ID given a valid state ID. We otherwise claim that
189	// all other methods are correct as well.
190	unsafe impl Automaton for DFA {
191	#[inline(always)]
192	fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
193	// Either of the start state IDs can be DEAD, in which case, support
194	// for that type of search is not provided by this DFA. Which start
195	// state IDs are inactive depends on the 'StartKind' configuration at
196	// DFA construction time.
197	match anchored {
198	Anchored::No => {
199	let start = self.special.start_unanchored_id;
200	if start == DFA::DEAD {
201	Err(MatchError::invalid_input_unanchored())
202	} else {
203	Ok(start)
204	}
205	}
206	Anchored::Yes => {
207	let start = self.special.start_anchored_id;
208	if start == DFA::DEAD {
209	Err(MatchError::invalid_input_anchored())
210	} else {
211	Ok(start)
212	}
213	}
214	}
215	}
216
217	#[inline(always)]
218	fn next_state(
219	&self,
220	_anchored: Anchored,
221	sid: StateID,
222	byte: u8,
223	) -> StateID {
224	let class = self.byte_classes.get(byte);
225	self.trans[(sid.as_u32() + u32::from(class)).as_usize()]
226	}
227
228	#[inline(always)]
229	fn is_special(&self, sid: StateID) -> bool {
230	sid <= self.special.max_special_id
231	}
232
233	#[inline(always)]
234	fn is_dead(&self, sid: StateID) -> bool {
235	sid == DFA::DEAD
236	}
237
238	#[inline(always)]
239	fn is_match(&self, sid: StateID) -> bool {
240	!self.is_dead(sid) && sid <= self.special.max_match_id
241	}
242
243	#[inline(always)]
244	fn is_start(&self, sid: StateID) -> bool {
245	sid == self.special.start_unanchored_id
246	\|\| sid == self.special.start_anchored_id
247	}
248
249	#[inline(always)]
250	fn match_kind(&self) -> MatchKind {
251	self.match_kind
252	}
253
254	#[inline(always)]
255	fn patterns_len(&self) -> usize {
256	self.pattern_lens.len()
257	}
258
259	#[inline(always)]
260	fn pattern_len(&self, pid: PatternID) -> usize {
261	self.pattern_lens[pid].as_usize()
262	}
263
264	#[inline(always)]
265	fn min_pattern_len(&self) -> usize {
266	self.min_pattern_len
267	}
268
269	#[inline(always)]
270	fn max_pattern_len(&self) -> usize {
271	self.max_pattern_len
272	}
273
274	#[inline(always)]
275	fn match_len(&self, sid: StateID) -> usize {
276	debug_assert!(self.is_match(sid));
277	let offset = (sid.as_usize() >> self.stride2) - `2`;
278	self.matches[offset].len()
279	}
280
281	#[inline(always)]
282	fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
283	debug_assert!(self.is_match(sid));
284	let offset = (sid.as_usize() >> self.stride2) - `2`;
285	self.matches[offset][index]
286	}
287
288	#[inline(always)]
289	fn memory_usage(&self) -> usize {
290	use core::mem::size_of;
291
292	(self.trans.len() * size_of::<u32>())
293	+ (self.matches.len() * size_of::<Vec<PatternID>>())
294	+ self.matches_memory_usage
295	+ (self.pattern_lens.len() * size_of::<SmallIndex>())
296	+ self.prefilter.as_ref().map_or(`0`, \|p\| p.memory_usage())
297	}
298
299	#[inline(always)]
300	fn prefilter(&self) -> Option<&Prefilter> {
301	self.prefilter.as_ref()
302	}
303	}
304
305	impl core::fmt::Debug for DFA {
306	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
307	use crate::{
308	automaton::{fmt_state_indicator, sparse_transitions},
309	util::debug::DebugByte,
310	};
311
312	writeln!(f, "dfa::DFA(")?;
313	for index in `0`..self.state_len {
314	let sid = StateID::new_unchecked(index << self.stride2);
315	// While we do currently include the FAIL state in the transition
316	// table (to simplify construction), it is never actually used. It
317	// poses problems with the code below because it gets treated as
318	// a match state incidentally when it is, of course, not. So we
319	// special case it. The fail state is always the first state after
320	// the dead state.
321	//
322	// If the construction is changed to remove the fail state (it
323	// probably should be), then this special case should be updated.
324	if index == `1` {
325	writeln!(f, "F {:06}:", sid.as_usize())?;
326	continue;
327	}
328	fmt_state_indicator(f, self, sid)?;
329	write!(f, "{:06}: ", sid.as_usize())?;
330
331	let it = (`0`..self.byte_classes.alphabet_len()).map(\|class\| {
332	(class.as_u8(), self.trans[sid.as_usize() + class])
333	});
334	for (i, (start, end, next)) in sparse_transitions(it).enumerate() {
335	if i > `0` {
336	write!(f, ", ")?;
337	}
338	if start == end {
339	write!(
340	f,
341	"{:?} => {:?}",
342	DebugByte(start),
343	next.as_usize()
344	)?;
345	} else {
346	write!(
347	f,
348	"{:?}-{:?} => {:?}",
349	DebugByte(start),
350	DebugByte(end),
351	next.as_usize()
352	)?;
353	}
354	}
355	write!(f, "`\n`")?;
356	if self.is_match(sid) {
357	write!(f, " matches: ")?;
358	for i in `0`..self.match_len(sid) {
359	if i > `0` {
360	write!(f, ", ")?;
361	}
362	let pid = self.match_pattern(sid, i);
363	write!(f, "{}", pid.as_usize())?;
364	}
365	write!(f, "`\n`")?;
366	}
367	}
368	writeln!(f, "match kind: {:?}", self.match_kind)?;
369	writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
370	writeln!(f, "state length: {:?}", self.state_len)?;
371	writeln!(f, "pattern length: {:?}", self.patterns_len())?;
372	writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
373	writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
374	writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
375	writeln!(f, "stride: {:?}", `1` << self.stride2)?;
376	writeln!(f, "byte classes: {:?}", self.byte_classes)?;
377	writeln!(f, "memory usage: {:?}", self.memory_usage())?;
378	writeln!(f, ")")?;
379	Ok(())
380	}
381	}
382
383	/// A builder for configuring an Aho-Corasick DFA.
384	///
385	/// This builder has a subset of the options available to a
386	/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
387	/// their behavior is identical.
388	#[derive(Clone, Debug)]
389	pub struct Builder {
390	noncontiguous: noncontiguous::Builder,
391	start_kind: StartKind,
392	byte_classes: bool,
393	}
394
395	impl Default for Builder {
396	fn default() -> Builder {
397	Builder {
398	noncontiguous: noncontiguous::Builder::new(),
399	start_kind: StartKind::Unanchored,
400	byte_classes: `true`,
401	}
402	}
403	}
404
405	impl Builder {
406	/// Create a new builder for configuring an Aho-Corasick DFA.
407	pub fn new() -> Builder {
408	Builder::default()
409	}
410
411	/// Build an Aho-Corasick DFA from the given iterator of patterns.
412	///
413	/// A builder may be reused to create more DFAs.
414	pub fn build<I, P>(&self, patterns: I) -> Result<DFA, BuildError>
415	where
416	I: IntoIterator<Item = P>,
417	P: AsRef<[u8]>,
418	{
419	let nnfa = self.noncontiguous.build(patterns)?;
420	self.build_from_noncontiguous(&nnfa)
421	}
422
423	/// Build an Aho-Corasick DFA from the given noncontiguous NFA.
424	///
425	/// Note that when this method is used, only the `start_kind` and
426	/// `byte_classes` settings on this builder are respected. The other
427	/// settings only apply to the initial construction of the Aho-Corasick
428	/// automaton. Since using this method requires that initial construction
429	/// has already completed, all settings impacting only initial construction
430	/// are no longer relevant.
431	pub fn build_from_noncontiguous(
432	&self,
433	nnfa: &noncontiguous::NFA,
434	) -> Result<DFA, BuildError> {
435	debug!("building DFA");
436	let byte_classes = if self.byte_classes {
437	nnfa.byte_classes().clone()
438	} else {
439	ByteClasses::singletons()
440	};
441	let state_len = match self.start_kind {
442	StartKind::Unanchored \| StartKind::Anchored => nnfa.states().len(),
443	StartKind::Both => {
444	// These unwraps are OK because we know that the number of
445	// NFA states is < StateID::LIMIT which is in turn less than
446	// i32::MAX. Thus, there is always room to multiply by 2.
447	// Finally, the number of states is always at least 4 in the
448	// NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the
449	// subtraction of 4 is okay.
450	//
451	// Note that we subtract 4 because the "anchored" part of
452	// the DFA duplicates the unanchored part (without failure
453	// transitions), but reuses the DEAD, FAIL and START states.
454	nnfa.states()
455	.len()
456	.checked_mul(`2`)
457	.unwrap()
458	.checked_sub(`4`)
459	.unwrap()
460	}
461	};
462	let trans_len =
463	match state_len.checked_shl(byte_classes.stride2().as_u32()) {
464	Some(trans_len) => trans_len,
465	None => {
466	return Err(BuildError::state_id_overflow(
467	StateID::MAX.as_u64(),
468	usize::MAX.as_u64(),
469	))
470	}
471	};
472	StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap())
473	.map_err(\|e\| {
474	BuildError::state_id_overflow(
475	StateID::MAX.as_u64(),
476	e.attempted(),
477	)
478	})?;
479	let num_match_states = match self.start_kind {
480	StartKind::Unanchored \| StartKind::Anchored => {
481	nnfa.special().max_match_id.as_usize().checked_sub(`1`).unwrap()
482	}
483	StartKind::Both => nnfa
484	.special()
485	.max_match_id
486	.as_usize()
487	.checked_sub(`1`)
488	.unwrap()
489	.checked_mul(`2`)
490	.unwrap(),
491	};
492	let mut dfa = DFA {
493	trans: vec![DFA::DEAD; trans_len],
494	matches: vec![vec![]; num_match_states],
495	matches_memory_usage: `0`,
496	pattern_lens: nnfa.pattern_lens_raw().to_vec(),
497	prefilter: nnfa.prefilter().map(\|p\| p.clone()),
498	match_kind: nnfa.match_kind(),
499	state_len,
500	alphabet_len: byte_classes.alphabet_len(),
501	stride2: byte_classes.stride2(),
502	byte_classes,
503	min_pattern_len: nnfa.min_pattern_len(),
504	max_pattern_len: nnfa.max_pattern_len(),
505	// The special state IDs are set later.
506	special: Special::zero(),
507	};
508	match self.start_kind {
509	StartKind::Both => {
510	self.finish_build_both_starts(nnfa, &mut dfa);
511	}
512	StartKind::Unanchored => {
513	self.finish_build_one_start(Anchored::No, nnfa, &mut dfa);
514	}
515	StartKind::Anchored => {
516	self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa)
517	}
518	}
519	debug!(
520	"DFA built, <states: {:?}, size: {:?}, \
521	alphabet len: {:?}, stride: {:?}>",
522	dfa.state_len,
523	dfa.memory_usage(),
524	dfa.byte_classes.alphabet_len(),
525	dfa.byte_classes.stride(),
526	);
527	// The vectors can grow ~twice as big during construction because a
528	// Vec amortizes growth. But here, let's shrink things back down to
529	// what we actually need since we're never going to add more to it.
530	dfa.trans.shrink_to_fit();
531	dfa.pattern_lens.shrink_to_fit();
532	dfa.matches.shrink_to_fit();
533	// TODO: We might also want to shrink each Vec inside of `dfa.matches`,
534	// or even better, convert it to one contiguous allocation. But I think
535	// I went with nested allocs for good reason (can't remember), so this
536	// may be tricky to do. I decided not to shrink them here because it
537	// might require a fair bit of work to do. It's unclear whether it's
538	// worth it.
539	Ok(dfa)
540	}
541
542	/// Finishes building a DFA for either unanchored or anchored searches,
543	/// but NOT both.
544	fn finish_build_one_start(
545	&self,
546	anchored: Anchored,
547	nnfa: &noncontiguous::NFA,
548	dfa: &mut DFA,
549	) {
550	// This function always succeeds because we check above that all of the
551	// states in the NFA can be mapped to DFA state IDs.
552	let stride2 = dfa.stride2;
553	let old2new = \|oldsid: StateID\| {
554	StateID::new_unchecked(oldsid.as_usize() << stride2)
555	};
556	for (oldsid, state) in nnfa.states().iter().with_state_ids() {
557	let newsid = old2new(oldsid);
558	if state.is_match() {
559	dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
560	}
561	sparse_iter(
562	nnfa,
563	oldsid,
564	&dfa.byte_classes,
565	\|byte, class, mut oldnextsid\| {
566	if oldnextsid == noncontiguous::NFA::FAIL {
567	if anchored.is_anchored() {
568	oldnextsid = noncontiguous::NFA::DEAD;
569	} else if state.fail() == noncontiguous::NFA::DEAD {
570	// This is a special case that avoids following
571	// DEAD transitions in a non-contiguous NFA.
572	// Following these transitions is pretty slow
573	// because the non-contiguous NFA will always use
574	// a sparse representation for it (because the
575	// DEAD state is usually treated as a sentinel).
576	// The vast* majority of failure states are DEAD*
577	// states, so this winds up being pretty slow if
578	// we go through the non-contiguous NFA state
579	// transition logic. Instead, just do it ourselves.
580	oldnextsid = noncontiguous::NFA::DEAD;
581	} else {
582	oldnextsid = nnfa.next_state(
583	Anchored::No,
584	state.fail(),
585	byte,
586	);
587	}
588	}
589	dfa.trans[newsid.as_usize() + usize::from(class)] =
590	old2new(oldnextsid);
591	},
592	);
593	}
594	// Now that we've remapped all the IDs in our states, all that's left
595	// is remapping the special state IDs.
596	let old = nnfa.special();
597	let new = &mut dfa.special;
598	new.max_special_id = old2new(old.max_special_id);
599	new.max_match_id = old2new(old.max_match_id);
600	if anchored.is_anchored() {
601	new.start_unanchored_id = DFA::DEAD;
602	new.start_anchored_id = old2new(old.start_anchored_id);
603	} else {
604	new.start_unanchored_id = old2new(old.start_unanchored_id);
605	new.start_anchored_id = DFA::DEAD;
606	}
607	}
608
609	/// Finishes building a DFA that supports BOTH unanchored and anchored
610	/// searches. It works by inter-leaving unanchored states with anchored
611	/// states in the same transition table. This way, we avoid needing to
612	/// re-shuffle states afterward to ensure that our states still look like
613	/// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ...
614	///
615	/// Honestly this is pretty inscrutable... Simplifications are most
616	/// welcome.
617	fn finish_build_both_starts(
618	&self,
619	nnfa: &noncontiguous::NFA,
620	dfa: &mut DFA,
621	) {
622	let stride2 = dfa.stride2;
623	let stride = `1` << stride2;
624	let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()];
625	let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()];
626	let mut is_anchored = vec![`false`; dfa.state_len];
627	let mut newsid = DFA::DEAD;
628	let next_dfa_id =
629	\|sid: StateID\| StateID::new_unchecked(sid.as_usize() + stride);
630	for (oldsid, state) in nnfa.states().iter().with_state_ids() {
631	if oldsid == noncontiguous::NFA::DEAD
632	\|\| oldsid == noncontiguous::NFA::FAIL
633	{
634	remap_unanchored[oldsid] = newsid;
635	remap_anchored[oldsid] = newsid;
636	newsid = next_dfa_id(newsid);
637	} else if oldsid == nnfa.special().start_unanchored_id
638	\|\| oldsid == nnfa.special().start_anchored_id
639	{
640	if oldsid == nnfa.special().start_unanchored_id {
641	remap_unanchored[oldsid] = newsid;
642	remap_anchored[oldsid] = DFA::DEAD;
643	} else {
644	remap_unanchored[oldsid] = DFA::DEAD;
645	remap_anchored[oldsid] = newsid;
646	is_anchored[newsid.as_usize() >> stride2] = `true`;
647	}
648	if state.is_match() {
649	dfa.set_matches(newsid, nnfa.iter_matches(oldsid));
650	}
651	sparse_iter(
652	nnfa,
653	oldsid,
654	&dfa.byte_classes,
655	\|_, class, oldnextsid\| {
656	let class = usize::from(class);
657	if oldnextsid == noncontiguous::NFA::FAIL {
658	dfa.trans[newsid.as_usize() + class] = DFA::DEAD;
659	} else {
660	dfa.trans[newsid.as_usize() + class] = oldnextsid;
661	}
662	},
663	);
664	newsid = next_dfa_id(newsid);
665	} else {
666	let unewsid = newsid;
667	newsid = next_dfa_id(newsid);
668	let anewsid = newsid;
669	newsid = next_dfa_id(newsid);
670
671	remap_unanchored[oldsid] = unewsid;
672	remap_anchored[oldsid] = anewsid;
673	is_anchored[anewsid.as_usize() >> stride2] = `true`;
674	if state.is_match() {
675	dfa.set_matches(unewsid, nnfa.iter_matches(oldsid));
676	dfa.set_matches(anewsid, nnfa.iter_matches(oldsid));
677	}
678	sparse_iter(
679	nnfa,
680	oldsid,
681	&dfa.byte_classes,
682	\|byte, class, oldnextsid\| {
683	let class = usize::from(class);
684	if oldnextsid == noncontiguous::NFA::FAIL {
685	let oldnextsid =
686	if state.fail() == noncontiguous::NFA::DEAD {
687	noncontiguous::NFA::DEAD
688	} else {
689	nnfa.next_state(
690	Anchored::No,
691	state.fail(),
692	byte,
693	)
694	};
695	dfa.trans[unewsid.as_usize() + class] = oldnextsid;
696	} else {
697	dfa.trans[unewsid.as_usize() + class] = oldnextsid;
698	dfa.trans[anewsid.as_usize() + class] = oldnextsid;
699	}
700	},
701	);
702	}
703	}
704	for i in `0`..dfa.state_len {
705	let sid = i << stride2;
706	if is_anchored[i] {
707	for next in dfa.trans[sid..][..stride].iter_mut() {
708	next = remap_anchored[next];
709	}
710	} else {
711	for next in dfa.trans[sid..][..stride].iter_mut() {
712	next = remap_unanchored[next];
713	}
714	}
715	}
716	// Now that we've remapped all the IDs in our states, all that's left
717	// is remapping the special state IDs.
718	let old = nnfa.special();
719	let new = &mut dfa.special;
720	new.max_special_id = remap_anchored[old.max_special_id];
721	new.max_match_id = remap_anchored[old.max_match_id];
722	new.start_unanchored_id = remap_unanchored[old.start_unanchored_id];
723	new.start_anchored_id = remap_anchored[old.start_anchored_id];
724	}
725
726	/// Set the desired match semantics.
727	///
728	/// This only applies when using [`Builder::build`] and not
729	/// [`Builder::build_from_noncontiguous`].
730	///
731	/// See
732	/// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
733	/// for more documentation and examples.
734	pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
735	self.noncontiguous.match_kind(kind);
736	self
737	}
738
739	/// Enable ASCII-aware case insensitive matching.
740	///
741	/// This only applies when using [`Builder::build`] and not
742	/// [`Builder::build_from_noncontiguous`].
743	///
744	/// See
745	/// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
746	/// for more documentation and examples.
747	pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
748	self.noncontiguous.ascii_case_insensitive(yes);
749	self
750	}
751
752	/// Enable heuristic prefilter optimizations.
753	///
754	/// This only applies when using [`Builder::build`] and not
755	/// [`Builder::build_from_noncontiguous`].
756	///
757	/// See
758	/// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
759	/// for more documentation and examples.
760	pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
761	self.noncontiguous.prefilter(yes);
762	self
763	}
764
765	/// Sets the starting state configuration for the automaton.
766	///
767	/// See
768	/// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind)
769	/// for more documentation and examples.
770	pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder {
771	self.start_kind = kind;
772	self
773	}
774
775	/// A debug setting for whether to attempt to shrink the size of the
776	/// automaton's alphabet or not.
777	///
778	/// This should never be enabled unless you're debugging an automaton.
779	/// Namely, disabling byte classes makes transitions easier to reason
780	/// about, since they use the actual bytes instead of equivalence classes.
781	/// Disabling this confers no performance benefit at search time.
782	///
783	/// See
784	/// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
785	/// for more documentation and examples.
786	pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
787	self.byte_classes = yes;
788	self
789	}
790	}
791
792	/// Iterate over all possible equivalence class transitions in this state.
793	/// The closure is called for all transitions with a distinct equivalence
794	/// class, even those not explicitly represented in this sparse state. For
795	/// any implicitly defined transitions, the given closure is called with
796	/// the fail state ID.
797	///
798	/// The closure is guaranteed to be called precisely
799	/// `byte_classes.alphabet_len()` times, once for every possible class in
800	/// ascending order.
801	fn sparse_iter<F: FnMut(u8, u8, StateID)>(
802	nnfa: &noncontiguous::NFA,
803	oldsid: StateID,
804	classes: &ByteClasses,
805	mut f: F,
806	) {
807	let mut prev_class = None;
808	let mut byte = `0usize`;
809	for t in nnfa.iter_trans(oldsid) {
810	while byte < usize::from(t.byte()) {
811	let rep = byte.as_u8();
812	let class = classes.get(rep);
813	byte += `1`;
814	if prev_class != Some(class) {
815	f(rep, class, noncontiguous::NFA::FAIL);
816	prev_class = Some(class);
817	}
818	}
819	let rep = t.byte();
820	let class = classes.get(rep);
821	byte += `1`;
822	if prev_class != Some(class) {
823	f(rep, class, t.next());
824	prev_class = Some(class);
825	}
826	}
827	for b in byte..=`255` {
828	let rep = b.as_u8();
829	let class = classes.get(rep);
830	if prev_class != Some(class) {
831	f(rep, class, noncontiguous::NFA::FAIL);
832	prev_class = Some(class);
833	}
834	}
835	}
836