contiguous.rs - Codebrowser

1	/!*
2	Provides a contiguous NFA implementation of Aho-Corasick.
3
4	This is a low-level API that generally only needs to be used in niche
5	circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
6	instead of a contiguous NFA directly. Using an `NFA` directly is typically only
7	necessary when one needs access to the [`Automaton`] trait implementation.
8	*/
9
10	use alloc::{vec, vec::Vec};
11
12	use crate::{
13	automaton::Automaton,
14	nfa::noncontiguous,
15	util::{
16	alphabet::ByteClasses,
17	error::{BuildError, MatchError},
18	int::{Usize, U16, U32},
19	prefilter::Prefilter,
20	primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
21	search::{Anchored, MatchKind},
22	special::Special,
23	},
24	};
25
26	/// A contiguous NFA implementation of Aho-Corasick.
27	///
28	/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
29	/// this type directly. Using an `NFA` directly is typically only necessary
30	/// when one needs access to the [`Automaton`] trait implementation.
31	///
32	/// This NFA can only be built by first constructing a [`noncontiguous::NFA`].
33	/// Both [`NFA::new`] and [`Builder::build`] do this for you automatically, but
34	/// [`Builder::build_from_noncontiguous`] permits doing it explicitly.
35	///
36	/// The main difference between a noncontiguous NFA and a contiguous NFA is
37	/// that the latter represents all of its states and transitions in a single
38	/// allocation, where as the former uses a separate allocation for each state.
39	/// Doing this at construction time while keeping a low memory footprint isn't
40	/// feasible, which is primarily why there are two different NFA types: one
41	/// that does the least amount of work possible to build itself, and another
42	/// that does a little extra work to compact itself and make state transitions
43	/// faster by making some states use a dense representation.
44	///
45	/// Because a contiguous NFA uses a single allocation, there is a lot more
46	/// opportunity for compression tricks to reduce the heap memory used. Indeed,
47	/// it is not uncommon for a contiguous NFA to use an order of magnitude less
48	/// heap memory than a noncontiguous NFA. Since building a contiguous NFA
49	/// usually only takes a fraction of the time it takes to build a noncontiguous
50	/// NFA, the overall build time is not much slower. Thus, in most cases, a
51	/// contiguous NFA is the best choice.
52	///
53	/// Since a contiguous NFA uses various tricks for compression and to achieve
54	/// faster state transitions, currently, its limit on the number of states
55	/// is somewhat smaller than what a noncontiguous NFA can achieve. Generally
56	/// speaking, you shouldn't expect to run into this limit if the number of
57	/// patterns is under 1 million. It is plausible that this limit will be
58	/// increased in the future. If the limit is reached, building a contiguous NFA
59	/// will return an error. Often, since building a contiguous NFA is relatively
60	/// cheap, it can make sense to always try it even if you aren't sure if it
61	/// will fail or not. If it does, you can always fall back to a noncontiguous
62	/// NFA. (Indeed, the main [`AhoCorasick`](crate::AhoCorasick) type employs a
63	/// strategy similar to this at construction time.)
64	///
65	/// # Example
66	///
67	/// This example shows how to build an `NFA` directly and use it to execute
68	/// [`Automaton::try_find`]:
69	///
70	/// ```
71	/// use aho_corasick::{
72	/// automaton::Automaton,
73	/// nfa::contiguous::NFA,
74	/// Input, Match,
75	/// };
76	///
77	/// let patterns = &["b", "abc", "abcd"];
78	/// let haystack = "abcd";
79	///
80	/// let nfa = NFA::new(patterns).unwrap();
81	/// assert_eq!(
82	/// Some(Match::must(`0`, `1`..`2`)),
83	/// nfa.try_find(&Input::new(haystack))?,
84	/// );
85	/// # Ok::<(), Box<dyn std::error::Error>>(())
86	/// ```
87	///
88	/// It is also possible to implement your own version of `try_find`. See the
89	/// [`Automaton`] documentation for an example.
90	#[derive(Clone)]
91	pub struct NFA {
92	/// The raw NFA representation. Each state is packed with a header
93	/// (containing the format of the state, the failure transition and, for
94	/// a sparse state, the number of transitions), its transitions and any
95	/// matching pattern IDs for match states.
96	repr: Vec<u32>,
97	/// The length of each pattern. This is used to compute the start offset
98	/// of a match.
99	pattern_lens: Vec<SmallIndex>,
100	/// The total number of states in this NFA.
101	state_len: usize,
102	/// A prefilter for accelerating searches, if one exists.
103	prefilter: Option<Prefilter>,
104	/// The match semantics built into this NFA.
105	match_kind: MatchKind,
106	/// The alphabet size, or total number of equivalence classes, for this
107	/// NFA. Dense states always have this many transitions.
108	alphabet_len: usize,
109	/// The equivalence classes for this NFA. All transitions, dense and
110	/// sparse, are defined on equivalence classes and not on the 256 distinct
111	/// byte values.
112	byte_classes: ByteClasses,
113	/// The length of the shortest pattern in this automaton.
114	min_pattern_len: usize,
115	/// The length of the longest pattern in this automaton.
116	max_pattern_len: usize,
117	/// The information required to deduce which states are "special" in this
118	/// NFA.
119	special: Special,
120	}
121
122	impl NFA {
123	/// Create a new Aho-Corasick contiguous NFA using the default
124	/// configuration.
125	///
126	/// Use a [`Builder`] if you want to change the configuration.
127	pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
128	where
129	I: IntoIterator<Item = P>,
130	P: AsRef<[u8]>,
131	{
132	NFA::builder().build(patterns)
133	}
134
135	/// A convenience method for returning a new Aho-Corasick contiguous NFA
136	/// builder.
137	///
138	/// This usually permits one to just import the `NFA` type.
139	pub fn builder() -> Builder {
140	Builder::new()
141	}
142	}
143
144	impl NFA {
145	/// A sentinel state ID indicating that a search should stop once it has
146	/// entered this state. When a search stops, it returns a match if one
147	/// has been found, otherwise no match. A contiguous NFA always has an
148	/// actual dead state at this ID.
149	const DEAD: StateID = StateID::new_unchecked(`0`);
150	/// Another sentinel state ID indicating that a search should move through
151	/// current state's failure transition.
152	///
153	/// Note that unlike DEAD, this does not actually point to a valid state
154	/// in a contiguous NFA. (noncontiguous::NFA::FAIL does point to a valid
155	/// state.) Instead, this points to the position that is guaranteed to
156	/// never be a valid state ID (by making sure it points to a place in the
157	/// middle of the encoding of the DEAD state). Since we never need to
158	/// actually look at the FAIL state itself, this works out.
159	///
160	/// By why do it this way? So that FAIL is a constant. I don't have any
161	/// concrete evidence that this materially helps matters, but it's easy to
162	/// do. The alternative would be making the FAIL ID point to the second
163	/// state, which could be made a constant but is a little trickier to do.
164	/// The easiest path is to just make the FAIL state a runtime value, but
165	/// since comparisons with FAIL occur in perf critical parts of the search,
166	/// we want it to be as tight as possible and not waste any registers.
167	///
168	/// Very hand wavy... But the code complexity that results from this is
169	/// very mild.
170	const FAIL: StateID = StateID::new_unchecked(`1`);
171	}
172
173	// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
174	// returns a valid state ID given a valid state ID. We otherwise claim that
175	// all other methods are correct as well.
176	unsafe impl Automaton for NFA {
177	#[inline(always)]
178	fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
179	match anchored {
180	Anchored::No => Ok(self.special.start_unanchored_id),
181	Anchored::Yes => Ok(self.special.start_anchored_id),
182	}
183	}
184
185	#[inline(always)]
186	fn next_state(
187	&self,
188	anchored: Anchored,
189	mut sid: StateID,
190	byte: u8,
191	) -> StateID {
192	let repr = &self.repr;
193	let class = self.byte_classes.get(byte);
194	let u32tosid = StateID::from_u32_unchecked;
195	loop {
196	let o = sid.as_usize();
197	let kind = repr[o] & `0xFF`;
198	// I tried to encapsulate the "next transition" logic into its own
199	// function, but it seemed to always result in sub-optimal codegen
200	// that led to real and significant slowdowns. So we just inline
201	// the logic here.
202	//
203	// I've also tried a lot of different ways to speed up this
204	// routine, and most of them have failed.
205	if kind == State::KIND_DENSE {
206	let next = u32tosid(repr[o + `2` + usize::from(class)]);
207	if next != NFA::FAIL {
208	return next;
209	}
210	} else if kind == State::KIND_ONE {
211	if class == repr[o].low_u16().high_u8() {
212	return u32tosid(repr[o + `2`]);
213	}
214	} else {
215	// NOTE: I tried a SWAR technique in the loop below, but found
216	// it slower. See the 'swar' test in the tests for this module.
217	let trans_len = kind.as_usize();
218	let classes_len = u32_len(trans_len);
219	let trans_offset = o + `2` + classes_len;
220	for (i, &chunk) in
221	repr[o + `2`..][..classes_len].iter().enumerate()
222	{
223	let classes = chunk.to_ne_bytes();
224	if classes[`0`] == class {
225	return u32tosid(repr[trans_offset + i * `4`]);
226	}
227	if classes[`1`] == class {
228	return u32tosid(repr[trans_offset + i * `4` + `1`]);
229	}
230	if classes[`2`] == class {
231	return u32tosid(repr[trans_offset + i * `4` + `2`]);
232	}
233	if classes[`3`] == class {
234	return u32tosid(repr[trans_offset + i * `4` + `3`]);
235	}
236	}
237	}
238	// For an anchored search, we never follow failure transitions
239	// because failure transitions lead us down a path to matching
240	// a proper* suffix of the path we were on. Thus, it can only*
241	// produce matches that appear after the beginning of the search.
242	if anchored.is_anchored() {
243	return NFA::DEAD;
244	}
245	sid = u32tosid(repr[o + `1`]);
246	}
247	}
248
249	#[inline(always)]
250	fn is_special(&self, sid: StateID) -> bool {
251	sid <= self.special.max_special_id
252	}
253
254	#[inline(always)]
255	fn is_dead(&self, sid: StateID) -> bool {
256	sid == NFA::DEAD
257	}
258
259	#[inline(always)]
260	fn is_match(&self, sid: StateID) -> bool {
261	!self.is_dead(sid) && sid <= self.special.max_match_id
262	}
263
264	#[inline(always)]
265	fn is_start(&self, sid: StateID) -> bool {
266	sid == self.special.start_unanchored_id
267	\|\| sid == self.special.start_anchored_id
268	}
269
270	#[inline(always)]
271	fn match_kind(&self) -> MatchKind {
272	self.match_kind
273	}
274
275	#[inline(always)]
276	fn patterns_len(&self) -> usize {
277	self.pattern_lens.len()
278	}
279
280	#[inline(always)]
281	fn pattern_len(&self, pid: PatternID) -> usize {
282	self.pattern_lens[pid].as_usize()
283	}
284
285	#[inline(always)]
286	fn min_pattern_len(&self) -> usize {
287	self.min_pattern_len
288	}
289
290	#[inline(always)]
291	fn max_pattern_len(&self) -> usize {
292	self.max_pattern_len
293	}
294
295	#[inline(always)]
296	fn match_len(&self, sid: StateID) -> usize {
297	State::match_len(self.alphabet_len, &self.repr[sid.as_usize()..])
298	}
299
300	#[inline(always)]
301	fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
302	State::match_pattern(
303	self.alphabet_len,
304	&self.repr[sid.as_usize()..],
305	index,
306	)
307	}
308
309	#[inline(always)]
310	fn memory_usage(&self) -> usize {
311	use core::mem::size_of;
312
313	(self.repr.len() * size_of::<u32>())
314	+ (self.pattern_lens.len() * size_of::<SmallIndex>())
315	+ self.prefilter.as_ref().map_or(`0`, \|p\| p.memory_usage())
316	}
317
318	#[inline(always)]
319	fn prefilter(&self) -> Option<&Prefilter> {
320	self.prefilter.as_ref()
321	}
322	}
323
324	impl core::fmt::Debug for NFA {
325	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
326	use crate::automaton::fmt_state_indicator;
327
328	writeln!(f, "contiguous::NFA(")?;
329	let mut sid = NFA::DEAD; // always the first state and always present
330	loop {
331	let raw = &self.repr[sid.as_usize()..];
332	if raw.is_empty() {
333	break;
334	}
335	let is_match = self.is_match(sid);
336	let state = State::read(self.alphabet_len, is_match, raw);
337	fmt_state_indicator(f, self, sid)?;
338	write!(
339	f,
340	"{:06}({:06}): ",
341	sid.as_usize(),
342	state.fail.as_usize()
343	)?;
344	state.fmt(f)?;
345	write!(f, "`\n`")?;
346	if self.is_match(sid) {
347	write!(f, " matches: ")?;
348	for i in `0`..state.match_len {
349	let pid = State::match_pattern(self.alphabet_len, raw, i);
350	if i > `0` {
351	write!(f, ", ")?;
352	}
353	write!(f, "{}", pid.as_usize())?;
354	}
355	write!(f, "`\n`")?;
356	}
357	// The FAIL state doesn't actually have space for a state allocated
358	// for it, so we have to treat it as a special case. write below
359	// the DEAD state.
360	if sid == NFA::DEAD {
361	writeln!(f, "F {:06}:", NFA::FAIL.as_usize())?;
362	}
363	let len = State::len(self.alphabet_len, is_match, raw);
364	sid = StateID::new(sid.as_usize().checked_add(len).unwrap())
365	.unwrap();
366	}
367	writeln!(f, "match kind: {:?}", self.match_kind)?;
368	writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
369	writeln!(f, "state length: {:?}", self.state_len)?;
370	writeln!(f, "pattern length: {:?}", self.patterns_len())?;
371	writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
372	writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
373	writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
374	writeln!(f, "byte classes: {:?}", self.byte_classes)?;
375	writeln!(f, "memory usage: {:?}", self.memory_usage())?;
376	writeln!(f, ")")?;
377
378	Ok(())
379	}
380	}
381
382	/// The "in memory" representation a single dense or sparse state.
383	///
384	/// A `State`'s in memory representation is not ever actually materialized
385	/// during a search with a contiguous NFA. Doing so would be too slow. (Indeed,
386	/// the only time a `State` is actually constructed is in `Debug` impls.)
387	/// Instead, a `State` exposes a number of static methods for reading certain
388	/// things from the raw binary encoding of the state.
389	#[derive(Clone)]
390	struct State<'a> {
391	/// The state to transition to when 'class_to_next' yields a transition
392	/// to the FAIL state.
393	fail: StateID,
394	/// The number of pattern IDs in this state. For a non-match state, this is
395	/// always zero. Otherwise it is always bigger than zero.
396	match_len: usize,
397	/// The sparse or dense representation of the transitions for this state.
398	trans: StateTrans<'a>,
399	}
400
401	/// The underlying representation of sparse or dense transitions for a state.
402	///
403	/// Note that like `State`, we don't typically construct values of this type
404	/// during a search since we don't always need all values and thus would
405	/// represent a lot of wasteful work.
406	#[derive(Clone)]
407	enum StateTrans<'a> {
408	/// A sparse representation of transitions for a state, where only non-FAIL
409	/// transitions are explicitly represented.
410	Sparse {
411	classes: &'a [u32],
412	/// The transitions for this state, where each transition is packed
413	/// into a u32. The low 8 bits correspond to the byte class for the
414	/// transition, and the high 24 bits correspond to the next state ID.
415	///
416	/// This packing is why the max state ID allowed for a contiguous
417	/// NFA is 2^24-1.
418	nexts: &'a [u32],
419	},
420	/// A "one transition" state that is never a match state.
421	///
422	/// These are by far the most common state, so we use a specialized and
423	/// very compact representation for them.
424	One {
425	/// The element of this NFA's alphabet that this transition is
426	/// defined for.
427	class: u8,
428	/// The state this should transition to if the current symbol is
429	/// equal to 'class'.
430	next: u32,
431	},
432	/// A dense representation of transitions for a state, where all
433	/// transitions are explicitly represented, including transitions to the
434	/// FAIL state.
435	Dense {
436	/// A dense set of transitions to other states. The transitions may
437	/// point to a FAIL state, in which case, the search should try the
438	/// same transition lookup at 'fail'.
439	///
440	/// Note that this is indexed by byte equivalence classes and not
441	/// byte values. That means 'class_to_next[byte]' is wrong and
442	/// 'class_to_next[classes.get(byte)]' is correct. The number of
443	/// transitions is always equivalent to 'classes.alphabet_len()'.
444	class_to_next: &'a [u32],
445	},
446	}
447
448	impl<'a> State<'a> {
449	/// The offset of where the "kind" of a state is stored. If it isn't one
450	/// of the sentinel values below, then it's a sparse state and the kind
451	/// corresponds to the number of transitions in the state.
452	const KIND: usize = `0`;
453
454	/// A sentinel value indicating that the state uses a dense representation.
455	const KIND_DENSE: u32 = `0xFF`;
456	/// A sentinel value indicating that the state uses a special "one
457	/// transition" encoding. In practice, non-match states with one transition
458	/// make up the overwhelming majority of all states in any given
459	/// Aho-Corasick automaton, so we can specialize them using a very compact
460	/// representation.
461	const KIND_ONE: u32 = `0xFE`;
462
463	/// The maximum number of transitions to encode as a sparse state. Usually
464	/// states with a lot of transitions are either very rare, or occur near
465	/// the start state. In the latter case, they are probably dense already
466	/// anyway. In the former case, making them dense is fine because they're
467	/// rare.
468	///
469	/// This needs to be small enough to permit each of the sentinel values for
470	/// 'KIND' above. Namely, a sparse state embeds the number of transitions
471	/// into the 'KIND'. Basically, "sparse" is a state kind too, but it's the
472	/// "else" branch.
473	///
474	/// N.B. There isn't anything particularly magical about 127 here. I
475	/// just picked it because I figured any sparse state with this many
476	/// transitions is going to be exceptionally rare, and if it did have this
477	/// many transitions, then it would be quite slow to do a linear scan on
478	/// the transitions during a search anyway.
479	const MAX_SPARSE_TRANSITIONS: usize = `127`;
480
481	/// Remap state IDs in-place.
482	///
483	/// `state` should be the the raw binary encoding of a state. (The start
484	/// of the slice must correspond to the start of the state, but the slice
485	/// may extend past the end of the encoding of the state.)
486	fn remap(
487	alphabet_len: usize,
488	old_to_new: &[StateID],
489	state: &mut [u32],
490	) -> Result<(), BuildError> {
491	let kind = State::kind(state);
492	if kind == State::KIND_DENSE {
493	state[`1`] = old_to_new[state[`1`].as_usize()].as_u32();
494	for next in state[`2`..][..alphabet_len].iter_mut() {
495	*next = old_to_new[next.as_usize()].as_u32();
496	}
497	} else if kind == State::KIND_ONE {
498	state[`1`] = old_to_new[state[`1`].as_usize()].as_u32();
499	state[`2`] = old_to_new[state[`2`].as_usize()].as_u32();
500	} else {
501	let trans_len = State::sparse_trans_len(state);
502	let classes_len = u32_len(trans_len);
503	state[`1`] = old_to_new[state[`1`].as_usize()].as_u32();
504	for next in state[`2` + classes_len..][..trans_len].iter_mut() {
505	*next = old_to_new[next.as_usize()].as_u32();
506	}
507	}
508	Ok(())
509	}
510
511	/// Returns the length, in number of u32s, of this state.
512	///
513	/// This is useful for reading states consecutively, e.g., in the Debug
514	/// impl without needing to store a separate map from state index to state
515	/// identifier.
516	///
517	/// `state` should be the the raw binary encoding of a state. (The start
518	/// of the slice must correspond to the start of the state, but the slice
519	/// may extend past the end of the encoding of the state.)
520	fn len(alphabet_len: usize, is_match: bool, state: &[u32]) -> usize {
521	let kind_len = `1`;
522	let fail_len = `1`;
523	let kind = State::kind(state);
524	let (classes_len, trans_len) = if kind == State::KIND_DENSE {
525	(`0`, alphabet_len)
526	} else if kind == State::KIND_ONE {
527	(`0`, `1`)
528	} else {
529	let trans_len = State::sparse_trans_len(state);
530	let classes_len = u32_len(trans_len);
531	(classes_len, trans_len)
532	};
533	let match_len = if !is_match {
534	`0`
535	} else if State::match_len(alphabet_len, state) == `1` {
536	// This is a special case because when there is one pattern ID for
537	// a match state, it is represented by a single u32 with its high
538	// bit set (which is impossible for a valid pattern ID).
539	`1`
540	} else {
541	// We add 1 to include the u32 that indicates the number of
542	// pattern IDs that follow.
543	`1` + State::match_len(alphabet_len, state)
544	};
545	kind_len + fail_len + classes_len + trans_len + match_len
546	}
547
548	/// Returns the kind of this state.
549	///
550	/// This only includes the low byte.
551	#[inline(always)]
552	fn kind(state: &[u32]) -> u32 {
553	state[State::KIND] & `0xFF`
554	}
555
556	/// Get the number of sparse transitions in this state. This can never
557	/// be more than State::MAX_SPARSE_TRANSITIONS, as all states with more
558	/// transitions are encoded as dense states.
559	///
560	/// `state` should be the the raw binary encoding of a sparse state. (The
561	/// start of the slice must correspond to the start of the state, but the
562	/// slice may extend past the end of the encoding of the state.) If this
563	/// isn't a sparse state, then the return value is unspecified.
564	///
565	/// Do note that this is only legal to call on a sparse state. So for
566	/// example, "one transition" state is not a sparse state, so it would not
567	/// be legal to call this method on such a state.
568	#[inline(always)]
569	fn sparse_trans_len(state: &[u32]) -> usize {
570	(state[State::KIND] & `0xFF`).as_usize()
571	}
572
573	/// Returns the total number of matching pattern IDs in this state. Calling
574	/// this on a state that isn't a match results in unspecified behavior.
575	/// Thus, the returned number is never 0 for all correct calls.
576	///
577	/// `state` should be the the raw binary encoding of a state. (The start
578	/// of the slice must correspond to the start of the state, but the slice
579	/// may extend past the end of the encoding of the state.)
580	#[inline(always)]
581	fn match_len(alphabet_len: usize, state: &[u32]) -> usize {
582	// We don't need to handle KIND_ONE here because it can never be a
583	// match state.
584	let packed = if State::kind(state) == State::KIND_DENSE {
585	let start = `2` + alphabet_len;
586	state[start].as_usize()
587	} else {
588	let trans_len = State::sparse_trans_len(state);
589	let classes_len = u32_len(trans_len);
590	let start = `2` + classes_len + trans_len;
591	state[start].as_usize()
592	};
593	if packed & (`1` << `31`) == `0` {
594	packed
595	} else {
596	`1`
597	}
598	}
599
600	/// Returns the pattern ID corresponding to the given index for the state
601	/// given. The `index` provided must be less than the number of pattern IDs
602	/// in this state.
603	///
604	/// `state` should be the the raw binary encoding of a state. (The start of
605	/// the slice must correspond to the start of the state, but the slice may
606	/// extend past the end of the encoding of the state.)
607	///
608	/// If the given state is not a match state or if the index is out of
609	/// bounds, then this has unspecified behavior.
610	#[inline(always)]
611	fn match_pattern(
612	alphabet_len: usize,
613	state: &[u32],
614	index: usize,
615	) -> PatternID {
616	// We don't need to handle KIND_ONE here because it can never be a
617	// match state.
618	let start = if State::kind(state) == State::KIND_DENSE {
619	`2` + alphabet_len
620	} else {
621	let trans_len = State::sparse_trans_len(state);
622	let classes_len = u32_len(trans_len);
623	`2` + classes_len + trans_len
624	};
625	let packed = state[start];
626	let pid = if packed & (`1` << `31`) == `0` {
627	state[start + `1` + index]
628	} else {
629	assert_eq!(`0`, index);
630	packed & !(`1` << `31`)
631	};
632	PatternID::from_u32_unchecked(pid)
633	}
634
635	/// Read a state's binary encoding to its in-memory representation.
636	///
637	/// `alphabet_len` should be the total number of transitions defined for
638	/// dense states.
639	///
640	/// `is_match` should be true if this state is a match state and false
641	/// otherwise.
642	///
643	/// `state` should be the the raw binary encoding of a state. (The start
644	/// of the slice must correspond to the start of the state, but the slice
645	/// may extend past the end of the encoding of the state.)
646	fn read(
647	alphabet_len: usize,
648	is_match: bool,
649	state: &'a [u32],
650	) -> State<'a> {
651	let kind = State::kind(state);
652	let match_len =
653	if !is_match { `0` } else { State::match_len(alphabet_len, state) };
654	let (trans, fail) = if kind == State::KIND_DENSE {
655	let fail = StateID::from_u32_unchecked(state[`1`]);
656	let class_to_next = &state[`2`..][..alphabet_len];
657	(StateTrans::Dense { class_to_next }, fail)
658	} else if kind == State::KIND_ONE {
659	let fail = StateID::from_u32_unchecked(state[`1`]);
660	let class = state[State::KIND].low_u16().high_u8();
661	let next = state[`2`];
662	(StateTrans::One { class, next }, fail)
663	} else {
664	let fail = StateID::from_u32_unchecked(state[`1`]);
665	let trans_len = State::sparse_trans_len(state);
666	let classes_len = u32_len(trans_len);
667	let classes = &state[`2`..][..classes_len];
668	let nexts = &state[`2` + classes_len..][..trans_len];
669	(StateTrans::Sparse { classes, nexts }, fail)
670	};
671	State { fail, match_len, trans }
672	}
673
674	/// Encode the "old" state from a noncontiguous NFA to its binary
675	/// representation to the given `dst` slice. `classes` should be the byte
676	/// classes computed for the noncontiguous NFA that the given state came
677	/// from.
678	///
679	/// This returns an error if `dst` became so big that `StateID`s can no
680	/// longer be created for new states. Otherwise, it returns the state ID of
681	/// the new state created.
682	///
683	/// When `force_dense` is true, then the encoded state will always use a
684	/// dense format. Otherwise, the choice between dense and sparse will be
685	/// automatically chosen based on the old state.
686	fn write(
687	nnfa: &noncontiguous::NFA,
688	oldsid: StateID,
689	old: &noncontiguous::State,
690	classes: &ByteClasses,
691	dst: &mut Vec<u32>,
692	force_dense: bool,
693	) -> Result<StateID, BuildError> {
694	let sid = StateID::new(dst.len()).map_err(\|e\| {
695	BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
696	})?;
697	let old_len = nnfa.iter_trans(oldsid).count();
698	// For states with a lot of transitions, we might as well just make
699	// them dense. These kinds of hot states tend to be very rare, so we're
700	// okay with it. This also gives us more sentinels in the state's
701	// 'kind', which lets us create different state kinds to save on
702	// space.
703	let kind = if force_dense \|\| old_len > State::MAX_SPARSE_TRANSITIONS {
704	State::KIND_DENSE
705	} else if old_len == `1` && !old.is_match() {
706	State::KIND_ONE
707	} else {
708	// For a sparse state, the kind is just the number of transitions.
709	u32::try_from(old_len).unwrap()
710	};
711	if kind == State::KIND_DENSE {
712	dst.push(kind);
713	dst.push(old.fail().as_u32());
714	State::write_dense_trans(nnfa, oldsid, classes, dst)?;
715	} else if kind == State::KIND_ONE {
716	let t = nnfa.iter_trans(oldsid).next().unwrap();
717	let class = u32::from(classes.get(t.byte()));
718	dst.push(kind \| (class << `8`));
719	dst.push(old.fail().as_u32());
720	dst.push(t.next().as_u32());
721	} else {
722	dst.push(kind);
723	dst.push(old.fail().as_u32());
724	State::write_sparse_trans(nnfa, oldsid, classes, dst)?;
725	}
726	// Now finally write the number of matches and the matches themselves.
727	if old.is_match() {
728	let matches_len = nnfa.iter_matches(oldsid).count();
729	if matches_len == `1` {
730	let pid = nnfa.iter_matches(oldsid).next().unwrap().as_u32();
731	assert_eq!(`0`, pid & (`1` << `31`));
732	dst.push((`1` << `31`) \| pid);
733	} else {
734	assert_eq!(`0`, matches_len & (`1` << `31`));
735	dst.push(matches_len.as_u32());
736	dst.extend(nnfa.iter_matches(oldsid).map(\|pid\| pid.as_u32()));
737	}
738	}
739	Ok(sid)
740	}
741
742	/// Encode the "old" state transitions from a noncontiguous NFA to its
743	/// binary sparse representation to the given `dst` slice. `classes` should
744	/// be the byte classes computed for the noncontiguous NFA that the given
745	/// state came from.
746	///
747	/// This returns an error if `dst` became so big that `StateID`s can no
748	/// longer be created for new states.
749	fn write_sparse_trans(
750	nnfa: &noncontiguous::NFA,
751	oldsid: StateID,
752	classes: &ByteClasses,
753	dst: &mut Vec<u32>,
754	) -> Result<(), BuildError> {
755	let (mut chunk, mut len) = ([`0`; `4`], `0`);
756	for t in nnfa.iter_trans(oldsid) {
757	chunk[len] = classes.get(t.byte());
758	len += `1`;
759	if len == `4` {
760	dst.push(u32::from_ne_bytes(chunk));
761	chunk = [`0`; `4`];
762	len = `0`;
763	}
764	}
765	if len > `0` {
766	// In the case where the number of transitions isn't divisible
767	// by 4, the last u32 chunk will have some left over room. In
768	// this case, we "just" repeat the last equivalence class. By
769	// doing this, we know the leftover faux transitions will never
770	// be followed because if they were, it would have been followed
771	// prior to it in the last equivalence class. This saves us some
772	// branching in the search time state transition code.
773	let repeat = chunk[len - `1`];
774	while len < `4` {
775	chunk[len] = repeat;
776	len += `1`;
777	}
778	dst.push(u32::from_ne_bytes(chunk));
779	}
780	for t in nnfa.iter_trans(oldsid) {
781	dst.push(t.next().as_u32());
782	}
783	Ok(())
784	}
785
786	/// Encode the "old" state transitions from a noncontiguous NFA to its
787	/// binary dense representation to the given `dst` slice. `classes` should
788	/// be the byte classes computed for the noncontiguous NFA that the given
789	/// state came from.
790	///
791	/// This returns an error if `dst` became so big that `StateID`s can no
792	/// longer be created for new states.
793	fn write_dense_trans(
794	nnfa: &noncontiguous::NFA,
795	oldsid: StateID,
796	classes: &ByteClasses,
797	dst: &mut Vec<u32>,
798	) -> Result<(), BuildError> {
799	// Our byte classes let us shrink the size of our dense states to the
800	// number of equivalence classes instead of just fixing it to 256.
801	// Any non-explicitly defined transition is just a transition to the
802	// FAIL state, so we fill that in first and then overwrite them with
803	// explicitly defined transitions. (Most states probably only have one
804	// or two explicitly defined transitions.)
805	//
806	// N.B. Remember that while building the contiguous NFA, we use state
807	// IDs from the noncontiguous NFA. It isn't until we've added all
808	// states that we go back and map noncontiguous IDs to contiguous IDs.
809	let start = dst.len();
810	dst.extend(
811	core::iter::repeat(noncontiguous::NFA::FAIL.as_u32())
812	.take(classes.alphabet_len()),
813	);
814	assert!(start < dst.len(), "equivalence classes are never empty");
815	for t in nnfa.iter_trans(oldsid) {
816	dst[start + usize::from(classes.get(t.byte()))] =
817	t.next().as_u32();
818	}
819	Ok(())
820	}
821
822	/// Return an iterator over every explicitly defined transition in this
823	/// state.
824	fn transitions<'b>(&'b self) -> impl Iterator<Item = (u8, StateID)> + 'b {
825	let mut i = `0`;
826	core::iter::from_fn(move \|\| match self.trans {
827	StateTrans::Sparse { classes, nexts } => {
828	if i >= nexts.len() {
829	return None;
830	}
831	let chunk = classes[i / `4`];
832	let class = chunk.to_ne_bytes()[i % `4`];
833	let next = StateID::from_u32_unchecked(nexts[i]);
834	i += `1`;
835	Some((class, next))
836	}
837	StateTrans::One { class, next } => {
838	if i == `0` {
839	i += `1`;
840	Some((class, StateID::from_u32_unchecked(next)))
841	} else {
842	None
843	}
844	}
845	StateTrans::Dense { class_to_next } => {
846	if i >= class_to_next.len() {
847	return None;
848	}
849	let class = i.as_u8();
850	let next = StateID::from_u32_unchecked(class_to_next[i]);
851	i += `1`;
852	Some((class, next))
853	}
854	})
855	}
856	}
857
858	impl<'a> core::fmt::Debug for State<'a> {
859	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
860	use crate::{automaton::sparse_transitions, util::debug::DebugByte};
861
862	let it = sparse_transitions(self.transitions())
863	// Writing out all FAIL transitions is quite noisy. Instead, we
864	// just require readers of the output to assume anything absent
865	// maps to the FAIL transition.
866	.filter(\|&(_, _, sid)\| sid != NFA::FAIL)
867	.enumerate();
868	for (i, (start, end, sid)) in it {
869	if i > `0` {
870	write!(f, ", ")?;
871	}
872	if start == end {
873	write!(f, "{:?} => {:?}", DebugByte(start), sid.as_usize())?;
874	} else {
875	write!(
876	f,
877	"{:?}-{:?} => {:?}",
878	DebugByte(start),
879	DebugByte(end),
880	sid.as_usize()
881	)?;
882	}
883	}
884	Ok(())
885	}
886	}
887
888	/// A builder for configuring an Aho-Corasick contiguous NFA.
889	///
890	/// This builder has a subset of the options available to a
891	/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
892	/// their behavior is identical.
893	#[derive(Clone, Debug)]
894	pub struct Builder {
895	noncontiguous: noncontiguous::Builder,
896	dense_depth: usize,
897	byte_classes: bool,
898	}
899
900	impl Default for Builder {
901	fn default() -> Builder {
902	Builder {
903	noncontiguous: noncontiguous::Builder::new(),
904	dense_depth: `2`,
905	byte_classes: `true`,
906	}
907	}
908	}
909
910	impl Builder {
911	/// Create a new builder for configuring an Aho-Corasick contiguous NFA.
912	pub fn new() -> Builder {
913	Builder::default()
914	}
915
916	/// Build an Aho-Corasick contiguous NFA from the given iterator of
917	/// patterns.
918	///
919	/// A builder may be reused to create more NFAs.
920	pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
921	where
922	I: IntoIterator<Item = P>,
923	P: AsRef<[u8]>,
924	{
925	let nnfa = self.noncontiguous.build(patterns)?;
926	self.build_from_noncontiguous(&nnfa)
927	}
928
929	/// Build an Aho-Corasick contiguous NFA from the given noncontiguous NFA.
930	///
931	/// Note that when this method is used, only the `dense_depth` and
932	/// `byte_classes` settings on this builder are respected. The other
933	/// settings only apply to the initial construction of the Aho-Corasick
934	/// automaton. Since using this method requires that initial construction
935	/// has already completed, all settings impacting only initial construction
936	/// are no longer relevant.
937	pub fn build_from_noncontiguous(
938	&self,
939	nnfa: &noncontiguous::NFA,
940	) -> Result<NFA, BuildError> {
941	debug!("building contiguous NFA");
942	let byte_classes = if self.byte_classes {
943	nnfa.byte_classes().clone()
944	} else {
945	ByteClasses::singletons()
946	};
947	let mut index_to_state_id = vec![NFA::DEAD; nnfa.states().len()];
948	let mut nfa = NFA {
949	repr: vec![],
950	pattern_lens: nnfa.pattern_lens_raw().to_vec(),
951	state_len: nnfa.states().len(),
952	prefilter: nnfa.prefilter().map(\|p\| p.clone()),
953	match_kind: nnfa.match_kind(),
954	alphabet_len: byte_classes.alphabet_len(),
955	byte_classes,
956	min_pattern_len: nnfa.min_pattern_len(),
957	max_pattern_len: nnfa.max_pattern_len(),
958	// The special state IDs are set later.
959	special: Special::zero(),
960	};
961	for (oldsid, state) in nnfa.states().iter().with_state_ids() {
962	// We don't actually encode a fail state since it isn't necessary.
963	// But we still want to make sure any FAIL ids are mapped
964	// correctly.
965	if oldsid == noncontiguous::NFA::FAIL {
966	index_to_state_id[oldsid] = NFA::FAIL;
967	continue;
968	}
969	let force_dense = state.depth().as_usize() < self.dense_depth;
970	let newsid = State::write(
971	nnfa,
972	oldsid,
973	state,
974	&nfa.byte_classes,
975	&mut nfa.repr,
976	force_dense,
977	)?;
978	index_to_state_id[oldsid] = newsid;
979	}
980	for &newsid in index_to_state_id.iter() {
981	if newsid == NFA::FAIL {
982	continue;
983	}
984	let state = &mut nfa.repr[newsid.as_usize()..];
985	State::remap(nfa.alphabet_len, &index_to_state_id, state)?;
986	}
987	// Now that we've remapped all the IDs in our states, all that's left
988	// is remapping the special state IDs.
989	let remap = &index_to_state_id;
990	let old = nnfa.special();
991	let new = &mut nfa.special;
992	new.max_special_id = remap[old.max_special_id];
993	new.max_match_id = remap[old.max_match_id];
994	new.start_unanchored_id = remap[old.start_unanchored_id];
995	new.start_anchored_id = remap[old.start_anchored_id];
996	debug!(
997	"contiguous NFA built, <states: {:?}, size: {:?}, \
998	alphabet len: {:?}>",
999	nfa.state_len,
1000	nfa.memory_usage(),
1001	nfa.byte_classes.alphabet_len(),
1002	);
1003	// The vectors can grow ~twice as big during construction because a
1004	// Vec amortizes growth. But here, let's shrink things back down to
1005	// what we actually need since we're never going to add more to it.
1006	nfa.repr.shrink_to_fit();
1007	nfa.pattern_lens.shrink_to_fit();
1008	Ok(nfa)
1009	}
1010
1011	/// Set the desired match semantics.
1012	///
1013	/// This only applies when using [`Builder::build`] and not
1014	/// [`Builder::build_from_noncontiguous`].
1015	///
1016	/// See
1017	/// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
1018	/// for more documentation and examples.
1019	pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
1020	self.noncontiguous.match_kind(kind);
1021	self
1022	}
1023
1024	/// Enable ASCII-aware case insensitive matching.
1025	///
1026	/// This only applies when using [`Builder::build`] and not
1027	/// [`Builder::build_from_noncontiguous`].
1028	///
1029	/// See
1030	/// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
1031	/// for more documentation and examples.
1032	pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
1033	self.noncontiguous.ascii_case_insensitive(yes);
1034	self
1035	}
1036
1037	/// Enable heuristic prefilter optimizations.
1038	///
1039	/// This only applies when using [`Builder::build`] and not
1040	/// [`Builder::build_from_noncontiguous`].
1041	///
1042	/// See
1043	/// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
1044	/// for more documentation and examples.
1045	pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
1046	self.noncontiguous.prefilter(yes);
1047	self
1048	}
1049
1050	/// Set the limit on how many states use a dense representation for their
1051	/// transitions. Other states will generally use a sparse representation.
1052	///
1053	/// See
1054	/// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
1055	/// for more documentation and examples.
1056	pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
1057	self.dense_depth = depth;
1058	self
1059	}
1060
1061	/// A debug setting for whether to attempt to shrink the size of the
1062	/// automaton's alphabet or not.
1063	///
1064	/// This should never be enabled unless you're debugging an automaton.
1065	/// Namely, disabling byte classes makes transitions easier to reason
1066	/// about, since they use the actual bytes instead of equivalence classes.
1067	/// Disabling this confers no performance benefit at search time.
1068	///
1069	/// See
1070	/// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
1071	/// for more documentation and examples.
1072	pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
1073	self.byte_classes = yes;
1074	self
1075	}
1076	}
1077
1078	/// Computes the number of u32 values needed to represent one byte per the
1079	/// number of transitions given.
1080	fn u32_len(ntrans: usize) -> usize {
1081	if ntrans % `4` == `0` {
1082	ntrans >> `2`
1083	} else {
1084	(ntrans >> `2`) + `1`
1085	}
1086	}
1087
1088	#[cfg(test)]
1089	mod tests {
1090	// This test demonstrates a SWAR technique I tried in the sparse transition
1091	// code inside of 'next_state'. Namely, sparse transitions work by
1092	// iterating over u32 chunks, with each chunk containing up to 4 classes
1093	// corresponding to 4 transitions. This SWAR technique lets us find a
1094	// matching transition without converting the u32 to a [u8; 4].
1095	//
1096	// It turned out to be a little slower unfortunately, which isn't too
1097	// surprising, since this is likely a throughput oriented optimization.
1098	// Loop unrolling doesn't really help us because the vast majority of
1099	// states have very few transitions.
1100	//
1101	// Anyway, this code was a little tricky to write, so I converted it to a
1102	// test in case someone figures out how to use it more effectively than
1103	// I could.
1104	//
1105	// (This also only works on little endian. So big endian would need to be
1106	// accounted for if we ever decided to use this I think.)
1107	#[cfg(target_endian = "little")]
1108	#[test]
1109	fn swar() {
1110	use super::*;
1111
1112	fn has_zero_byte(x: u32) -> u32 {
1113	const LO_U32: u32 = `0x01010101`;
1114	const HI_U32: u32 = `0x80808080`;
1115
1116	x.wrapping_sub(LO_U32) & !x & HI_U32
1117	}
1118
1119	fn broadcast(b: u8) -> u32 {
1120	(u32::from(b)) * (u32::MAX / `255`)
1121	}
1122
1123	fn index_of(x: u32) -> usize {
1124	let o =
1125	(((x - `1`) & `0x01010101`).wrapping_mul(`0x01010101`) >> `24`) - `1`;
1126	o.as_usize()
1127	}
1128
1129	let bytes: [u8; `4`] = [b'1', b'A', b'a', b'z'];
1130	let chunk = u32::from_ne_bytes(bytes);
1131
1132	let needle = broadcast(b'1');
1133	assert_eq!(`0`, index_of(has_zero_byte(needle ^ chunk)));
1134	let needle = broadcast(b'A');
1135	assert_eq!(`1`, index_of(has_zero_byte(needle ^ chunk)));
1136	let needle = broadcast(b'a');
1137	assert_eq!(`2`, index_of(has_zero_byte(needle ^ chunk)));
1138	let needle = broadcast(b'z');
1139	assert_eq!(`3`, index_of(has_zero_byte(needle ^ chunk)));
1140	}
1141	}
1142

Provided by KDAB

Definitions