automaton.rs source code [crates/aho-corasick-0.7.20/src/automaton.rs]

1	use crate::ahocorasick::MatchKind;
2	use crate::prefilter::{self, Candidate, Prefilter, PrefilterState};
3	use crate::state_id::{dead_id, fail_id, StateID};
4	use crate::Match;
5
6	// NOTE: This trait essentially started as a copy of the same trait from from
7	// regex-automata, with some wording changed since we use this trait for
8	// NFAs in addition to DFAs in this crate. Additionally, we do not export
9	// this trait. It's only used internally to reduce code duplication. The
10	// regex-automata crate needs to expose it because its Regex type is generic
11	// over implementations of this trait. In this crate, we encapsulate everything
12	// behind the AhoCorasick type.
13	//
14	// This trait is a bit of a mess, but it's not quite clear how to fix it.
15	// Basically, there are several competing concerns:
16	//
17	// We need performance, so everything effectively needs to get monomorphized.*
18	// There are several variations on searching Aho-Corasick automatons:*
19	// overlapping, standard and leftmost. Overlapping and standard are somewhat
20	// combined together below, but there is no real way to combine standard with
21	// leftmost. Namely, leftmost requires continuing a search even after a match
22	// is found, in order to correctly disambiguate a match.
23	// On top of that, sometimes callers want to know which state the automaton*
24	// is in after searching. This is principally useful for overlapping and
25	// stream searches. However, when callers don't care about this, we really
26	// do not want to be forced to compute it, since it sometimes requires extra
27	// work. Thus, there are effectively two copies of leftmost searching: one
28	// for tracking the state ID and one that doesn't. We should ideally do the
29	// same for standard searching, but my sanity stopped me.
30
31	// SAFETY RATIONALE: Previously, the code below went to some length to remove
32	// all bounds checks. This generally produced tighter assembly and lead to
33	// 20-50% improvements in micro-benchmarks on corpora made up of random
34	// characters. This somewhat makes sense, since the branch predictor is going
35	// to be at its worse on random text.
36	//
37	// However, using the aho-corasick-debug tool and manually benchmarking
38	// different inputs, the code with* bounds checks actually wound up being*
39	// slightly faster:
40	//
41	// $ cat input
42	// Sherlock Holmes
43	// John Watson
44	// Professor Moriarty
45	// Irene Adler
46	// Mary Watson
47	//
48	// $ aho-corasick-debug-safe \
49	// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
50	// pattern read time: 32.824µs
51	// automaton build time: 444.687µs
52	// automaton heap usage: 72392 bytes
53	// match count: 639
54	// count time: 1.809961702s
55	//
56	// $ aho-corasick-debug-master \
57	// input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa
58	// pattern read time: 31.425µs
59	// automaton build time: 317.434µs
60	// automaton heap usage: 72392 bytes
61	// match count: 639
62	// count time: 2.059157705s
63	//
64	// I was able to reproduce this result on two different machines (an i5 and
65	// an i7). Therefore, we go the route of safe code for now.
66
67	/// A trait describing the interface of an Aho-Corasick finite state machine.
68	///
69	/// Every automaton has exactly one fail state, one dead state and exactly one
70	/// start state. Generally, these correspond to the first, second and third
71	/// states, respectively. The dead state is always treated as a sentinel. That
72	/// is, no correct Aho-Corasick automaton will ever transition into the fail
73	/// state. The dead state, however, can be transitioned into, but only when
74	/// leftmost-first or leftmost-longest match semantics are enabled and only
75	/// when at least one match has been observed.
76	///
77	/// Every automaton also has one or more match states, such that
78	/// `Automaton::is_match_state(id)` returns `true` if and only if `id`
79	/// corresponds to a match state.
80	pub trait Automaton {
81	/// The representation used for state identifiers in this automaton.
82	///
83	/// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
84	type ID: StateID;
85
86	/// The type of matching that should be done.
87	fn match_kind(&self) -> &MatchKind;
88
89	/// Returns true if and only if this automaton uses anchored searches.
90	fn anchored(&self) -> bool;
91
92	/// An optional prefilter for quickly skipping to the next candidate match.
93	/// A prefilter must report at least every match, although it may report
94	/// positions that do not correspond to a match. That is, it must not allow
95	/// false negatives, but can allow false positives.
96	///
97	/// Currently, a prefilter only runs when the automaton is in the start
98	/// state. That is, the position reported by a prefilter should always
99	/// correspond to the start of a potential match.
100	fn prefilter(&self) -> Option<&dyn Prefilter>;
101
102	/// Return the identifier of this automaton's start state.
103	fn start_state(&self) -> Self::ID;
104
105	/// Returns true if and only if the given state identifier refers to a
106	/// valid state.
107	fn is_valid(&self, id: Self::ID) -> bool;
108
109	/// Returns true if and only if the given identifier corresponds to a match
110	/// state.
111	///
112	/// The state ID given must be valid, or else implementors may panic.
113	fn is_match_state(&self, id: Self::ID) -> bool;
114
115	/// Returns true if and only if the given identifier corresponds to a state
116	/// that is either the dead state or a match state.
117	///
118	/// Depending on the implementation of the automaton, this routine can
119	/// be used to save a branch in the core matching loop. Nevertheless,
120	/// `is_match_state(id) \|\| id == dead_id()` is always a valid
121	/// implementation. Indeed, this is the default implementation.
122	///
123	/// The state ID given must be valid, or else implementors may panic.
124	fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
125	id == dead_id() \|\| self.is_match_state(id)
126	}
127
128	/// If the given state is a match state, return the match corresponding
129	/// to the given match index. `end` must be the ending position of the
130	/// detected match. If no match exists or if `match_index` exceeds the
131	/// number of matches in this state, then `None` is returned.
132	///
133	/// The state ID given must be valid, or else implementors may panic.
134	///
135	/// If the given state ID is correct and if the `match_index` is less than
136	/// the number of matches for that state, then this is guaranteed to return
137	/// a match.
138	fn get_match(
139	&self,
140	id: Self::ID,
141	match_index: usize,
142	end: usize,
143	) -> Option<Match>;
144
145	/// Returns the number of matches for the given state. If the given state
146	/// is not a match state, then this returns 0.
147	///
148	/// The state ID given must be valid, or else implementors must panic.
149	fn match_count(&self, id: Self::ID) -> usize;
150
151	/// Given the current state that this automaton is in and the next input
152	/// byte, this method returns the identifier of the next state. The
153	/// identifier returned must always be valid and may never correspond to
154	/// the fail state. The returned identifier may, however, point to the
155	/// dead state.
156	///
157	/// This is not safe so that implementors may look up the next state
158	/// without memory safety checks such as bounds checks. As such, callers
159	/// must ensure that the given identifier corresponds to a valid automaton
160	/// state. Implementors must, in turn, ensure that this routine is safe for
161	/// all valid state identifiers and for all possible `u8` values.
162	fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
163
164	/// Like next_state, but debug_asserts that the underlying
165	/// implementation never returns a `fail_id()` for the next state.
166	fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID {
167	let next = self.next_state(current, input);
168	// We should never see a transition to the failure state.
169	debug_assert!(
170	next != fail_id(),
171	"automaton should never return fail_id for next state"
172	);
173	next
174	}
175
176	/// Execute a search using standard match semantics.
177	///
178	/// This can be used even when the automaton was constructed with leftmost
179	/// match semantics when you want to find the earliest possible match. This
180	/// can also be used as part of an overlapping search implementation.
181	///
182	/// N.B. This does not report a match if `state_id` is given as a matching
183	/// state. As such, this should not be used directly.
184	#[inline(always)]
185	fn standard_find_at(
186	&self,
187	prestate: &mut PrefilterState,
188	haystack: &[u8],
189	at: usize,
190	state_id: &mut Self::ID,
191	) -> Option<Match> {
192	if let Some(pre) = self.prefilter() {
193	self.standard_find_at_imp(
194	prestate,
195	Some(pre),
196	haystack,
197	at,
198	state_id,
199	)
200	} else {
201	self.standard_find_at_imp(prestate, None, haystack, at, state_id)
202	}
203	}
204
205	// It's important for this to always be inlined. Namely, its only caller
206	// is standard_find_at, and the inlining should remove the case analysis
207	// for prefilter scanning when there is no prefilter available.
208	#[inline(always)]
209	fn standard_find_at_imp(
210	&self,
211	prestate: &mut PrefilterState,
212	prefilter: Option<&dyn Prefilter>,
213	haystack: &[u8],
214	mut at: usize,
215	state_id: &mut Self::ID,
216	) -> Option<Match> {
217	while at < haystack.len() {
218	if let Some(pre) = prefilter {
219	if prestate.is_effective(at) && *state_id == self.start_state()
220	{
221	let c = prefilter::next(prestate, pre, haystack, at)
222	.into_option();
223	match c {
224	None => return None,
225	Some(i) => {
226	at = i;
227	}
228	}
229	}
230	}
231	// CORRECTNESS: next_state is correct for all possible u8 values,
232	// so the only thing we're concerned about is the validity of
233	// `state_id`. `state_id` either comes from the caller (in which
234	// case, we assume it is correct), or it comes from the return
235	// value of next_state, which is guaranteed to be correct.
236	state_id = self.next_state_no_fail(state_id, haystack[at]);
237	at += `1`;
238	// This routine always quits immediately after seeing a
239	// match, and since dead states can only come after seeing
240	// a match, seeing a dead state here is impossible. (Unless
241	// we have an anchored automaton, in which case, dead states
242	// are used to stop a search.)
243	debug_assert!(
244	*state_id != dead_id() \|\| self.anchored(),
245	"standard find should never see a dead state"
246	);
247
248	if self.is_match_or_dead_state(*state_id) {
249	return if *state_id == dead_id() {
250	None
251	} else {
252	self.get_match(*state_id, `0`, at)
253	};
254	}
255	}
256	None
257	}
258
259	/// Execute a search using leftmost (either first or longest) match
260	/// semantics.
261	///
262	/// The principle difference between searching with standard semantics and
263	/// searching with leftmost semantics is that leftmost searching will
264	/// continue searching even after a match has been found. Once a match
265	/// is found, the search does not stop until either the haystack has been
266	/// exhausted or a dead state is observed in the automaton. (Dead states
267	/// only exist in automatons constructed with leftmost semantics.) That is,
268	/// we rely on the construction of the automaton to tell us when to quit.
269	#[inline(never)]
270	fn leftmost_find_at(
271	&self,
272	prestate: &mut PrefilterState,
273	haystack: &[u8],
274	at: usize,
275	state_id: &mut Self::ID,
276	) -> Option<Match> {
277	if let Some(pre) = self.prefilter() {
278	self.leftmost_find_at_imp(
279	prestate,
280	Some(pre),
281	haystack,
282	at,
283	state_id,
284	)
285	} else {
286	self.leftmost_find_at_imp(prestate, None, haystack, at, state_id)
287	}
288	}
289
290	// It's important for this to always be inlined. Namely, its only caller
291	// is leftmost_find_at, and the inlining should remove the case analysis
292	// for prefilter scanning when there is no prefilter available.
293	#[inline(always)]
294	fn leftmost_find_at_imp(
295	&self,
296	prestate: &mut PrefilterState,
297	prefilter: Option<&dyn Prefilter>,
298	haystack: &[u8],
299	mut at: usize,
300	state_id: &mut Self::ID,
301	) -> Option<Match> {
302	debug_assert!(self.match_kind().is_leftmost());
303	if self.anchored() && at > `0` && *state_id == self.start_state() {
304	return None;
305	}
306	let mut last_match = self.get_match(*state_id, `0`, at);
307	while at < haystack.len() {
308	if let Some(pre) = prefilter {
309	if prestate.is_effective(at) && *state_id == self.start_state()
310	{
311	let c = prefilter::next(prestate, pre, haystack, at)
312	.into_option();
313	match c {
314	None => return None,
315	Some(i) => {
316	at = i;
317	}
318	}
319	}
320	}
321	// CORRECTNESS: next_state is correct for all possible u8 values,
322	// so the only thing we're concerned about is the validity of
323	// `state_id`. `state_id` either comes from the caller (in which
324	// case, we assume it is correct), or it comes from the return
325	// value of next_state, which is guaranteed to be correct.
326	state_id = self.next_state_no_fail(state_id, haystack[at]);
327	at += `1`;
328	if self.is_match_or_dead_state(*state_id) {
329	if *state_id == dead_id() {
330	// The only way to enter into a dead state is if a match
331	// has been found, so we assert as much. This is different
332	// from normal automata, where you might enter a dead state
333	// if you know a subsequent match will never be found
334	// (regardless of whether a match has already been found).
335	// For Aho-Corasick, it is built so that we can match at
336	// any position, so the possibility of a match always
337	// exists.
338	//
339	// (Unless we have an anchored automaton, in which case,
340	// dead states are used to stop a search.)
341	debug_assert!(
342	last_match.is_some() \|\| self.anchored(),
343	"dead state should only be seen after match"
344	);
345	return last_match;
346	}
347	last_match = self.get_match(*state_id, `0`, at);
348	}
349	}
350	last_match
351	}
352
353	/// This is like leftmost_find_at, but does not need to track a caller
354	/// provided state id. In other words, the only output of this routine is a
355	/// match, if one exists.
356	///
357	/// It is regrettable that we need to effectively copy a chunk of
358	/// implementation twice, but when we don't need to track the state ID, we
359	/// can allow the prefilter to report matches immediately without having
360	/// to re-confirm them with the automaton. The re-confirmation step is
361	/// necessary in leftmost_find_at because tracing through the automaton is
362	/// the only way to correctly set the state ID. (Perhaps an alternative
363	/// would be to keep a map from pattern ID to matching state ID, but that
364	/// complicates the code and still doesn't permit us to defer to the
365	/// prefilter entirely when possible.)
366	///
367	/// I did try a few things to avoid the code duplication here, but nothing
368	/// optimized as well as this approach. (In microbenchmarks, there was
369	/// about a 25% difference.)
370	#[inline(never)]
371	fn leftmost_find_at_no_state(
372	&self,
373	prestate: &mut PrefilterState,
374	haystack: &[u8],
375	at: usize,
376	) -> Option<Match> {
377	if let Some(pre) = self.prefilter() {
378	self.leftmost_find_at_no_state_imp(
379	prestate,
380	Some(pre),
381	haystack,
382	at,
383	)
384	} else {
385	self.leftmost_find_at_no_state_imp(prestate, None, haystack, at)
386	}
387	}
388
389	// It's important for this to always be inlined. Namely, its only caller
390	// is leftmost_find_at_no_state, and the inlining should remove the case
391	// analysis for prefilter scanning when there is no prefilter available.
392	#[inline(always)]
393	fn leftmost_find_at_no_state_imp(
394	&self,
395	prestate: &mut PrefilterState,
396	prefilter: Option<&dyn Prefilter>,
397	haystack: &[u8],
398	mut at: usize,
399	) -> Option<Match> {
400	debug_assert!(self.match_kind().is_leftmost());
401	if self.anchored() && at > `0` {
402	return None;
403	}
404	// If our prefilter handles confirmation of matches 100% of the
405	// time, and since we don't need to track state IDs, we can avoid
406	// Aho-Corasick completely.
407	if let Some(pre) = prefilter {
408	// We should never have a prefilter during an anchored search.
409	debug_assert!(!self.anchored());
410	if !pre.reports_false_positives() {
411	return match pre.next_candidate(prestate, haystack, at) {
412	Candidate::None => None,
413	Candidate::Match(m) => Some(m),
414	Candidate::PossibleStartOfMatch(_) => unreachable!(),
415	};
416	}
417	}
418
419	let mut state_id = self.start_state();
420	let mut last_match = self.get_match(state_id, `0`, at);
421	while at < haystack.len() {
422	if let Some(pre) = prefilter {
423	if prestate.is_effective(at) && state_id == self.start_state()
424	{
425	match prefilter::next(prestate, pre, haystack, at) {
426	Candidate::None => return None,
427	// Since we aren't tracking a state ID, we can
428	// quit early once we know we have a match.
429	Candidate::Match(m) => return Some(m),
430	Candidate::PossibleStartOfMatch(i) => {
431	at = i;
432	}
433	}
434	}
435	}
436	// CORRECTNESS: next_state is correct for all possible u8 values,
437	// so the only thing we're concerned about is the validity of
438	// `state_id`. `state_id` either comes from the caller (in which
439	// case, we assume it is correct), or it comes from the return
440	// value of next_state, which is guaranteed to be correct.
441	state_id = self.next_state_no_fail(state_id, haystack[at]);
442	at += `1`;
443	if self.is_match_or_dead_state(state_id) {
444	if state_id == dead_id() {
445	// The only way to enter into a dead state is if a
446	// match has been found, so we assert as much. This
447	// is different from normal automata, where you might
448	// enter a dead state if you know a subsequent match
449	// will never be found (regardless of whether a match
450	// has already been found). For Aho-Corasick, it is
451	// built so that we can match at any position, so the
452	// possibility of a match always exists.
453	//
454	// (Unless we have an anchored automaton, in which
455	// case, dead states are used to stop a search.)
456	debug_assert!(
457	last_match.is_some() \|\| self.anchored(),
458	"dead state should only be seen after match"
459	);
460	return last_match;
461	}
462	last_match = self.get_match(state_id, `0`, at);
463	}
464	}
465	last_match
466	}
467
468	/// Execute an overlapping search.
469	///
470	/// When executing an overlapping match, the previous state ID in addition
471	/// to the previous match index should be given. If there are more matches
472	/// at the given state, then the match is reported and the given index is
473	/// incremented.
474	#[inline(always)]
475	fn overlapping_find_at(
476	&self,
477	prestate: &mut PrefilterState,
478	haystack: &[u8],
479	at: usize,
480	state_id: &mut Self::ID,
481	match_index: &mut usize,
482	) -> Option<Match> {
483	if self.anchored() && at > `0` && *state_id == self.start_state() {
484	return None;
485	}
486
487	let match_count = self.match_count(*state_id);
488	if *match_index < match_count {
489	// This is guaranteed to return a match since
490	// match_index < match_count.
491	let result = self.get_match(state_id, match_index, at);
492	debug_assert!(result.is_some(), "must be a match");
493	*match_index += `1`;
494	return result;
495	}
496
497	*match_index = `0`;
498	match self.standard_find_at(prestate, haystack, at, state_id) {
499	None => None,
500	Some(m) => {
501	*match_index = `1`;
502	Some(m)
503	}
504	}
505	}
506
507	/// Return the earliest match found. This returns as soon as we know that
508	/// we have a match. As such, this does not necessarily correspond to the
509	/// leftmost starting match, but rather, the leftmost position at which a
510	/// match ends.
511	#[inline(always)]
512	fn earliest_find_at(
513	&self,
514	prestate: &mut PrefilterState,
515	haystack: &[u8],
516	at: usize,
517	state_id: &mut Self::ID,
518	) -> Option<Match> {
519	if *state_id == self.start_state() {
520	if self.anchored() && at > `0` {
521	return None;
522	}
523	if let Some(m) = self.get_match(*state_id, `0`, at) {
524	return Some(m);
525	}
526	}
527	self.standard_find_at(prestate, haystack, at, state_id)
528	}
529
530	/// A convenience function for finding the next match according to the
531	/// match semantics of this automaton. For standard match semantics, this
532	/// finds the earliest match. Otherwise, the leftmost match is found.
533	#[inline(always)]
534	fn find_at(
535	&self,
536	prestate: &mut PrefilterState,
537	haystack: &[u8],
538	at: usize,
539	state_id: &mut Self::ID,
540	) -> Option<Match> {
541	match *self.match_kind() {
542	MatchKind::Standard => {
543	self.earliest_find_at(prestate, haystack, at, state_id)
544	}
545	MatchKind::LeftmostFirst \| MatchKind::LeftmostLongest => {
546	self.leftmost_find_at(prestate, haystack, at, state_id)
547	}
548	MatchKind::__Nonexhaustive => unreachable!(),
549	}
550	}
551
552	/// Like find_at, but does not track state identifiers. This permits some
553	/// optimizations when a prefilter that confirms its own matches is
554	/// present.
555	#[inline(always)]
556	fn find_at_no_state(
557	&self,
558	prestate: &mut PrefilterState,
559	haystack: &[u8],
560	at: usize,
561	) -> Option<Match> {
562	match *self.match_kind() {
563	MatchKind::Standard => {
564	let mut state = self.start_state();
565	self.earliest_find_at(prestate, haystack, at, &mut state)
566	}
567	MatchKind::LeftmostFirst \| MatchKind::LeftmostLongest => {
568	self.leftmost_find_at_no_state(prestate, haystack, at)
569	}
570	MatchKind::__Nonexhaustive => unreachable!(),
571	}
572	}
573	}
574