1 | use crate::ahocorasick::MatchKind; |
2 | use crate::prefilter::{self, Candidate, Prefilter, PrefilterState}; |
3 | use crate::state_id::{dead_id, fail_id, StateID}; |
4 | use crate::Match; |
5 | |
6 | // NOTE: This trait essentially started as a copy of the same trait from from |
7 | // regex-automata, with some wording changed since we use this trait for |
8 | // NFAs in addition to DFAs in this crate. Additionally, we do not export |
9 | // this trait. It's only used internally to reduce code duplication. The |
10 | // regex-automata crate needs to expose it because its Regex type is generic |
11 | // over implementations of this trait. In this crate, we encapsulate everything |
12 | // behind the AhoCorasick type. |
13 | // |
14 | // This trait is a bit of a mess, but it's not quite clear how to fix it. |
15 | // Basically, there are several competing concerns: |
16 | // |
17 | // * We need performance, so everything effectively needs to get monomorphized. |
18 | // * There are several variations on searching Aho-Corasick automatons: |
19 | // overlapping, standard and leftmost. Overlapping and standard are somewhat |
20 | // combined together below, but there is no real way to combine standard with |
21 | // leftmost. Namely, leftmost requires continuing a search even after a match |
22 | // is found, in order to correctly disambiguate a match. |
23 | // * On top of that, *sometimes* callers want to know which state the automaton |
24 | // is in after searching. This is principally useful for overlapping and |
25 | // stream searches. However, when callers don't care about this, we really |
26 | // do not want to be forced to compute it, since it sometimes requires extra |
27 | // work. Thus, there are effectively two copies of leftmost searching: one |
28 | // for tracking the state ID and one that doesn't. We should ideally do the |
29 | // same for standard searching, but my sanity stopped me. |
30 | |
31 | // SAFETY RATIONALE: Previously, the code below went to some length to remove |
32 | // all bounds checks. This generally produced tighter assembly and lead to |
33 | // 20-50% improvements in micro-benchmarks on corpora made up of random |
34 | // characters. This somewhat makes sense, since the branch predictor is going |
35 | // to be at its worse on random text. |
36 | // |
37 | // However, using the aho-corasick-debug tool and manually benchmarking |
38 | // different inputs, the code *with* bounds checks actually wound up being |
39 | // slightly faster: |
40 | // |
41 | // $ cat input |
42 | // Sherlock Holmes |
43 | // John Watson |
44 | // Professor Moriarty |
45 | // Irene Adler |
46 | // Mary Watson |
47 | // |
48 | // $ aho-corasick-debug-safe \ |
49 | // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa |
50 | // pattern read time: 32.824µs |
51 | // automaton build time: 444.687µs |
52 | // automaton heap usage: 72392 bytes |
53 | // match count: 639 |
54 | // count time: 1.809961702s |
55 | // |
56 | // $ aho-corasick-debug-master \ |
57 | // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa |
58 | // pattern read time: 31.425µs |
59 | // automaton build time: 317.434µs |
60 | // automaton heap usage: 72392 bytes |
61 | // match count: 639 |
62 | // count time: 2.059157705s |
63 | // |
64 | // I was able to reproduce this result on two different machines (an i5 and |
65 | // an i7). Therefore, we go the route of safe code for now. |
66 | |
67 | /// A trait describing the interface of an Aho-Corasick finite state machine. |
68 | /// |
69 | /// Every automaton has exactly one fail state, one dead state and exactly one |
70 | /// start state. Generally, these correspond to the first, second and third |
71 | /// states, respectively. The dead state is always treated as a sentinel. That |
72 | /// is, no correct Aho-Corasick automaton will ever transition into the fail |
73 | /// state. The dead state, however, can be transitioned into, but only when |
74 | /// leftmost-first or leftmost-longest match semantics are enabled and only |
75 | /// when at least one match has been observed. |
76 | /// |
77 | /// Every automaton also has one or more match states, such that |
78 | /// `Automaton::is_match_state(id)` returns `true` if and only if `id` |
79 | /// corresponds to a match state. |
80 | pub trait Automaton { |
81 | /// The representation used for state identifiers in this automaton. |
82 | /// |
83 | /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. |
84 | type ID: StateID; |
85 | |
86 | /// The type of matching that should be done. |
87 | fn match_kind(&self) -> &MatchKind; |
88 | |
89 | /// Returns true if and only if this automaton uses anchored searches. |
90 | fn anchored(&self) -> bool; |
91 | |
92 | /// An optional prefilter for quickly skipping to the next candidate match. |
93 | /// A prefilter must report at least every match, although it may report |
94 | /// positions that do not correspond to a match. That is, it must not allow |
95 | /// false negatives, but can allow false positives. |
96 | /// |
97 | /// Currently, a prefilter only runs when the automaton is in the start |
98 | /// state. That is, the position reported by a prefilter should always |
99 | /// correspond to the start of a potential match. |
100 | fn prefilter(&self) -> Option<&dyn Prefilter>; |
101 | |
102 | /// Return the identifier of this automaton's start state. |
103 | fn start_state(&self) -> Self::ID; |
104 | |
105 | /// Returns true if and only if the given state identifier refers to a |
106 | /// valid state. |
107 | fn is_valid(&self, id: Self::ID) -> bool; |
108 | |
109 | /// Returns true if and only if the given identifier corresponds to a match |
110 | /// state. |
111 | /// |
112 | /// The state ID given must be valid, or else implementors may panic. |
113 | fn is_match_state(&self, id: Self::ID) -> bool; |
114 | |
115 | /// Returns true if and only if the given identifier corresponds to a state |
116 | /// that is either the dead state or a match state. |
117 | /// |
118 | /// Depending on the implementation of the automaton, this routine can |
119 | /// be used to save a branch in the core matching loop. Nevertheless, |
120 | /// `is_match_state(id) || id == dead_id()` is always a valid |
121 | /// implementation. Indeed, this is the default implementation. |
122 | /// |
123 | /// The state ID given must be valid, or else implementors may panic. |
124 | fn is_match_or_dead_state(&self, id: Self::ID) -> bool { |
125 | id == dead_id() || self.is_match_state(id) |
126 | } |
127 | |
128 | /// If the given state is a match state, return the match corresponding |
129 | /// to the given match index. `end` must be the ending position of the |
130 | /// detected match. If no match exists or if `match_index` exceeds the |
131 | /// number of matches in this state, then `None` is returned. |
132 | /// |
133 | /// The state ID given must be valid, or else implementors may panic. |
134 | /// |
135 | /// If the given state ID is correct and if the `match_index` is less than |
136 | /// the number of matches for that state, then this is guaranteed to return |
137 | /// a match. |
138 | fn get_match( |
139 | &self, |
140 | id: Self::ID, |
141 | match_index: usize, |
142 | end: usize, |
143 | ) -> Option<Match>; |
144 | |
145 | /// Returns the number of matches for the given state. If the given state |
146 | /// is not a match state, then this returns 0. |
147 | /// |
148 | /// The state ID given must be valid, or else implementors must panic. |
149 | fn match_count(&self, id: Self::ID) -> usize; |
150 | |
151 | /// Given the current state that this automaton is in and the next input |
152 | /// byte, this method returns the identifier of the next state. The |
153 | /// identifier returned must always be valid and may never correspond to |
154 | /// the fail state. The returned identifier may, however, point to the |
155 | /// dead state. |
156 | /// |
157 | /// This is not safe so that implementors may look up the next state |
158 | /// without memory safety checks such as bounds checks. As such, callers |
159 | /// must ensure that the given identifier corresponds to a valid automaton |
160 | /// state. Implementors must, in turn, ensure that this routine is safe for |
161 | /// all valid state identifiers and for all possible `u8` values. |
162 | fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; |
163 | |
164 | /// Like next_state, but debug_asserts that the underlying |
165 | /// implementation never returns a `fail_id()` for the next state. |
166 | fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { |
167 | let next = self.next_state(current, input); |
168 | // We should never see a transition to the failure state. |
169 | debug_assert!( |
170 | next != fail_id(), |
171 | "automaton should never return fail_id for next state" |
172 | ); |
173 | next |
174 | } |
175 | |
176 | /// Execute a search using standard match semantics. |
177 | /// |
178 | /// This can be used even when the automaton was constructed with leftmost |
179 | /// match semantics when you want to find the earliest possible match. This |
180 | /// can also be used as part of an overlapping search implementation. |
181 | /// |
182 | /// N.B. This does not report a match if `state_id` is given as a matching |
183 | /// state. As such, this should not be used directly. |
184 | #[inline (always)] |
185 | fn standard_find_at( |
186 | &self, |
187 | prestate: &mut PrefilterState, |
188 | haystack: &[u8], |
189 | at: usize, |
190 | state_id: &mut Self::ID, |
191 | ) -> Option<Match> { |
192 | if let Some(pre) = self.prefilter() { |
193 | self.standard_find_at_imp( |
194 | prestate, |
195 | Some(pre), |
196 | haystack, |
197 | at, |
198 | state_id, |
199 | ) |
200 | } else { |
201 | self.standard_find_at_imp(prestate, None, haystack, at, state_id) |
202 | } |
203 | } |
204 | |
205 | // It's important for this to always be inlined. Namely, its only caller |
206 | // is standard_find_at, and the inlining should remove the case analysis |
207 | // for prefilter scanning when there is no prefilter available. |
208 | #[inline (always)] |
209 | fn standard_find_at_imp( |
210 | &self, |
211 | prestate: &mut PrefilterState, |
212 | prefilter: Option<&dyn Prefilter>, |
213 | haystack: &[u8], |
214 | mut at: usize, |
215 | state_id: &mut Self::ID, |
216 | ) -> Option<Match> { |
217 | while at < haystack.len() { |
218 | if let Some(pre) = prefilter { |
219 | if prestate.is_effective(at) && *state_id == self.start_state() |
220 | { |
221 | let c = prefilter::next(prestate, pre, haystack, at) |
222 | .into_option(); |
223 | match c { |
224 | None => return None, |
225 | Some(i) => { |
226 | at = i; |
227 | } |
228 | } |
229 | } |
230 | } |
231 | // CORRECTNESS: next_state is correct for all possible u8 values, |
232 | // so the only thing we're concerned about is the validity of |
233 | // `state_id`. `state_id` either comes from the caller (in which |
234 | // case, we assume it is correct), or it comes from the return |
235 | // value of next_state, which is guaranteed to be correct. |
236 | *state_id = self.next_state_no_fail(*state_id, haystack[at]); |
237 | at += 1; |
238 | // This routine always quits immediately after seeing a |
239 | // match, and since dead states can only come after seeing |
240 | // a match, seeing a dead state here is impossible. (Unless |
241 | // we have an anchored automaton, in which case, dead states |
242 | // are used to stop a search.) |
243 | debug_assert!( |
244 | *state_id != dead_id() || self.anchored(), |
245 | "standard find should never see a dead state" |
246 | ); |
247 | |
248 | if self.is_match_or_dead_state(*state_id) { |
249 | return if *state_id == dead_id() { |
250 | None |
251 | } else { |
252 | self.get_match(*state_id, 0, at) |
253 | }; |
254 | } |
255 | } |
256 | None |
257 | } |
258 | |
259 | /// Execute a search using leftmost (either first or longest) match |
260 | /// semantics. |
261 | /// |
262 | /// The principle difference between searching with standard semantics and |
263 | /// searching with leftmost semantics is that leftmost searching will |
264 | /// continue searching even after a match has been found. Once a match |
265 | /// is found, the search does not stop until either the haystack has been |
266 | /// exhausted or a dead state is observed in the automaton. (Dead states |
267 | /// only exist in automatons constructed with leftmost semantics.) That is, |
268 | /// we rely on the construction of the automaton to tell us when to quit. |
269 | #[inline (never)] |
270 | fn leftmost_find_at( |
271 | &self, |
272 | prestate: &mut PrefilterState, |
273 | haystack: &[u8], |
274 | at: usize, |
275 | state_id: &mut Self::ID, |
276 | ) -> Option<Match> { |
277 | if let Some(pre) = self.prefilter() { |
278 | self.leftmost_find_at_imp( |
279 | prestate, |
280 | Some(pre), |
281 | haystack, |
282 | at, |
283 | state_id, |
284 | ) |
285 | } else { |
286 | self.leftmost_find_at_imp(prestate, None, haystack, at, state_id) |
287 | } |
288 | } |
289 | |
290 | // It's important for this to always be inlined. Namely, its only caller |
291 | // is leftmost_find_at, and the inlining should remove the case analysis |
292 | // for prefilter scanning when there is no prefilter available. |
293 | #[inline (always)] |
294 | fn leftmost_find_at_imp( |
295 | &self, |
296 | prestate: &mut PrefilterState, |
297 | prefilter: Option<&dyn Prefilter>, |
298 | haystack: &[u8], |
299 | mut at: usize, |
300 | state_id: &mut Self::ID, |
301 | ) -> Option<Match> { |
302 | debug_assert!(self.match_kind().is_leftmost()); |
303 | if self.anchored() && at > 0 && *state_id == self.start_state() { |
304 | return None; |
305 | } |
306 | let mut last_match = self.get_match(*state_id, 0, at); |
307 | while at < haystack.len() { |
308 | if let Some(pre) = prefilter { |
309 | if prestate.is_effective(at) && *state_id == self.start_state() |
310 | { |
311 | let c = prefilter::next(prestate, pre, haystack, at) |
312 | .into_option(); |
313 | match c { |
314 | None => return None, |
315 | Some(i) => { |
316 | at = i; |
317 | } |
318 | } |
319 | } |
320 | } |
321 | // CORRECTNESS: next_state is correct for all possible u8 values, |
322 | // so the only thing we're concerned about is the validity of |
323 | // `state_id`. `state_id` either comes from the caller (in which |
324 | // case, we assume it is correct), or it comes from the return |
325 | // value of next_state, which is guaranteed to be correct. |
326 | *state_id = self.next_state_no_fail(*state_id, haystack[at]); |
327 | at += 1; |
328 | if self.is_match_or_dead_state(*state_id) { |
329 | if *state_id == dead_id() { |
330 | // The only way to enter into a dead state is if a match |
331 | // has been found, so we assert as much. This is different |
332 | // from normal automata, where you might enter a dead state |
333 | // if you know a subsequent match will never be found |
334 | // (regardless of whether a match has already been found). |
335 | // For Aho-Corasick, it is built so that we can match at |
336 | // any position, so the possibility of a match always |
337 | // exists. |
338 | // |
339 | // (Unless we have an anchored automaton, in which case, |
340 | // dead states are used to stop a search.) |
341 | debug_assert!( |
342 | last_match.is_some() || self.anchored(), |
343 | "dead state should only be seen after match" |
344 | ); |
345 | return last_match; |
346 | } |
347 | last_match = self.get_match(*state_id, 0, at); |
348 | } |
349 | } |
350 | last_match |
351 | } |
352 | |
353 | /// This is like leftmost_find_at, but does not need to track a caller |
354 | /// provided state id. In other words, the only output of this routine is a |
355 | /// match, if one exists. |
356 | /// |
357 | /// It is regrettable that we need to effectively copy a chunk of |
358 | /// implementation twice, but when we don't need to track the state ID, we |
359 | /// can allow the prefilter to report matches immediately without having |
360 | /// to re-confirm them with the automaton. The re-confirmation step is |
361 | /// necessary in leftmost_find_at because tracing through the automaton is |
362 | /// the only way to correctly set the state ID. (Perhaps an alternative |
363 | /// would be to keep a map from pattern ID to matching state ID, but that |
364 | /// complicates the code and still doesn't permit us to defer to the |
365 | /// prefilter entirely when possible.) |
366 | /// |
367 | /// I did try a few things to avoid the code duplication here, but nothing |
368 | /// optimized as well as this approach. (In microbenchmarks, there was |
369 | /// about a 25% difference.) |
370 | #[inline (never)] |
371 | fn leftmost_find_at_no_state( |
372 | &self, |
373 | prestate: &mut PrefilterState, |
374 | haystack: &[u8], |
375 | at: usize, |
376 | ) -> Option<Match> { |
377 | if let Some(pre) = self.prefilter() { |
378 | self.leftmost_find_at_no_state_imp( |
379 | prestate, |
380 | Some(pre), |
381 | haystack, |
382 | at, |
383 | ) |
384 | } else { |
385 | self.leftmost_find_at_no_state_imp(prestate, None, haystack, at) |
386 | } |
387 | } |
388 | |
389 | // It's important for this to always be inlined. Namely, its only caller |
390 | // is leftmost_find_at_no_state, and the inlining should remove the case |
391 | // analysis for prefilter scanning when there is no prefilter available. |
392 | #[inline (always)] |
393 | fn leftmost_find_at_no_state_imp( |
394 | &self, |
395 | prestate: &mut PrefilterState, |
396 | prefilter: Option<&dyn Prefilter>, |
397 | haystack: &[u8], |
398 | mut at: usize, |
399 | ) -> Option<Match> { |
400 | debug_assert!(self.match_kind().is_leftmost()); |
401 | if self.anchored() && at > 0 { |
402 | return None; |
403 | } |
404 | // If our prefilter handles confirmation of matches 100% of the |
405 | // time, and since we don't need to track state IDs, we can avoid |
406 | // Aho-Corasick completely. |
407 | if let Some(pre) = prefilter { |
408 | // We should never have a prefilter during an anchored search. |
409 | debug_assert!(!self.anchored()); |
410 | if !pre.reports_false_positives() { |
411 | return match pre.next_candidate(prestate, haystack, at) { |
412 | Candidate::None => None, |
413 | Candidate::Match(m) => Some(m), |
414 | Candidate::PossibleStartOfMatch(_) => unreachable!(), |
415 | }; |
416 | } |
417 | } |
418 | |
419 | let mut state_id = self.start_state(); |
420 | let mut last_match = self.get_match(state_id, 0, at); |
421 | while at < haystack.len() { |
422 | if let Some(pre) = prefilter { |
423 | if prestate.is_effective(at) && state_id == self.start_state() |
424 | { |
425 | match prefilter::next(prestate, pre, haystack, at) { |
426 | Candidate::None => return None, |
427 | // Since we aren't tracking a state ID, we can |
428 | // quit early once we know we have a match. |
429 | Candidate::Match(m) => return Some(m), |
430 | Candidate::PossibleStartOfMatch(i) => { |
431 | at = i; |
432 | } |
433 | } |
434 | } |
435 | } |
436 | // CORRECTNESS: next_state is correct for all possible u8 values, |
437 | // so the only thing we're concerned about is the validity of |
438 | // `state_id`. `state_id` either comes from the caller (in which |
439 | // case, we assume it is correct), or it comes from the return |
440 | // value of next_state, which is guaranteed to be correct. |
441 | state_id = self.next_state_no_fail(state_id, haystack[at]); |
442 | at += 1; |
443 | if self.is_match_or_dead_state(state_id) { |
444 | if state_id == dead_id() { |
445 | // The only way to enter into a dead state is if a |
446 | // match has been found, so we assert as much. This |
447 | // is different from normal automata, where you might |
448 | // enter a dead state if you know a subsequent match |
449 | // will never be found (regardless of whether a match |
450 | // has already been found). For Aho-Corasick, it is |
451 | // built so that we can match at any position, so the |
452 | // possibility of a match always exists. |
453 | // |
454 | // (Unless we have an anchored automaton, in which |
455 | // case, dead states are used to stop a search.) |
456 | debug_assert!( |
457 | last_match.is_some() || self.anchored(), |
458 | "dead state should only be seen after match" |
459 | ); |
460 | return last_match; |
461 | } |
462 | last_match = self.get_match(state_id, 0, at); |
463 | } |
464 | } |
465 | last_match |
466 | } |
467 | |
468 | /// Execute an overlapping search. |
469 | /// |
470 | /// When executing an overlapping match, the previous state ID in addition |
471 | /// to the previous match index should be given. If there are more matches |
472 | /// at the given state, then the match is reported and the given index is |
473 | /// incremented. |
474 | #[inline (always)] |
475 | fn overlapping_find_at( |
476 | &self, |
477 | prestate: &mut PrefilterState, |
478 | haystack: &[u8], |
479 | at: usize, |
480 | state_id: &mut Self::ID, |
481 | match_index: &mut usize, |
482 | ) -> Option<Match> { |
483 | if self.anchored() && at > 0 && *state_id == self.start_state() { |
484 | return None; |
485 | } |
486 | |
487 | let match_count = self.match_count(*state_id); |
488 | if *match_index < match_count { |
489 | // This is guaranteed to return a match since |
490 | // match_index < match_count. |
491 | let result = self.get_match(*state_id, *match_index, at); |
492 | debug_assert!(result.is_some(), "must be a match" ); |
493 | *match_index += 1; |
494 | return result; |
495 | } |
496 | |
497 | *match_index = 0; |
498 | match self.standard_find_at(prestate, haystack, at, state_id) { |
499 | None => None, |
500 | Some(m) => { |
501 | *match_index = 1; |
502 | Some(m) |
503 | } |
504 | } |
505 | } |
506 | |
507 | /// Return the earliest match found. This returns as soon as we know that |
508 | /// we have a match. As such, this does not necessarily correspond to the |
509 | /// leftmost starting match, but rather, the leftmost position at which a |
510 | /// match ends. |
511 | #[inline (always)] |
512 | fn earliest_find_at( |
513 | &self, |
514 | prestate: &mut PrefilterState, |
515 | haystack: &[u8], |
516 | at: usize, |
517 | state_id: &mut Self::ID, |
518 | ) -> Option<Match> { |
519 | if *state_id == self.start_state() { |
520 | if self.anchored() && at > 0 { |
521 | return None; |
522 | } |
523 | if let Some(m) = self.get_match(*state_id, 0, at) { |
524 | return Some(m); |
525 | } |
526 | } |
527 | self.standard_find_at(prestate, haystack, at, state_id) |
528 | } |
529 | |
530 | /// A convenience function for finding the next match according to the |
531 | /// match semantics of this automaton. For standard match semantics, this |
532 | /// finds the earliest match. Otherwise, the leftmost match is found. |
533 | #[inline (always)] |
534 | fn find_at( |
535 | &self, |
536 | prestate: &mut PrefilterState, |
537 | haystack: &[u8], |
538 | at: usize, |
539 | state_id: &mut Self::ID, |
540 | ) -> Option<Match> { |
541 | match *self.match_kind() { |
542 | MatchKind::Standard => { |
543 | self.earliest_find_at(prestate, haystack, at, state_id) |
544 | } |
545 | MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { |
546 | self.leftmost_find_at(prestate, haystack, at, state_id) |
547 | } |
548 | MatchKind::__Nonexhaustive => unreachable!(), |
549 | } |
550 | } |
551 | |
552 | /// Like find_at, but does not track state identifiers. This permits some |
553 | /// optimizations when a prefilter that confirms its own matches is |
554 | /// present. |
555 | #[inline (always)] |
556 | fn find_at_no_state( |
557 | &self, |
558 | prestate: &mut PrefilterState, |
559 | haystack: &[u8], |
560 | at: usize, |
561 | ) -> Option<Match> { |
562 | match *self.match_kind() { |
563 | MatchKind::Standard => { |
564 | let mut state = self.start_state(); |
565 | self.earliest_find_at(prestate, haystack, at, &mut state) |
566 | } |
567 | MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { |
568 | self.leftmost_find_at_no_state(prestate, haystack, at) |
569 | } |
570 | MatchKind::__Nonexhaustive => unreachable!(), |
571 | } |
572 | } |
573 | } |
574 | |