1use crate::util::{
2 prefilter::PrefilterI,
3 search::{MatchKind, Span},
4};
5
6#[derive(Clone, Debug)]
7pub(crate) struct AhoCorasick {
8 #[cfg(not(feature = "perf-literal-multisubstring"))]
9 _unused: (),
10 #[cfg(feature = "perf-literal-multisubstring")]
11 ac: aho_corasick::AhoCorasick,
12}
13
14impl AhoCorasick {
15 pub(crate) fn new<B: AsRef<[u8]>>(
16 kind: MatchKind,
17 needles: &[B],
18 ) -> Option<AhoCorasick> {
19 #[cfg(not(feature = "perf-literal-multisubstring"))]
20 {
21 None
22 }
23 #[cfg(feature = "perf-literal-multisubstring")]
24 {
25 // We used to use `aho_corasick::MatchKind::Standard` here when
26 // `kind` was `MatchKind::All`, but this is not correct. The
27 // "standard" Aho-Corasick match semantics are to report a match
28 // immediately as soon as it is seen, but `All` isn't like that.
29 // In particular, with "standard" semantics, given the needles
30 // "abc" and "b" and the haystack "abc," it would report a match
31 // at offset 1 before a match at offset 0. This is never what we
32 // want in the context of the regex engine, regardless of whether
33 // we have leftmost-first or 'all' semantics. Namely, we always
34 // want the leftmost match.
35 let ac_match_kind = match kind {
36 MatchKind::LeftmostFirst | MatchKind::All => {
37 aho_corasick::MatchKind::LeftmostFirst
38 }
39 };
40 // This is kind of just an arbitrary number, but basically, if we
41 // have a small enough set of literals, then we try to use the VERY
42 // memory hungry DFA. Otherwise, we whimp out and use an NFA. The
43 // upshot is that the NFA is quite lean and decently fast. Faster
44 // than a naive Aho-Corasick NFA anyway.
45 let ac_kind = if needles.len() <= 500 {
46 aho_corasick::AhoCorasickKind::DFA
47 } else {
48 aho_corasick::AhoCorasickKind::ContiguousNFA
49 };
50 let result = aho_corasick::AhoCorasick::builder()
51 .kind(Some(ac_kind))
52 .match_kind(ac_match_kind)
53 .start_kind(aho_corasick::StartKind::Both)
54 // We try to handle all of the prefilter cases in the super
55 // module, and only use Aho-Corasick for the actual automaton.
56 // The aho-corasick crate does have some extra prefilters,
57 // namely, looking for rare bytes to feed to memchr{,2,3}
58 // instead of just the first byte. If we end up wanting
59 // those---and they are somewhat tricky to implement---then
60 // we could port them to this crate.
61 //
62 // The main reason for doing things this way is so we have a
63 // complete and easy to understand picture of which prefilters
64 // are available and how they work. Otherwise it seems too
65 // easy to get into a situation where we have a prefilter
66 // layered on top of prefilter, and that might have unintended
67 // consequences.
68 .prefilter(false)
69 .build(needles);
70 let ac = match result {
71 Ok(ac) => ac,
72 Err(_err) => {
73 debug!("aho-corasick prefilter failed to build: {}", _err);
74 return None;
75 }
76 };
77 Some(AhoCorasick { ac })
78 }
79 }
80}
81
82impl PrefilterI for AhoCorasick {
83 fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
84 #[cfg(not(feature = "perf-literal-multisubstring"))]
85 {
86 unreachable!()
87 }
88 #[cfg(feature = "perf-literal-multisubstring")]
89 {
90 let input =
91 aho_corasick::Input::new(haystack).span(span.start..span.end);
92 self.ac
93 .find(input)
94 .map(|m| Span { start: m.start(), end: m.end() })
95 }
96 }
97
98 fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
99 #[cfg(not(feature = "perf-literal-multisubstring"))]
100 {
101 unreachable!()
102 }
103 #[cfg(feature = "perf-literal-multisubstring")]
104 {
105 let input = aho_corasick::Input::new(haystack)
106 .anchored(aho_corasick::Anchored::Yes)
107 .span(span.start..span.end);
108 self.ac
109 .find(input)
110 .map(|m| Span { start: m.start(), end: m.end() })
111 }
112 }
113
114 fn memory_usage(&self) -> usize {
115 #[cfg(not(feature = "perf-literal-multisubstring"))]
116 {
117 unreachable!()
118 }
119 #[cfg(feature = "perf-literal-multisubstring")]
120 {
121 self.ac.memory_usage()
122 }
123 }
124
125 fn is_fast(&self) -> bool {
126 #[cfg(not(feature = "perf-literal-multisubstring"))]
127 {
128 unreachable!()
129 }
130 #[cfg(feature = "perf-literal-multisubstring")]
131 {
132 // Aho-Corasick is never considered "fast" because it's never
133 // going to be even close to an order of magnitude faster than the
134 // regex engine itself (assuming a DFA is used). In fact, it is
135 // usually slower. The magic of Aho-Corasick is that it can search
136 // a *large* number of literals with a relatively small amount of
137 // memory. The regex engines are far more wasteful.
138 //
139 // Aho-Corasick may be "fast" when the regex engine corresponds
140 // to, say, the PikeVM. That happens when the lazy DFA couldn't be
141 // built or used for some reason. But in these cases, the regex
142 // itself is likely quite big and we're probably hosed no matter
143 // what we do. (In this case, the best bet is for the caller to
144 // increase some of the memory limits on the hybrid cache capacity
145 // and hope that's enough.)
146 false
147 }
148 }
149}
150