1 | use crate::util::{ |
2 | prefilter::PrefilterI, |
3 | search::{MatchKind, Span}, |
4 | }; |
5 | |
6 | #[derive(Clone, Debug)] |
7 | pub(crate) struct AhoCorasick { |
8 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
9 | _unused: (), |
10 | #[cfg (feature = "perf-literal-multisubstring" )] |
11 | ac: aho_corasick::AhoCorasick, |
12 | } |
13 | |
14 | impl AhoCorasick { |
15 | pub(crate) fn new<B: AsRef<[u8]>>( |
16 | kind: MatchKind, |
17 | needles: &[B], |
18 | ) -> Option<AhoCorasick> { |
19 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
20 | { |
21 | None |
22 | } |
23 | #[cfg (feature = "perf-literal-multisubstring" )] |
24 | { |
25 | // We used to use `aho_corasick::MatchKind::Standard` here when |
26 | // `kind` was `MatchKind::All`, but this is not correct. The |
27 | // "standard" Aho-Corasick match semantics are to report a match |
28 | // immediately as soon as it is seen, but `All` isn't like that. |
29 | // In particular, with "standard" semantics, given the needles |
30 | // "abc" and "b" and the haystack "abc," it would report a match |
31 | // at offset 1 before a match at offset 0. This is never what we |
32 | // want in the context of the regex engine, regardless of whether |
33 | // we have leftmost-first or 'all' semantics. Namely, we always |
34 | // want the leftmost match. |
35 | let ac_match_kind = match kind { |
36 | MatchKind::LeftmostFirst | MatchKind::All => { |
37 | aho_corasick::MatchKind::LeftmostFirst |
38 | } |
39 | }; |
40 | // This is kind of just an arbitrary number, but basically, if we |
41 | // have a small enough set of literals, then we try to use the VERY |
42 | // memory hungry DFA. Otherwise, we whimp out and use an NFA. The |
43 | // upshot is that the NFA is quite lean and decently fast. Faster |
44 | // than a naive Aho-Corasick NFA anyway. |
45 | let ac_kind = if needles.len() <= 500 { |
46 | aho_corasick::AhoCorasickKind::DFA |
47 | } else { |
48 | aho_corasick::AhoCorasickKind::ContiguousNFA |
49 | }; |
50 | let result = aho_corasick::AhoCorasick::builder() |
51 | .kind(Some(ac_kind)) |
52 | .match_kind(ac_match_kind) |
53 | .start_kind(aho_corasick::StartKind::Both) |
54 | // We try to handle all of the prefilter cases in the super |
55 | // module, and only use Aho-Corasick for the actual automaton. |
56 | // The aho-corasick crate does have some extra prefilters, |
57 | // namely, looking for rare bytes to feed to memchr{,2,3} |
58 | // instead of just the first byte. If we end up wanting |
59 | // those---and they are somewhat tricky to implement---then |
60 | // we could port them to this crate. |
61 | // |
62 | // The main reason for doing things this way is so we have a |
63 | // complete and easy to understand picture of which prefilters |
64 | // are available and how they work. Otherwise it seems too |
65 | // easy to get into a situation where we have a prefilter |
66 | // layered on top of prefilter, and that might have unintended |
67 | // consequences. |
68 | .prefilter(false) |
69 | .build(needles); |
70 | let ac = match result { |
71 | Ok(ac) => ac, |
72 | Err(_err) => { |
73 | debug!("aho-corasick prefilter failed to build: {}" , _err); |
74 | return None; |
75 | } |
76 | }; |
77 | Some(AhoCorasick { ac }) |
78 | } |
79 | } |
80 | } |
81 | |
82 | impl PrefilterI for AhoCorasick { |
83 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
84 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
85 | { |
86 | unreachable!() |
87 | } |
88 | #[cfg (feature = "perf-literal-multisubstring" )] |
89 | { |
90 | let input = |
91 | aho_corasick::Input::new(haystack).span(span.start..span.end); |
92 | self.ac |
93 | .find(input) |
94 | .map(|m| Span { start: m.start(), end: m.end() }) |
95 | } |
96 | } |
97 | |
98 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
99 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
100 | { |
101 | unreachable!() |
102 | } |
103 | #[cfg (feature = "perf-literal-multisubstring" )] |
104 | { |
105 | let input = aho_corasick::Input::new(haystack) |
106 | .anchored(aho_corasick::Anchored::Yes) |
107 | .span(span.start..span.end); |
108 | self.ac |
109 | .find(input) |
110 | .map(|m| Span { start: m.start(), end: m.end() }) |
111 | } |
112 | } |
113 | |
114 | fn memory_usage(&self) -> usize { |
115 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
116 | { |
117 | unreachable!() |
118 | } |
119 | #[cfg (feature = "perf-literal-multisubstring" )] |
120 | { |
121 | self.ac.memory_usage() |
122 | } |
123 | } |
124 | |
125 | fn is_fast(&self) -> bool { |
126 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
127 | { |
128 | unreachable!() |
129 | } |
130 | #[cfg (feature = "perf-literal-multisubstring" )] |
131 | { |
132 | // Aho-Corasick is never considered "fast" because it's never |
133 | // going to be even close to an order of magnitude faster than the |
134 | // regex engine itself (assuming a DFA is used). In fact, it is |
135 | // usually slower. The magic of Aho-Corasick is that it can search |
136 | // a *large* number of literals with a relatively small amount of |
137 | // memory. The regex engines are far more wasteful. |
138 | // |
139 | // Aho-Corasick may be "fast" when the regex engine corresponds |
140 | // to, say, the PikeVM. That happens when the lazy DFA couldn't be |
141 | // built or used for some reason. But in these cases, the regex |
142 | // itself is likely quite big and we're probably hosed no matter |
143 | // what we do. (In this case, the best bet is for the caller to |
144 | // increase some of the memory limits on the hybrid cache capacity |
145 | // and hope that's enough.) |
146 | false |
147 | } |
148 | } |
149 | } |
150 | |