1 | use crate::util::{ |
2 | prefilter::PrefilterI, |
3 | search::{MatchKind, Span}, |
4 | }; |
5 | |
6 | #[derive (Clone, Debug)] |
7 | pub(crate) struct Teddy { |
8 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
9 | _unused: (), |
10 | /// The actual Teddy searcher. |
11 | /// |
12 | /// Technically, it's possible that Teddy doesn't actually get used, since |
13 | /// Teddy does require its haystack to at least be of a certain size |
14 | /// (usually around the size of whatever vector is being used, so ~16 |
15 | /// or ~32 bytes). For haystacks shorter than that, the implementation |
16 | /// currently uses Rabin-Karp. |
17 | #[cfg (feature = "perf-literal-multisubstring" )] |
18 | searcher: aho_corasick::packed::Searcher, |
19 | /// When running an anchored search, the packed searcher can't handle it so |
20 | /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed |
21 | /// searchers to support anchored search would be difficult at worst and |
22 | /// annoying at best. Since packed searchers only apply to small numbers of |
23 | /// literals, we content ourselves that this is not much of an added cost. |
24 | /// (That packed searchers only work with a small number of literals is |
25 | /// also why we use a DFA here. Otherwise, the memory usage of a DFA would |
26 | /// likely be unacceptable.) |
27 | #[cfg (feature = "perf-literal-multisubstring" )] |
28 | anchored_ac: aho_corasick::dfa::DFA, |
29 | /// The length of the smallest literal we look for. |
30 | /// |
31 | /// We use this as a heuristic to figure out whether this will be "fast" or |
32 | /// not. Generally, the longer the better, because longer needles are more |
33 | /// discriminating and thus reduce false positive rate. |
34 | #[cfg (feature = "perf-literal-multisubstring" )] |
35 | minimum_len: usize, |
36 | } |
37 | |
38 | impl Teddy { |
39 | pub(crate) fn new<B: AsRef<[u8]>>( |
40 | kind: MatchKind, |
41 | needles: &[B], |
42 | ) -> Option<Teddy> { |
43 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
44 | { |
45 | None |
46 | } |
47 | #[cfg (feature = "perf-literal-multisubstring" )] |
48 | { |
49 | // We only really support leftmost-first semantics. In |
50 | // theory we could at least support leftmost-longest, as the |
51 | // aho-corasick crate does, but regex-automata doesn't know about |
52 | // leftmost-longest currently. |
53 | // |
54 | // And like the aho-corasick prefilter, if we're using `All` |
55 | // semantics, then we can still use leftmost semantics for a |
56 | // prefilter. (This might be a suspicious choice for the literal |
57 | // engine, which uses a prefilter as a regex engine directly, but |
58 | // that only happens when using leftmost-first semantics.) |
59 | let (packed_match_kind, ac_match_kind) = match kind { |
60 | MatchKind::LeftmostFirst | MatchKind::All => ( |
61 | aho_corasick::packed::MatchKind::LeftmostFirst, |
62 | aho_corasick::MatchKind::LeftmostFirst, |
63 | ), |
64 | }; |
65 | let minimum_len = |
66 | needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); |
67 | let packed = aho_corasick::packed::Config::new() |
68 | .match_kind(packed_match_kind) |
69 | .builder() |
70 | .extend(needles) |
71 | .build()?; |
72 | let anchored_ac = aho_corasick::dfa::DFA::builder() |
73 | .match_kind(ac_match_kind) |
74 | .start_kind(aho_corasick::StartKind::Anchored) |
75 | .prefilter(false) |
76 | .build(needles) |
77 | .ok()?; |
78 | Some(Teddy { searcher: packed, anchored_ac, minimum_len }) |
79 | } |
80 | } |
81 | } |
82 | |
83 | impl PrefilterI for Teddy { |
84 | fn find(&self, haystack: &[u8], span: Span) -> Option<Span> { |
85 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
86 | { |
87 | unreachable!() |
88 | } |
89 | #[cfg (feature = "perf-literal-multisubstring" )] |
90 | { |
91 | let ac_span = |
92 | aho_corasick::Span { start: span.start, end: span.end }; |
93 | self.searcher |
94 | .find_in(haystack, ac_span) |
95 | .map(|m| Span { start: m.start(), end: m.end() }) |
96 | } |
97 | } |
98 | |
99 | fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> { |
100 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
101 | { |
102 | unreachable!() |
103 | } |
104 | #[cfg (feature = "perf-literal-multisubstring" )] |
105 | { |
106 | use aho_corasick::automaton::Automaton; |
107 | let input = aho_corasick::Input::new(haystack) |
108 | .anchored(aho_corasick::Anchored::Yes) |
109 | .span(span.start..span.end); |
110 | self.anchored_ac |
111 | .try_find(&input) |
112 | // OK because we build the DFA with anchored support. |
113 | .expect("aho-corasick DFA should never fail" ) |
114 | .map(|m| Span { start: m.start(), end: m.end() }) |
115 | } |
116 | } |
117 | |
118 | fn memory_usage(&self) -> usize { |
119 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
120 | { |
121 | unreachable!() |
122 | } |
123 | #[cfg (feature = "perf-literal-multisubstring" )] |
124 | { |
125 | use aho_corasick::automaton::Automaton; |
126 | self.searcher.memory_usage() + self.anchored_ac.memory_usage() |
127 | } |
128 | } |
129 | |
130 | fn is_fast(&self) -> bool { |
131 | #[cfg (not(feature = "perf-literal-multisubstring" ))] |
132 | { |
133 | unreachable!() |
134 | } |
135 | #[cfg (feature = "perf-literal-multisubstring" )] |
136 | { |
137 | // Teddy is usually quite fast, but I have seen some cases where |
138 | // a large number of literals can overwhelm it and make it not so |
139 | // fast. We make an educated but conservative guess at a limit, at |
140 | // which point, we're not so comfortable thinking Teddy is "fast." |
141 | // |
142 | // Well... this used to incorporate a "limit" on the *number* |
143 | // of literals, but I have since changed it to a minimum on the |
144 | // *smallest* literal. Namely, when there is a very small literal |
145 | // (1 or 2 bytes), it is far more likely that it leads to a higher |
146 | // false positive rate. (Although, of course, not always. For |
147 | // example, 'zq' is likely to have a very low false positive rate.) |
148 | // But when we have 3 bytes, we have a really good chance of being |
149 | // quite discriminatory and thus fast. |
150 | // |
151 | // We may still want to add some kind of limit on the number of |
152 | // literals here, but keep in mind that Teddy already has its own |
153 | // somewhat small limit (64 at time of writing). The main issue |
154 | // here is that if 'is_fast' is false, it opens the door for the |
155 | // reverse inner optimization to kick in. We really only want to |
156 | // resort to the reverse inner optimization if we absolutely must. |
157 | self.minimum_len >= 3 |
158 | } |
159 | } |
160 | } |
161 | |