1use crate::util::{
2 prefilter::PrefilterI,
3 search::{MatchKind, Span},
4};
5
6#[derive(Clone, Debug)]
7pub(crate) struct Teddy {
8 #[cfg(not(feature = "perf-literal-multisubstring"))]
9 _unused: (),
10 /// The actual Teddy searcher.
11 ///
12 /// Technically, it's possible that Teddy doesn't actually get used, since
13 /// Teddy does require its haystack to at least be of a certain size
14 /// (usually around the size of whatever vector is being used, so ~16
15 /// or ~32 bytes). For haystacks shorter than that, the implementation
16 /// currently uses Rabin-Karp.
17 #[cfg(feature = "perf-literal-multisubstring")]
18 searcher: aho_corasick::packed::Searcher,
19 /// When running an anchored search, the packed searcher can't handle it so
20 /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
21 /// searchers to support anchored search would be difficult at worst and
22 /// annoying at best. Since packed searchers only apply to small numbers of
23 /// literals, we content ourselves that this is not much of an added cost.
24 /// (That packed searchers only work with a small number of literals is
25 /// also why we use a DFA here. Otherwise, the memory usage of a DFA would
26 /// likely be unacceptable.)
27 #[cfg(feature = "perf-literal-multisubstring")]
28 anchored_ac: aho_corasick::dfa::DFA,
29 /// The length of the smallest literal we look for.
30 ///
31 /// We use this as a heuristic to figure out whether this will be "fast" or
32 /// not. Generally, the longer the better, because longer needles are more
33 /// discriminating and thus reduce false positive rate.
34 #[cfg(feature = "perf-literal-multisubstring")]
35 minimum_len: usize,
36}
37
38impl Teddy {
39 pub(crate) fn new<B: AsRef<[u8]>>(
40 kind: MatchKind,
41 needles: &[B],
42 ) -> Option<Teddy> {
43 #[cfg(not(feature = "perf-literal-multisubstring"))]
44 {
45 None
46 }
47 #[cfg(feature = "perf-literal-multisubstring")]
48 {
49 // We only really support leftmost-first semantics. In
50 // theory we could at least support leftmost-longest, as the
51 // aho-corasick crate does, but regex-automata doesn't know about
52 // leftmost-longest currently.
53 //
54 // And like the aho-corasick prefilter, if we're using `All`
55 // semantics, then we can still use leftmost semantics for a
56 // prefilter. (This might be a suspicious choice for the literal
57 // engine, which uses a prefilter as a regex engine directly, but
58 // that only happens when using leftmost-first semantics.)
59 let (packed_match_kind, ac_match_kind) = match kind {
60 MatchKind::LeftmostFirst | MatchKind::All => (
61 aho_corasick::packed::MatchKind::LeftmostFirst,
62 aho_corasick::MatchKind::LeftmostFirst,
63 ),
64 };
65 let minimum_len =
66 needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
67 let packed = aho_corasick::packed::Config::new()
68 .match_kind(packed_match_kind)
69 .builder()
70 .extend(needles)
71 .build()?;
72 let anchored_ac = aho_corasick::dfa::DFA::builder()
73 .match_kind(ac_match_kind)
74 .start_kind(aho_corasick::StartKind::Anchored)
75 .prefilter(false)
76 .build(needles)
77 .ok()?;
78 Some(Teddy { searcher: packed, anchored_ac, minimum_len })
79 }
80 }
81}
82
83impl PrefilterI for Teddy {
84 fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
85 #[cfg(not(feature = "perf-literal-multisubstring"))]
86 {
87 unreachable!()
88 }
89 #[cfg(feature = "perf-literal-multisubstring")]
90 {
91 let ac_span =
92 aho_corasick::Span { start: span.start, end: span.end };
93 self.searcher
94 .find_in(haystack, ac_span)
95 .map(|m| Span { start: m.start(), end: m.end() })
96 }
97 }
98
99 fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
100 #[cfg(not(feature = "perf-literal-multisubstring"))]
101 {
102 unreachable!()
103 }
104 #[cfg(feature = "perf-literal-multisubstring")]
105 {
106 use aho_corasick::automaton::Automaton;
107 let input = aho_corasick::Input::new(haystack)
108 .anchored(aho_corasick::Anchored::Yes)
109 .span(span.start..span.end);
110 self.anchored_ac
111 .try_find(&input)
112 // OK because we build the DFA with anchored support.
113 .expect("aho-corasick DFA should never fail")
114 .map(|m| Span { start: m.start(), end: m.end() })
115 }
116 }
117
118 fn memory_usage(&self) -> usize {
119 #[cfg(not(feature = "perf-literal-multisubstring"))]
120 {
121 unreachable!()
122 }
123 #[cfg(feature = "perf-literal-multisubstring")]
124 {
125 use aho_corasick::automaton::Automaton;
126 self.searcher.memory_usage() + self.anchored_ac.memory_usage()
127 }
128 }
129
130 fn is_fast(&self) -> bool {
131 #[cfg(not(feature = "perf-literal-multisubstring"))]
132 {
133 unreachable!()
134 }
135 #[cfg(feature = "perf-literal-multisubstring")]
136 {
137 // Teddy is usually quite fast, but I have seen some cases where
138 // a large number of literals can overwhelm it and make it not so
139 // fast. We make an educated but conservative guess at a limit, at
140 // which point, we're not so comfortable thinking Teddy is "fast."
141 //
142 // Well... this used to incorporate a "limit" on the *number*
143 // of literals, but I have since changed it to a minimum on the
144 // *smallest* literal. Namely, when there is a very small literal
145 // (1 or 2 bytes), it is far more likely that it leads to a higher
146 // false positive rate. (Although, of course, not always. For
147 // example, 'zq' is likely to have a very low false positive rate.)
148 // But when we have 3 bytes, we have a really good chance of being
149 // quite discriminatory and thus fast.
150 //
151 // We may still want to add some kind of limit on the number of
152 // literals here, but keep in mind that Teddy already has its own
153 // somewhat small limit (64 at time of writing). The main issue
154 // here is that if 'is_fast' is false, it opens the door for the
155 // reverse inner optimization to kick in. We really only want to
156 // resort to the reverse inner optimization if we absolutely must.
157 self.minimum_len >= 3
158 }
159 }
160}
161