1use std::mem;
2
3use aho_corasick::{self, packed, AhoCorasick};
4use memchr::{memchr, memchr2, memchr3, memmem};
5use regex_syntax::hir::literal::{Literal, Seq};
6
7/// A prefix extracted from a compiled regular expression.
8///
9/// A regex prefix is a set of literal strings that *must* be matched at the
10/// beginning of a regex in order for the entire regex to match. Similarly
11/// for a regex suffix.
12#[derive(Clone, Debug)]
13pub struct LiteralSearcher {
14 complete: bool,
15 lcp: Memmem,
16 lcs: Memmem,
17 matcher: Matcher,
18}
19
20#[derive(Clone, Debug)]
21enum Matcher {
22 /// No literals. (Never advances through the input.)
23 Empty,
24 /// A set of four or more single byte literals.
25 Bytes(SingleByteSet),
26 /// A single substring, using vector accelerated routines when available.
27 Memmem(Memmem),
28 /// An Aho-Corasick automaton.
29 AC { ac: AhoCorasick, lits: Vec<Literal> },
30 /// A packed multiple substring searcher, using SIMD.
31 ///
32 /// Note that Aho-Corasick will actually use this packed searcher
33 /// internally automatically, however, there is some overhead associated
34 /// with going through the Aho-Corasick machinery. So using the packed
35 /// searcher directly results in some gains.
36 Packed { s: packed::Searcher, lits: Vec<Literal> },
37}
38
39impl LiteralSearcher {
40 /// Returns a matcher that never matches and never advances the input.
41 pub fn empty() -> Self {
42 Self::new(Seq::infinite(), Matcher::Empty)
43 }
44
45 /// Returns a matcher for literal prefixes from the given set.
46 pub fn prefixes(lits: Seq) -> Self {
47 let matcher = Matcher::prefixes(&lits);
48 Self::new(lits, matcher)
49 }
50
51 /// Returns a matcher for literal suffixes from the given set.
52 pub fn suffixes(lits: Seq) -> Self {
53 let matcher = Matcher::suffixes(&lits);
54 Self::new(lits, matcher)
55 }
56
57 fn new(lits: Seq, matcher: Matcher) -> Self {
58 LiteralSearcher {
59 complete: lits.is_exact(),
60 lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"")),
61 lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"")),
62 matcher,
63 }
64 }
65
66 /// Returns true if all matches comprise the entire regular expression.
67 ///
68 /// This does not necessarily mean that a literal match implies a match
69 /// of the regular expression. For example, the regular expression `^a`
70 /// is comprised of a single complete literal `a`, but the regular
71 /// expression demands that it only match at the beginning of a string.
72 pub fn complete(&self) -> bool {
73 self.complete && !self.is_empty()
74 }
75
76 /// Find the position of a literal in `haystack` if it exists.
77 #[cfg_attr(feature = "perf-inline", inline(always))]
78 pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
79 use self::Matcher::*;
80 match self.matcher {
81 Empty => Some((0, 0)),
82 Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
83 Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
84 AC { ref ac, .. } => {
85 ac.find(haystack).map(|m| (m.start(), m.end()))
86 }
87 Packed { ref s, .. } => {
88 s.find(haystack).map(|m| (m.start(), m.end()))
89 }
90 }
91 }
92
93 /// Like find, except matches must start at index `0`.
94 pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
95 for lit in self.iter() {
96 if lit.len() > haystack.len() {
97 continue;
98 }
99 if lit == &haystack[0..lit.len()] {
100 return Some((0, lit.len()));
101 }
102 }
103 None
104 }
105
106 /// Like find, except matches must end at index `haystack.len()`.
107 pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
108 for lit in self.iter() {
109 if lit.len() > haystack.len() {
110 continue;
111 }
112 if lit == &haystack[haystack.len() - lit.len()..] {
113 return Some((haystack.len() - lit.len(), haystack.len()));
114 }
115 }
116 None
117 }
118
119 /// Returns an iterator over all literals to be matched.
120 pub fn iter(&self) -> LiteralIter<'_> {
121 match self.matcher {
122 Matcher::Empty => LiteralIter::Empty,
123 Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
124 Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()),
125 Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
126 Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
127 }
128 }
129
130 /// Returns a matcher for the longest common prefix of this matcher.
131 pub fn lcp(&self) -> &Memmem {
132 &self.lcp
133 }
134
135 /// Returns a matcher for the longest common suffix of this matcher.
136 pub fn lcs(&self) -> &Memmem {
137 &self.lcs
138 }
139
140 /// Returns true iff this prefix is empty.
141 pub fn is_empty(&self) -> bool {
142 self.len() == 0
143 }
144
145 /// Returns the number of prefixes in this machine.
146 pub fn len(&self) -> usize {
147 use self::Matcher::*;
148 match self.matcher {
149 Empty => 0,
150 Bytes(ref sset) => sset.dense.len(),
151 Memmem(_) => 1,
152 AC { ref ac, .. } => ac.patterns_len(),
153 Packed { ref lits, .. } => lits.len(),
154 }
155 }
156
157 /// Return the approximate heap usage of literals in bytes.
158 pub fn approximate_size(&self) -> usize {
159 use self::Matcher::*;
160 match self.matcher {
161 Empty => 0,
162 Bytes(ref sset) => sset.approximate_size(),
163 Memmem(ref single) => single.approximate_size(),
164 AC { ref ac, .. } => ac.memory_usage(),
165 Packed { ref s, .. } => s.memory_usage(),
166 }
167 }
168}
169
170impl Matcher {
171 fn prefixes(lits: &Seq) -> Self {
172 let sset = SingleByteSet::prefixes(lits);
173 Matcher::new(lits, sset)
174 }
175
176 fn suffixes(lits: &Seq) -> Self {
177 let sset = SingleByteSet::suffixes(lits);
178 Matcher::new(lits, sset)
179 }
180
181 fn new(lits: &Seq, sset: SingleByteSet) -> Self {
182 if lits.is_empty() || lits.min_literal_len() == Some(0) {
183 return Matcher::Empty;
184 }
185 let lits = match lits.literals() {
186 None => return Matcher::Empty,
187 Some(members) => members,
188 };
189 if sset.dense.len() >= 26 {
190 // Avoid trying to match a large number of single bytes.
191 // This is *very* sensitive to a frequency analysis comparison
192 // between the bytes in sset and the composition of the haystack.
193 // No matter the size of sset, if its members all are rare in the
194 // haystack, then it'd be worth using it. How to tune this... IDK.
195 // ---AG
196 return Matcher::Empty;
197 }
198 if sset.complete {
199 return Matcher::Bytes(sset);
200 }
201 if lits.len() == 1 {
202 return Matcher::Memmem(Memmem::new(lits[0].as_bytes()));
203 }
204
205 let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect();
206 let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
207 if lits.len() <= 100 && !is_aho_corasick_fast {
208 let mut builder = packed::Config::new()
209 .match_kind(packed::MatchKind::LeftmostFirst)
210 .builder();
211 if let Some(s) = builder.extend(&pats).build() {
212 return Matcher::Packed { s, lits: lits.to_owned() };
213 }
214 }
215 let ac = AhoCorasick::builder()
216 .match_kind(aho_corasick::MatchKind::LeftmostFirst)
217 .kind(Some(aho_corasick::AhoCorasickKind::DFA))
218 .build(&pats)
219 .unwrap();
220 Matcher::AC { ac, lits: lits.to_owned() }
221 }
222}
223
224#[derive(Debug)]
225pub enum LiteralIter<'a> {
226 Empty,
227 Bytes(&'a [u8]),
228 Single(&'a [u8]),
229 AC(&'a [Literal]),
230 Packed(&'a [Literal]),
231}
232
233impl<'a> Iterator for LiteralIter<'a> {
234 type Item = &'a [u8];
235
236 fn next(&mut self) -> Option<Self::Item> {
237 match *self {
238 LiteralIter::Empty => None,
239 LiteralIter::Bytes(ref mut many) => {
240 if many.is_empty() {
241 None
242 } else {
243 let next = &many[0..1];
244 *many = &many[1..];
245 Some(next)
246 }
247 }
248 LiteralIter::Single(ref mut one) => {
249 if one.is_empty() {
250 None
251 } else {
252 let next = &one[..];
253 *one = &[];
254 Some(next)
255 }
256 }
257 LiteralIter::AC(ref mut lits) => {
258 if lits.is_empty() {
259 None
260 } else {
261 let next = &lits[0];
262 *lits = &lits[1..];
263 Some(next.as_bytes())
264 }
265 }
266 LiteralIter::Packed(ref mut lits) => {
267 if lits.is_empty() {
268 None
269 } else {
270 let next = &lits[0];
271 *lits = &lits[1..];
272 Some(next.as_bytes())
273 }
274 }
275 }
276 }
277}
278
279#[derive(Clone, Debug)]
280struct SingleByteSet {
281 sparse: Vec<bool>,
282 dense: Vec<u8>,
283 complete: bool,
284 all_ascii: bool,
285}
286
287impl SingleByteSet {
288 fn new() -> SingleByteSet {
289 SingleByteSet {
290 sparse: vec![false; 256],
291 dense: vec![],
292 complete: true,
293 all_ascii: true,
294 }
295 }
296
297 fn prefixes(lits: &Seq) -> SingleByteSet {
298 let mut sset = SingleByteSet::new();
299 let lits = match lits.literals() {
300 None => return sset,
301 Some(lits) => lits,
302 };
303 for lit in lits.iter() {
304 sset.complete = sset.complete && lit.len() == 1;
305 if let Some(&b) = lit.as_bytes().get(0) {
306 if !sset.sparse[b as usize] {
307 if b > 0x7F {
308 sset.all_ascii = false;
309 }
310 sset.dense.push(b);
311 sset.sparse[b as usize] = true;
312 }
313 }
314 }
315 sset
316 }
317
318 fn suffixes(lits: &Seq) -> SingleByteSet {
319 let mut sset = SingleByteSet::new();
320 let lits = match lits.literals() {
321 None => return sset,
322 Some(lits) => lits,
323 };
324 for lit in lits.iter() {
325 sset.complete = sset.complete && lit.len() == 1;
326 if let Some(&b) = lit.as_bytes().last() {
327 if !sset.sparse[b as usize] {
328 if b > 0x7F {
329 sset.all_ascii = false;
330 }
331 sset.dense.push(b);
332 sset.sparse[b as usize] = true;
333 }
334 }
335 }
336 sset
337 }
338
339 /// Faster find that special cases certain sizes to use memchr.
340 #[cfg_attr(feature = "perf-inline", inline(always))]
341 fn find(&self, text: &[u8]) -> Option<usize> {
342 match self.dense.len() {
343 0 => None,
344 1 => memchr(self.dense[0], text),
345 2 => memchr2(self.dense[0], self.dense[1], text),
346 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
347 _ => self._find(text),
348 }
349 }
350
351 /// Generic find that works on any sized set.
352 fn _find(&self, haystack: &[u8]) -> Option<usize> {
353 for (i, &b) in haystack.iter().enumerate() {
354 if self.sparse[b as usize] {
355 return Some(i);
356 }
357 }
358 None
359 }
360
361 fn approximate_size(&self) -> usize {
362 (self.dense.len() * mem::size_of::<u8>())
363 + (self.sparse.len() * mem::size_of::<bool>())
364 }
365}
366
367/// A simple wrapper around the memchr crate's memmem implementation.
368///
369/// The API this exposes mirrors the API of previous substring searchers that
370/// this supplanted.
371#[derive(Clone, Debug)]
372pub struct Memmem {
373 finder: memmem::Finder<'static>,
374 char_len: usize,
375}
376
377impl Memmem {
378 fn new(pat: &[u8]) -> Memmem {
379 Memmem {
380 finder: memmem::Finder::new(pat).into_owned(),
381 char_len: char_len_lossy(pat),
382 }
383 }
384
385 #[cfg_attr(feature = "perf-inline", inline(always))]
386 pub fn find(&self, haystack: &[u8]) -> Option<usize> {
387 self.finder.find(haystack)
388 }
389
390 #[cfg_attr(feature = "perf-inline", inline(always))]
391 pub fn is_suffix(&self, text: &[u8]) -> bool {
392 if text.len() < self.len() {
393 return false;
394 }
395 &text[text.len() - self.len()..] == self.finder.needle()
396 }
397
398 pub fn len(&self) -> usize {
399 self.finder.needle().len()
400 }
401
402 pub fn char_len(&self) -> usize {
403 self.char_len
404 }
405
406 fn approximate_size(&self) -> usize {
407 self.finder.needle().len() * mem::size_of::<u8>()
408 }
409}
410
411fn char_len_lossy(bytes: &[u8]) -> usize {
412 String::from_utf8_lossy(bytes).chars().count()
413}
414