1 | use std::mem; |
2 | |
3 | use aho_corasick::{self, packed, AhoCorasick}; |
4 | use memchr::{memchr, memchr2, memchr3, memmem}; |
5 | use regex_syntax::hir::literal::{Literal, Seq}; |
6 | |
7 | /// A prefix extracted from a compiled regular expression. |
8 | /// |
9 | /// A regex prefix is a set of literal strings that *must* be matched at the |
10 | /// beginning of a regex in order for the entire regex to match. Similarly |
11 | /// for a regex suffix. |
12 | #[derive (Clone, Debug)] |
13 | pub struct LiteralSearcher { |
14 | complete: bool, |
15 | lcp: Memmem, |
16 | lcs: Memmem, |
17 | matcher: Matcher, |
18 | } |
19 | |
20 | #[derive (Clone, Debug)] |
21 | enum Matcher { |
22 | /// No literals. (Never advances through the input.) |
23 | Empty, |
24 | /// A set of four or more single byte literals. |
25 | Bytes(SingleByteSet), |
26 | /// A single substring, using vector accelerated routines when available. |
27 | Memmem(Memmem), |
28 | /// An Aho-Corasick automaton. |
29 | AC { ac: AhoCorasick, lits: Vec<Literal> }, |
30 | /// A packed multiple substring searcher, using SIMD. |
31 | /// |
32 | /// Note that Aho-Corasick will actually use this packed searcher |
33 | /// internally automatically, however, there is some overhead associated |
34 | /// with going through the Aho-Corasick machinery. So using the packed |
35 | /// searcher directly results in some gains. |
36 | Packed { s: packed::Searcher, lits: Vec<Literal> }, |
37 | } |
38 | |
39 | impl LiteralSearcher { |
40 | /// Returns a matcher that never matches and never advances the input. |
41 | pub fn empty() -> Self { |
42 | Self::new(Seq::infinite(), Matcher::Empty) |
43 | } |
44 | |
45 | /// Returns a matcher for literal prefixes from the given set. |
46 | pub fn prefixes(lits: Seq) -> Self { |
47 | let matcher = Matcher::prefixes(&lits); |
48 | Self::new(lits, matcher) |
49 | } |
50 | |
51 | /// Returns a matcher for literal suffixes from the given set. |
52 | pub fn suffixes(lits: Seq) -> Self { |
53 | let matcher = Matcher::suffixes(&lits); |
54 | Self::new(lits, matcher) |
55 | } |
56 | |
57 | fn new(lits: Seq, matcher: Matcher) -> Self { |
58 | LiteralSearcher { |
59 | complete: lits.is_exact(), |
60 | lcp: Memmem::new(lits.longest_common_prefix().unwrap_or(b"" )), |
61 | lcs: Memmem::new(lits.longest_common_suffix().unwrap_or(b"" )), |
62 | matcher, |
63 | } |
64 | } |
65 | |
66 | /// Returns true if all matches comprise the entire regular expression. |
67 | /// |
68 | /// This does not necessarily mean that a literal match implies a match |
69 | /// of the regular expression. For example, the regular expression `^a` |
70 | /// is comprised of a single complete literal `a`, but the regular |
71 | /// expression demands that it only match at the beginning of a string. |
72 | pub fn complete(&self) -> bool { |
73 | self.complete && !self.is_empty() |
74 | } |
75 | |
76 | /// Find the position of a literal in `haystack` if it exists. |
77 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
78 | pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
79 | use self::Matcher::*; |
80 | match self.matcher { |
81 | Empty => Some((0, 0)), |
82 | Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)), |
83 | Memmem(ref s) => s.find(haystack).map(|i| (i, i + s.len())), |
84 | AC { ref ac, .. } => { |
85 | ac.find(haystack).map(|m| (m.start(), m.end())) |
86 | } |
87 | Packed { ref s, .. } => { |
88 | s.find(haystack).map(|m| (m.start(), m.end())) |
89 | } |
90 | } |
91 | } |
92 | |
93 | /// Like find, except matches must start at index `0`. |
94 | pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
95 | for lit in self.iter() { |
96 | if lit.len() > haystack.len() { |
97 | continue; |
98 | } |
99 | if lit == &haystack[0..lit.len()] { |
100 | return Some((0, lit.len())); |
101 | } |
102 | } |
103 | None |
104 | } |
105 | |
106 | /// Like find, except matches must end at index `haystack.len()`. |
107 | pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> { |
108 | for lit in self.iter() { |
109 | if lit.len() > haystack.len() { |
110 | continue; |
111 | } |
112 | if lit == &haystack[haystack.len() - lit.len()..] { |
113 | return Some((haystack.len() - lit.len(), haystack.len())); |
114 | } |
115 | } |
116 | None |
117 | } |
118 | |
119 | /// Returns an iterator over all literals to be matched. |
120 | pub fn iter(&self) -> LiteralIter<'_> { |
121 | match self.matcher { |
122 | Matcher::Empty => LiteralIter::Empty, |
123 | Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense), |
124 | Matcher::Memmem(ref s) => LiteralIter::Single(&s.finder.needle()), |
125 | Matcher::AC { ref lits, .. } => LiteralIter::AC(lits), |
126 | Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits), |
127 | } |
128 | } |
129 | |
130 | /// Returns a matcher for the longest common prefix of this matcher. |
131 | pub fn lcp(&self) -> &Memmem { |
132 | &self.lcp |
133 | } |
134 | |
135 | /// Returns a matcher for the longest common suffix of this matcher. |
136 | pub fn lcs(&self) -> &Memmem { |
137 | &self.lcs |
138 | } |
139 | |
140 | /// Returns true iff this prefix is empty. |
141 | pub fn is_empty(&self) -> bool { |
142 | self.len() == 0 |
143 | } |
144 | |
145 | /// Returns the number of prefixes in this machine. |
146 | pub fn len(&self) -> usize { |
147 | use self::Matcher::*; |
148 | match self.matcher { |
149 | Empty => 0, |
150 | Bytes(ref sset) => sset.dense.len(), |
151 | Memmem(_) => 1, |
152 | AC { ref ac, .. } => ac.patterns_len(), |
153 | Packed { ref lits, .. } => lits.len(), |
154 | } |
155 | } |
156 | |
157 | /// Return the approximate heap usage of literals in bytes. |
158 | pub fn approximate_size(&self) -> usize { |
159 | use self::Matcher::*; |
160 | match self.matcher { |
161 | Empty => 0, |
162 | Bytes(ref sset) => sset.approximate_size(), |
163 | Memmem(ref single) => single.approximate_size(), |
164 | AC { ref ac, .. } => ac.memory_usage(), |
165 | Packed { ref s, .. } => s.memory_usage(), |
166 | } |
167 | } |
168 | } |
169 | |
170 | impl Matcher { |
171 | fn prefixes(lits: &Seq) -> Self { |
172 | let sset = SingleByteSet::prefixes(lits); |
173 | Matcher::new(lits, sset) |
174 | } |
175 | |
176 | fn suffixes(lits: &Seq) -> Self { |
177 | let sset = SingleByteSet::suffixes(lits); |
178 | Matcher::new(lits, sset) |
179 | } |
180 | |
181 | fn new(lits: &Seq, sset: SingleByteSet) -> Self { |
182 | if lits.is_empty() || lits.min_literal_len() == Some(0) { |
183 | return Matcher::Empty; |
184 | } |
185 | let lits = match lits.literals() { |
186 | None => return Matcher::Empty, |
187 | Some(members) => members, |
188 | }; |
189 | if sset.dense.len() >= 26 { |
190 | // Avoid trying to match a large number of single bytes. |
191 | // This is *very* sensitive to a frequency analysis comparison |
192 | // between the bytes in sset and the composition of the haystack. |
193 | // No matter the size of sset, if its members all are rare in the |
194 | // haystack, then it'd be worth using it. How to tune this... IDK. |
195 | // ---AG |
196 | return Matcher::Empty; |
197 | } |
198 | if sset.complete { |
199 | return Matcher::Bytes(sset); |
200 | } |
201 | if lits.len() == 1 { |
202 | return Matcher::Memmem(Memmem::new(lits[0].as_bytes())); |
203 | } |
204 | |
205 | let pats: Vec<&[u8]> = lits.iter().map(|lit| lit.as_bytes()).collect(); |
206 | let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii; |
207 | if lits.len() <= 100 && !is_aho_corasick_fast { |
208 | let mut builder = packed::Config::new() |
209 | .match_kind(packed::MatchKind::LeftmostFirst) |
210 | .builder(); |
211 | if let Some(s) = builder.extend(&pats).build() { |
212 | return Matcher::Packed { s, lits: lits.to_owned() }; |
213 | } |
214 | } |
215 | let ac = AhoCorasick::builder() |
216 | .match_kind(aho_corasick::MatchKind::LeftmostFirst) |
217 | .kind(Some(aho_corasick::AhoCorasickKind::DFA)) |
218 | .build(&pats) |
219 | .unwrap(); |
220 | Matcher::AC { ac, lits: lits.to_owned() } |
221 | } |
222 | } |
223 | |
224 | #[derive (Debug)] |
225 | pub enum LiteralIter<'a> { |
226 | Empty, |
227 | Bytes(&'a [u8]), |
228 | Single(&'a [u8]), |
229 | AC(&'a [Literal]), |
230 | Packed(&'a [Literal]), |
231 | } |
232 | |
233 | impl<'a> Iterator for LiteralIter<'a> { |
234 | type Item = &'a [u8]; |
235 | |
236 | fn next(&mut self) -> Option<Self::Item> { |
237 | match *self { |
238 | LiteralIter::Empty => None, |
239 | LiteralIter::Bytes(ref mut many) => { |
240 | if many.is_empty() { |
241 | None |
242 | } else { |
243 | let next = &many[0..1]; |
244 | *many = &many[1..]; |
245 | Some(next) |
246 | } |
247 | } |
248 | LiteralIter::Single(ref mut one) => { |
249 | if one.is_empty() { |
250 | None |
251 | } else { |
252 | let next = &one[..]; |
253 | *one = &[]; |
254 | Some(next) |
255 | } |
256 | } |
257 | LiteralIter::AC(ref mut lits) => { |
258 | if lits.is_empty() { |
259 | None |
260 | } else { |
261 | let next = &lits[0]; |
262 | *lits = &lits[1..]; |
263 | Some(next.as_bytes()) |
264 | } |
265 | } |
266 | LiteralIter::Packed(ref mut lits) => { |
267 | if lits.is_empty() { |
268 | None |
269 | } else { |
270 | let next = &lits[0]; |
271 | *lits = &lits[1..]; |
272 | Some(next.as_bytes()) |
273 | } |
274 | } |
275 | } |
276 | } |
277 | } |
278 | |
279 | #[derive (Clone, Debug)] |
280 | struct SingleByteSet { |
281 | sparse: Vec<bool>, |
282 | dense: Vec<u8>, |
283 | complete: bool, |
284 | all_ascii: bool, |
285 | } |
286 | |
287 | impl SingleByteSet { |
288 | fn new() -> SingleByteSet { |
289 | SingleByteSet { |
290 | sparse: vec![false; 256], |
291 | dense: vec![], |
292 | complete: true, |
293 | all_ascii: true, |
294 | } |
295 | } |
296 | |
297 | fn prefixes(lits: &Seq) -> SingleByteSet { |
298 | let mut sset = SingleByteSet::new(); |
299 | let lits = match lits.literals() { |
300 | None => return sset, |
301 | Some(lits) => lits, |
302 | }; |
303 | for lit in lits.iter() { |
304 | sset.complete = sset.complete && lit.len() == 1; |
305 | if let Some(&b) = lit.as_bytes().get(0) { |
306 | if !sset.sparse[b as usize] { |
307 | if b > 0x7F { |
308 | sset.all_ascii = false; |
309 | } |
310 | sset.dense.push(b); |
311 | sset.sparse[b as usize] = true; |
312 | } |
313 | } |
314 | } |
315 | sset |
316 | } |
317 | |
318 | fn suffixes(lits: &Seq) -> SingleByteSet { |
319 | let mut sset = SingleByteSet::new(); |
320 | let lits = match lits.literals() { |
321 | None => return sset, |
322 | Some(lits) => lits, |
323 | }; |
324 | for lit in lits.iter() { |
325 | sset.complete = sset.complete && lit.len() == 1; |
326 | if let Some(&b) = lit.as_bytes().last() { |
327 | if !sset.sparse[b as usize] { |
328 | if b > 0x7F { |
329 | sset.all_ascii = false; |
330 | } |
331 | sset.dense.push(b); |
332 | sset.sparse[b as usize] = true; |
333 | } |
334 | } |
335 | } |
336 | sset |
337 | } |
338 | |
339 | /// Faster find that special cases certain sizes to use memchr. |
340 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
341 | fn find(&self, text: &[u8]) -> Option<usize> { |
342 | match self.dense.len() { |
343 | 0 => None, |
344 | 1 => memchr(self.dense[0], text), |
345 | 2 => memchr2(self.dense[0], self.dense[1], text), |
346 | 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text), |
347 | _ => self._find(text), |
348 | } |
349 | } |
350 | |
351 | /// Generic find that works on any sized set. |
352 | fn _find(&self, haystack: &[u8]) -> Option<usize> { |
353 | for (i, &b) in haystack.iter().enumerate() { |
354 | if self.sparse[b as usize] { |
355 | return Some(i); |
356 | } |
357 | } |
358 | None |
359 | } |
360 | |
361 | fn approximate_size(&self) -> usize { |
362 | (self.dense.len() * mem::size_of::<u8>()) |
363 | + (self.sparse.len() * mem::size_of::<bool>()) |
364 | } |
365 | } |
366 | |
367 | /// A simple wrapper around the memchr crate's memmem implementation. |
368 | /// |
369 | /// The API this exposes mirrors the API of previous substring searchers that |
370 | /// this supplanted. |
371 | #[derive (Clone, Debug)] |
372 | pub struct Memmem { |
373 | finder: memmem::Finder<'static>, |
374 | char_len: usize, |
375 | } |
376 | |
377 | impl Memmem { |
378 | fn new(pat: &[u8]) -> Memmem { |
379 | Memmem { |
380 | finder: memmem::Finder::new(pat).into_owned(), |
381 | char_len: char_len_lossy(pat), |
382 | } |
383 | } |
384 | |
385 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
386 | pub fn find(&self, haystack: &[u8]) -> Option<usize> { |
387 | self.finder.find(haystack) |
388 | } |
389 | |
390 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
391 | pub fn is_suffix(&self, text: &[u8]) -> bool { |
392 | if text.len() < self.len() { |
393 | return false; |
394 | } |
395 | &text[text.len() - self.len()..] == self.finder.needle() |
396 | } |
397 | |
398 | pub fn len(&self) -> usize { |
399 | self.finder.needle().len() |
400 | } |
401 | |
402 | pub fn char_len(&self) -> usize { |
403 | self.char_len |
404 | } |
405 | |
406 | fn approximate_size(&self) -> usize { |
407 | self.finder.needle().len() * mem::size_of::<u8>() |
408 | } |
409 | } |
410 | |
411 | fn char_len_lossy(bytes: &[u8]) -> usize { |
412 | String::from_utf8_lossy(bytes).chars().count() |
413 | } |
414 | |