1 | use alloc::{vec, vec::Vec}; |
2 | |
3 | use regex_syntax::hir::Hir; |
4 | |
5 | use crate::{meta::regex::RegexInfo, util::search::MatchKind}; |
6 | |
7 | /// Pull out an alternation of literals from the given sequence of HIR |
8 | /// expressions. |
9 | /// |
10 | /// There are numerous ways for this to fail. Generally, this only applies |
11 | /// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there |
12 | /// are "too few" alternates, in which case, the regex engine is likely faster. |
13 | /// |
14 | /// And currently, this only returns something when 'hirs.len() == 1'. |
15 | pub(crate) fn alternation_literals( |
16 | info: &RegexInfo, |
17 | hirs: &[&Hir], |
18 | ) -> Option<Vec<Vec<u8>>> { |
19 | use regex_syntax::hir::{HirKind, Literal}; |
20 | |
21 | // Might as well skip the work below if we know we can't build an |
22 | // Aho-Corasick searcher. |
23 | if !cfg!(feature = "perf-literal-multisubstring" ) { |
24 | return None; |
25 | } |
26 | // This is pretty hacky, but basically, if `is_alternation_literal` is |
27 | // true, then we can make several assumptions about the structure of our |
28 | // HIR. This is what justifies the `unreachable!` statements below. |
29 | if hirs.len() != 1 |
30 | || !info.props()[0].look_set().is_empty() |
31 | || info.props()[0].explicit_captures_len() > 0 |
32 | || !info.props()[0].is_alternation_literal() |
33 | || info.config().get_match_kind() != MatchKind::LeftmostFirst |
34 | { |
35 | return None; |
36 | } |
37 | let hir = &hirs[0]; |
38 | let alts = match *hir.kind() { |
39 | HirKind::Alternation(ref alts) => alts, |
40 | _ => return None, // one literal isn't worth it |
41 | }; |
42 | |
43 | let mut lits = vec![]; |
44 | for alt in alts { |
45 | let mut lit = vec![]; |
46 | match *alt.kind() { |
47 | HirKind::Literal(Literal(ref bytes)) => { |
48 | lit.extend_from_slice(bytes) |
49 | } |
50 | HirKind::Concat(ref exprs) => { |
51 | for e in exprs { |
52 | match *e.kind() { |
53 | HirKind::Literal(Literal(ref bytes)) => { |
54 | lit.extend_from_slice(bytes); |
55 | } |
56 | _ => unreachable!("expected literal, got {:?}" , e), |
57 | } |
58 | } |
59 | } |
60 | _ => unreachable!("expected literal or concat, got {:?}" , alt), |
61 | } |
62 | lits.push(lit); |
63 | } |
64 | // Why do this? Well, when the number of literals is small, it's likely |
65 | // that we'll use the lazy DFA which is in turn likely to be faster than |
66 | // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have |
67 | // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use |
68 | // the latter because it is so hungry (in time and space), and the former |
69 | // is decently fast, but not as fast as a well oiled lazy DFA. |
70 | // |
71 | // However, once the number starts getting large, the lazy DFA is likely |
72 | // to start thrashing because of the modest default cache size. When |
73 | // exactly does this happen? Dunno. But at whatever point that is (we make |
74 | // a guess below based on ad hoc benchmarking), we'll want to cut over to |
75 | // Aho-Corasick, where even the contiguous NFA is likely to do much better. |
76 | if lits.len() < 3000 { |
77 | debug!("skipping Aho-Corasick because there are too few literals" ); |
78 | return None; |
79 | } |
80 | Some(lits) |
81 | } |
82 | |