| 1 | use alloc::{vec, vec::Vec}; |
| 2 | |
| 3 | use regex_syntax::hir::Hir; |
| 4 | |
| 5 | use crate::{meta::regex::RegexInfo, util::search::MatchKind}; |
| 6 | |
| 7 | /// Pull out an alternation of literals from the given sequence of HIR |
| 8 | /// expressions. |
| 9 | /// |
| 10 | /// There are numerous ways for this to fail. Generally, this only applies |
| 11 | /// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there |
| 12 | /// are "too few" alternates, in which case, the regex engine is likely faster. |
| 13 | /// |
| 14 | /// And currently, this only returns something when 'hirs.len() == 1'. |
| 15 | pub(crate) fn alternation_literals( |
| 16 | info: &RegexInfo, |
| 17 | hirs: &[&Hir], |
| 18 | ) -> Option<Vec<Vec<u8>>> { |
| 19 | use regex_syntax::hir::{HirKind, Literal}; |
| 20 | |
| 21 | // Might as well skip the work below if we know we can't build an |
| 22 | // Aho-Corasick searcher. |
| 23 | if !cfg!(feature = "perf-literal-multisubstring" ) { |
| 24 | return None; |
| 25 | } |
| 26 | // This is pretty hacky, but basically, if `is_alternation_literal` is |
| 27 | // true, then we can make several assumptions about the structure of our |
| 28 | // HIR. This is what justifies the `unreachable!` statements below. |
| 29 | if hirs.len() != 1 |
| 30 | || !info.props()[0].look_set().is_empty() |
| 31 | || info.props()[0].explicit_captures_len() > 0 |
| 32 | || !info.props()[0].is_alternation_literal() |
| 33 | || info.config().get_match_kind() != MatchKind::LeftmostFirst |
| 34 | { |
| 35 | return None; |
| 36 | } |
| 37 | let hir = &hirs[0]; |
| 38 | let alts = match *hir.kind() { |
| 39 | HirKind::Alternation(ref alts) => alts, |
| 40 | _ => return None, // one literal isn't worth it |
| 41 | }; |
| 42 | |
| 43 | let mut lits = vec![]; |
| 44 | for alt in alts { |
| 45 | let mut lit = vec![]; |
| 46 | match *alt.kind() { |
| 47 | HirKind::Literal(Literal(ref bytes)) => { |
| 48 | lit.extend_from_slice(bytes) |
| 49 | } |
| 50 | HirKind::Concat(ref exprs) => { |
| 51 | for e in exprs { |
| 52 | match *e.kind() { |
| 53 | HirKind::Literal(Literal(ref bytes)) => { |
| 54 | lit.extend_from_slice(bytes); |
| 55 | } |
| 56 | _ => unreachable!("expected literal, got {:?}" , e), |
| 57 | } |
| 58 | } |
| 59 | } |
| 60 | _ => unreachable!("expected literal or concat, got {:?}" , alt), |
| 61 | } |
| 62 | lits.push(lit); |
| 63 | } |
| 64 | // Why do this? Well, when the number of literals is small, it's likely |
| 65 | // that we'll use the lazy DFA which is in turn likely to be faster than |
| 66 | // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have |
| 67 | // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use |
| 68 | // the latter because it is so hungry (in time and space), and the former |
| 69 | // is decently fast, but not as fast as a well oiled lazy DFA. |
| 70 | // |
| 71 | // However, once the number starts getting large, the lazy DFA is likely |
| 72 | // to start thrashing because of the modest default cache size. When |
| 73 | // exactly does this happen? Dunno. But at whatever point that is (we make |
| 74 | // a guess below based on ad hoc benchmarking), we'll want to cut over to |
| 75 | // Aho-Corasick, where even the contiguous NFA is likely to do much better. |
| 76 | if lits.len() < 3000 { |
| 77 | debug!("skipping Aho-Corasick because there are too few literals" ); |
| 78 | return None; |
| 79 | } |
| 80 | Some(lits) |
| 81 | } |
| 82 | |