literal.rs - Codebrowser

1	use alloc::{vec, vec::Vec};
2
3	use regex_syntax::hir::Hir;
4
5	use crate::{meta::regex::RegexInfo, util::search::MatchKind};
6
7	/// Pull out an alternation of literals from the given sequence of HIR
8	/// expressions.
9	///
10	/// There are numerous ways for this to fail. Generally, this only applies
11	/// to regexes of the form 'foo\|bar\|baz\|...\|quux'. It can also fail if there
12	/// are "too few" alternates, in which case, the regex engine is likely faster.
13	///
14	/// And currently, this only returns something when 'hirs.len() == 1'.
15	pub(crate) fn alternation_literals(
16	info: &RegexInfo,
17	hirs: &[&Hir],
18	) -> Option<Vec<Vec<u8>>> {
19	use regex_syntax::hir::{HirKind, Literal};
20
21	// Might as well skip the work below if we know we can't build an
22	// Aho-Corasick searcher.
23	if !cfg!(feature = "perf-literal-multisubstring") {
24	return None;
25	}
26	// This is pretty hacky, but basically, if `is_alternation_literal` is
27	// true, then we can make several assumptions about the structure of our
28	// HIR. This is what justifies the `unreachable!` statements below.
29	if hirs.len() != `1`
30	\|\| !info.props()[`0`].look_set().is_empty()
31	\|\| info.props()[`0`].explicit_captures_len() > `0`
32	\|\| !info.props()[`0`].is_alternation_literal()
33	\|\| info.config().get_match_kind() != MatchKind::LeftmostFirst
34	{
35	return None;
36	}
37	let hir = &hirs[`0`];
38	let alts = match *hir.kind() {
39	HirKind::Alternation(ref alts) => alts,
40	_ => return None, // one literal isn't worth it
41	};
42
43	let mut lits = vec![];
44	for alt in alts {
45	let mut lit = vec![];
46	match *alt.kind() {
47	HirKind::Literal(Literal(ref bytes)) => {
48	lit.extend_from_slice(bytes)
49	}
50	HirKind::Concat(ref exprs) => {
51	for e in exprs {
52	match *e.kind() {
53	HirKind::Literal(Literal(ref bytes)) => {
54	lit.extend_from_slice(bytes);
55	}
56	_ => unreachable!("expected literal, got {:?}", e),
57	}
58	}
59	}
60	_ => unreachable!("expected literal or concat, got {:?}", alt),
61	}
62	lits.push(lit);
63	}
64	// Why do this? Well, when the number of literals is small, it's likely
65	// that we'll use the lazy DFA which is in turn likely to be faster than
66	// Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
67	// a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
68	// the latter because it is so hungry (in time and space), and the former
69	// is decently fast, but not as fast as a well oiled lazy DFA.
70	//
71	// However, once the number starts getting large, the lazy DFA is likely
72	// to start thrashing because of the modest default cache size. When
73	// exactly does this happen? Dunno. But at whatever point that is (we make
74	// a guess below based on ad hoc benchmarking), we'll want to cut over to
75	// Aho-Corasick, where even the contiguous NFA is likely to do much better.
76	if lits.len() < `3000` {
77	debug!("skipping Aho-Corasick because there are too few literals");
78	return None;
79	}
80	Some(lits)
81	}
82