reverse_inner.rs source code [crates/regex-automata-0.4.9/src/meta/reverse_inner.rs]

1	/!*
2	A module dedicated to plucking inner literals out of a regex pattern, and
3	then constructing a prefilter for them. We also include a regex pattern
4	"prefix" that corresponds to the bits of the regex that need to match before
5	the literals do. The reverse inner optimization then proceeds by looking for
6	matches of the inner literal(s), and then doing a reverse search of the prefix
7	from the start of the literal match to find the overall start position of the
8	match.
9
10	The essential invariant we want to uphold here is that the literals we return
11	reflect a set where at least* one of them must match in order for the overall*
12	regex to match. We also need to maintain the invariant that the regex prefix
13	returned corresponds to the entirety of the regex up until the literals we
14	return.
15
16	This somewhat limits what we can do. That is, if we a regex like
17	`\w+(@!\|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
18	from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
19	we have a regex like \d+@!\|\w+%%`, then we get kind of stymied. Technically,
20	we could still extract `{@!, %%}`, and it is true that at least of them must
21	match. But then, what is our regex prefix? Again, in theory, that could be
22	`\d+\|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
23	matches, and `\w+` only matches when `%%` matches.
24
25	All of that is technically possible to do, but it seemingly requires a lot of
26	sophistication and machinery. Probably the way to tackle that is with some kind
27	of formalism and approach this problem more generally.
28
29	For now, the code below basically just looks for a top-level concatenation.
30	And if it can find one, it looks for literals in each of the direct child
31	sub-expressions of that concatenation. If some good ones are found, we return
32	those and a concatenation of the Hir expressions seen up to that point.
33	*/
34
35	use alloc::vec::Vec;
36
37	use regex_syntax::hir::{self, literal, Hir, HirKind};
38
39	use crate::{util::prefilter::Prefilter, MatchKind};
40
41	/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
42	/// one was found, then a concatenation of the HIR expressions that precede it
43	/// is returned.
44	///
45	/// The idea here is that the prefilter returned can be used to find candidate
46	/// matches. And then the HIR returned can be used to build a reverse regex
47	/// matcher, which will find the start of the candidate match. Finally, the
48	/// match still has to be confirmed with a normal anchored forward scan to find
49	/// the end position of the match.
50	///
51	/// Note that this assumes leftmost-first match semantics, so callers must
52	/// not call this otherwise.
53	pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
54	if hirs.len() != `1` {
55	debug!(
56	"skipping reverse inner optimization since it only \
57	supports 1 pattern, {} were given",
58	hirs.len(),
59	);
60	return None;
61	}
62	let mut concat = match top_concat(hirs[`0`]) {
63	Some(concat) => concat,
64	None => {
65	debug!(
66	"skipping reverse inner optimization because a top-level \
67	concatenation could not found",
68	);
69	return None;
70	}
71	};
72	// We skip the first HIR because if it did have a prefix prefilter in it,
73	// we probably wouldn't be here looking for an inner prefilter.
74	for i in `1`..concat.len() {
75	let hir = &concat[i];
76	let pre = match prefilter(hir) {
77	None => continue,
78	Some(pre) => pre,
79	};
80	// Even if we got a prefilter, if it isn't consider "fast," then we
81	// probably don't want to bother with it. Namely, since the reverse
82	// inner optimization requires some overhead, it likely only makes
83	// sense if the prefilter scan itself is (believed) to be much faster
84	// than the regex engine.
85	if !pre.is_fast() {
86	debug!(
87	"skipping extracted inner prefilter because \
88	it probably isn't fast"
89	);
90	continue;
91	}
92	let concat_suffix = Hir::concat(concat.split_off(i));
93	let concat_prefix = Hir::concat(concat);
94	// Look for a prefilter again. Why? Because above we only looked for
95	// a prefilter on the individual 'hir', but we might be able to find
96	// something better and more discriminatory by looking at the entire
97	// suffix. We don't do this above to avoid making this loop worst case
98	// quadratic in the length of 'concat'.
99	let pre2 = match prefilter(&concat_suffix) {
100	None => pre,
101	Some(pre2) => {
102	if pre2.is_fast() {
103	pre2
104	} else {
105	pre
106	}
107	}
108	};
109	return Some((concat_prefix, pre2));
110	}
111	debug!(
112	"skipping reverse inner optimization because a top-level \
113	sub-expression with a fast prefilter could not be found"
114	);
115	None
116	}
117
118	/// Attempt to extract a prefilter from an HIR expression.
119	///
120	/// We do a little massaging here to do our best that the prefilter we get out
121	/// of this is probably* fast. Basically, the false positive rate has a much*
122	/// higher impact for things like the reverse inner optimization because more
123	/// work needs to potentially be done for each candidate match.
124	///
125	/// Note that this assumes leftmost-first match semantics, so callers must
126	/// not call this otherwise.
127	fn prefilter(hir: &Hir) -> Option<Prefilter> {
128	let mut extractor = literal::Extractor::new();
129	extractor.kind(literal::ExtractKind::Prefix);
130	let mut prefixes = extractor.extract(hir);
131	debug!(
132	"inner prefixes (len={:?}) extracted before optimization: {:?}",
133	prefixes.len(),
134	prefixes
135	);
136	// Since these are inner literals, we know they cannot be exact. But the
137	// extractor doesn't know this. We mark them as inexact because this might
138	// impact literal optimization. Namely, optimization weights "all literals
139	// are exact" as very high, because it presumes that any match results in
140	// an overall match. But of course, that is not the case here.
141	//
142	// In practice, this avoids plucking out a ASCII-only \s as an alternation
143	// of single-byte whitespace characters.
144	prefixes.make_inexact();
145	prefixes.optimize_for_prefix_by_preference();
146	debug!(
147	"inner prefixes (len={:?}) extracted after optimization: {:?}",
148	prefixes.len(),
149	prefixes
150	);
151	prefixes
152	.literals()
153	.and_then(\|lits\| Prefilter::new(MatchKind::LeftmostFirst, lits))
154	}
155
156	/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
157	/// try to return one even if it's embedded in a capturing group, but is
158	/// otherwise pretty conservative in what is returned.
159	///
160	/// The HIR returned is a complete copy of the concat with all capturing
161	/// groups removed. In effect, the concat returned is "flattened" with respect
162	/// to capturing groups. This makes the detection logic above for prefixes
163	/// a bit simpler, and it works because 1) capturing groups never influence
164	/// whether a match occurs or not and 2) capturing groups are not used when
165	/// doing the reverse inner search to find the start of the match.
166	fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
167	loop {
168	hir = match hir.kind() {
169	HirKind::Empty
170	\| HirKind::Literal(_)
171	\| HirKind::Class(_)
172	\| HirKind::Look(_)
173	\| HirKind::Repetition(_)
174	\| HirKind::Alternation(_) => return None,
175	HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
176	HirKind::Concat(ref subs) => {
177	// We are careful to only do the flattening/copy when we know
178	// we have a "top level" concat we can inspect. This avoids
179	// doing extra work in cases where we definitely won't use it.
180	// (This might still be wasted work if we can't go on to find
181	// some literals to extract.)
182	let concat =
183	Hir::concat(subs.iter().map(\|h\| flatten(h)).collect());
184	return match concat.into_kind() {
185	HirKind::Concat(xs) => Some(xs),
186	// It is actually possible for this case to occur, because
187	// 'Hir::concat' might simplify the expression to the point
188	// that concatenations are actually removed. One wonders
189	// whether this leads to other cases where we should be
190	// extracting literals, but in theory, I believe if we do
191	// get here, then it means that a "real" prefilter failed
192	// to be extracted and we should probably leave well enough
193	// alone. (A "real" prefilter is unbothered by "top-level
194	// concats" and "capturing groups.")
195	_ => return None,
196	};
197	}
198	};
199	}
200	}
201
202	/// Returns a copy of the given HIR but with all capturing groups removed.
203	fn flatten(hir: &Hir) -> Hir {
204	match hir.kind() {
205	HirKind::Empty => Hir::empty(),
206	HirKind::Literal(hir::Literal(ref x: &Box<[u8]>)) => Hir::literal(lit:x.clone()),
207	HirKind::Class(ref x: &Class) => Hir::class(x.clone()),
208	HirKind::Look(ref x: &Look) => Hir::look(x.clone()),
209	HirKind::Repetition(ref x: &Repetition) => Hir::repetition(rep:x.with(sub:flatten(&x.sub))),
210	// This is the interesting case. We just drop the group information
211	// entirely and use the child HIR itself.
212	HirKind::Capture(hir::Capture { ref sub: &Box, .. }) => flatten(hir:sub),
213	HirKind::Alternation(ref xs: &Vec) => {
214	Hir::alternation(subs:xs.iter().map(\|x: &Hir\| flatten(hir:x)).collect())
215	}
216	HirKind::Concat(ref xs: &Vec) => {
217	Hir::concat(subs:xs.iter().map(\|x: &Hir\| flatten(hir:x)).collect())
218	}
219	}
220	}
221