1/*!
2A module dedicated to plucking inner literals out of a regex pattern, and
3then constructing a prefilter for them. We also include a regex pattern
4"prefix" that corresponds to the bits of the regex that need to match before
5the literals do. The reverse inner optimization then proceeds by looking for
6matches of the inner literal(s), and then doing a reverse search of the prefix
7from the start of the literal match to find the overall start position of the
8match.
9
10The essential invariant we want to uphold here is that the literals we return
11reflect a set where *at least* one of them must match in order for the overall
12regex to match. We also need to maintain the invariant that the regex prefix
13returned corresponds to the entirety of the regex up until the literals we
14return.
15
16This somewhat limits what we can do. That is, if we a regex like
17`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
18from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
19we have a regex like \d+@!|\w+%%`, then we get kind of stymied. Technically,
20we could still extract `{@!, %%}`, and it is true that at least of them must
21match. But then, what is our regex prefix? Again, in theory, that could be
22`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
23matches, and `\w+` only matches when `%%` matches.
24
25All of that is technically possible to do, but it seemingly requires a lot of
26sophistication and machinery. Probably the way to tackle that is with some kind
27of formalism and approach this problem more generally.
28
29For now, the code below basically just looks for a top-level concatenation.
30And if it can find one, it looks for literals in each of the direct child
31sub-expressions of that concatenation. If some good ones are found, we return
32those and a concatenation of the Hir expressions seen up to that point.
33*/
34
35use alloc::vec::Vec;
36
37use regex_syntax::hir::{self, literal, Hir, HirKind};
38
39use crate::{util::prefilter::Prefilter, MatchKind};
40
41/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
42/// one was found, then a concatenation of the HIR expressions that precede it
43/// is returned.
44///
45/// The idea here is that the prefilter returned can be used to find candidate
46/// matches. And then the HIR returned can be used to build a reverse regex
47/// matcher, which will find the start of the candidate match. Finally, the
48/// match still has to be confirmed with a normal anchored forward scan to find
49/// the end position of the match.
50///
51/// Note that this assumes leftmost-first match semantics, so callers must
52/// not call this otherwise.
53pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
54 if hirs.len() != 1 {
55 debug!(
56 "skipping reverse inner optimization since it only \
57 supports 1 pattern, {} were given",
58 hirs.len(),
59 );
60 return None;
61 }
62 let mut concat = match top_concat(hirs[0]) {
63 Some(concat) => concat,
64 None => {
65 debug!(
66 "skipping reverse inner optimization because a top-level \
67 concatenation could not found",
68 );
69 return None;
70 }
71 };
72 // We skip the first HIR because if it did have a prefix prefilter in it,
73 // we probably wouldn't be here looking for an inner prefilter.
74 for i in 1..concat.len() {
75 let hir = &concat[i];
76 let pre = match prefilter(hir) {
77 None => continue,
78 Some(pre) => pre,
79 };
80 // Even if we got a prefilter, if it isn't consider "fast," then we
81 // probably don't want to bother with it. Namely, since the reverse
82 // inner optimization requires some overhead, it likely only makes
83 // sense if the prefilter scan itself is (believed) to be much faster
84 // than the regex engine.
85 if !pre.is_fast() {
86 debug!(
87 "skipping extracted inner prefilter because \
88 it probably isn't fast"
89 );
90 continue;
91 }
92 let concat_suffix = Hir::concat(concat.split_off(i));
93 let concat_prefix = Hir::concat(concat);
94 // Look for a prefilter again. Why? Because above we only looked for
95 // a prefilter on the individual 'hir', but we might be able to find
96 // something better and more discriminatory by looking at the entire
97 // suffix. We don't do this above to avoid making this loop worst case
98 // quadratic in the length of 'concat'.
99 let pre2 = match prefilter(&concat_suffix) {
100 None => pre,
101 Some(pre2) => {
102 if pre2.is_fast() {
103 pre2
104 } else {
105 pre
106 }
107 }
108 };
109 return Some((concat_prefix, pre2));
110 }
111 debug!(
112 "skipping reverse inner optimization because a top-level \
113 sub-expression with a fast prefilter could not be found"
114 );
115 None
116}
117
118/// Attempt to extract a prefilter from an HIR expression.
119///
120/// We do a little massaging here to do our best that the prefilter we get out
121/// of this is *probably* fast. Basically, the false positive rate has a much
122/// higher impact for things like the reverse inner optimization because more
123/// work needs to potentially be done for each candidate match.
124///
125/// Note that this assumes leftmost-first match semantics, so callers must
126/// not call this otherwise.
127fn prefilter(hir: &Hir) -> Option<Prefilter> {
128 let mut extractor = literal::Extractor::new();
129 extractor.kind(literal::ExtractKind::Prefix);
130 let mut prefixes = extractor.extract(hir);
131 debug!(
132 "inner prefixes (len={:?}) extracted before optimization: {:?}",
133 prefixes.len(),
134 prefixes
135 );
136 // Since these are inner literals, we know they cannot be exact. But the
137 // extractor doesn't know this. We mark them as inexact because this might
138 // impact literal optimization. Namely, optimization weights "all literals
139 // are exact" as very high, because it presumes that any match results in
140 // an overall match. But of course, that is not the case here.
141 //
142 // In practice, this avoids plucking out a ASCII-only \s as an alternation
143 // of single-byte whitespace characters.
144 prefixes.make_inexact();
145 prefixes.optimize_for_prefix_by_preference();
146 debug!(
147 "inner prefixes (len={:?}) extracted after optimization: {:?}",
148 prefixes.len(),
149 prefixes
150 );
151 prefixes
152 .literals()
153 .and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits))
154}
155
156/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
157/// try to return one even if it's embedded in a capturing group, but is
158/// otherwise pretty conservative in what is returned.
159///
160/// The HIR returned is a complete copy of the concat with all capturing
161/// groups removed. In effect, the concat returned is "flattened" with respect
162/// to capturing groups. This makes the detection logic above for prefixes
163/// a bit simpler, and it works because 1) capturing groups never influence
164/// whether a match occurs or not and 2) capturing groups are not used when
165/// doing the reverse inner search to find the start of the match.
166fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
167 loop {
168 hir = match hir.kind() {
169 HirKind::Empty
170 | HirKind::Literal(_)
171 | HirKind::Class(_)
172 | HirKind::Look(_)
173 | HirKind::Repetition(_)
174 | HirKind::Alternation(_) => return None,
175 HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
176 HirKind::Concat(ref subs) => {
177 // We are careful to only do the flattening/copy when we know
178 // we have a "top level" concat we can inspect. This avoids
179 // doing extra work in cases where we definitely won't use it.
180 // (This might still be wasted work if we can't go on to find
181 // some literals to extract.)
182 let concat =
183 Hir::concat(subs.iter().map(|h| flatten(h)).collect());
184 return match concat.into_kind() {
185 HirKind::Concat(xs) => Some(xs),
186 // It is actually possible for this case to occur, because
187 // 'Hir::concat' might simplify the expression to the point
188 // that concatenations are actually removed. One wonders
189 // whether this leads to other cases where we should be
190 // extracting literals, but in theory, I believe if we do
191 // get here, then it means that a "real" prefilter failed
192 // to be extracted and we should probably leave well enough
193 // alone. (A "real" prefilter is unbothered by "top-level
194 // concats" and "capturing groups.")
195 _ => return None,
196 };
197 }
198 };
199 }
200}
201
202/// Returns a copy of the given HIR but with all capturing groups removed.
203fn flatten(hir: &Hir) -> Hir {
204 match hir.kind() {
205 HirKind::Empty => Hir::empty(),
206 HirKind::Literal(hir::Literal(ref x: &Box<[u8]>)) => Hir::literal(lit:x.clone()),
207 HirKind::Class(ref x: &Class) => Hir::class(x.clone()),
208 HirKind::Look(ref x: &Look) => Hir::look(x.clone()),
209 HirKind::Repetition(ref x: &Repetition) => Hir::repetition(rep:x.with(sub:flatten(&x.sub))),
210 // This is the interesting case. We just drop the group information
211 // entirely and use the child HIR itself.
212 HirKind::Capture(hir::Capture { ref sub: &Box, .. }) => flatten(hir:sub),
213 HirKind::Alternation(ref xs: &Vec) => {
214 Hir::alternation(subs:xs.iter().map(|x: &Hir| flatten(hir:x)).collect())
215 }
216 HirKind::Concat(ref xs: &Vec) => {
217 Hir::concat(subs:xs.iter().map(|x: &Hir| flatten(hir:x)).collect())
218 }
219 }
220}
221