stopat.rs source code [crates/regex-automata-0.4.9/src/meta/stopat.rs]

1	/!*
2	This module defines two bespoke forward DFA search routines. One for the lazy
3	DFA and one for the fully compiled DFA. These routines differ from the normal
4	ones by reporting the position at which the search terminates when a match
5	isn't found.
6
7	This position at which a search terminates is useful in contexts where the meta
8	regex engine runs optimizations that could go quadratic if we aren't careful.
9	Namely, a regex search could* scan to the end of the haystack only to report a*
10	non-match. If the caller doesn't know that the search scanned to the end of the
11	haystack, it might restart the search at the next literal candidate it finds
12	and repeat the process.
13
14	Providing the caller with the position at which the search stopped provides a
15	way for the caller to determine the point at which subsequent scans should not
16	pass. This is principally used in the "reverse inner" optimization, which works
17	like this:
18
19	1. Look for a match of an inner literal. Say, 'Z' in '\w+Z\d+'.
20	2. At the spot where 'Z' matches, do a reverse anchored search from there for
21	'\w+'.
22	3. If the reverse search matches, it corresponds to the start position of a
23	(possible) match. At this point, do a forward anchored search to find the end
24	position. If an end position is found, then we have a match and we know its
25	bounds.
26
27	If the forward anchored search in (3) searches the entire rest of the haystack
28	but reports a non-match, then a naive implementation of the above will continue
29	back at step 1 looking for more candidates. There might still be a match to be
30	found! It's possible. But we already scanned the whole haystack. So if we keep
31	repeating the process, then we might wind up taking quadratic time in the size
32	of the haystack, which is not great.
33
34	So if the forward anchored search in (3) reports the position at which it
35	stops, then we can detect whether quadratic behavior might be occurring in
36	steps (1) and (2). For (1), it occurs if the literal candidate found occurs
37	before the end of the previous search in (3), since that means we're now
38	going to look for another match in a place where the forward search has already
39	scanned. It is correct* to do so, but our technique has become inefficient.*
40	For (2), quadratic behavior occurs similarly when its reverse search extends
41	past the point where the previous forward search in (3) terminated. Indeed, to
42	implement (2), we use the sibling 'limited' module for ensuring our reverse
43	scan doesn't go further than we want.
44
45	See the 'opt/reverse-inner' benchmarks in rebar for a real demonstration of
46	how quadratic behavior is mitigated.
47	*/
48
49	use crate::{meta::error::RetryFailError, HalfMatch, Input, MatchError};
50
51	#[cfg(feature = "dfa-build")]
52	pub(crate) fn dfa_try_search_half_fwd(
53	dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
54	input: &Input<'_>,
55	) -> Result<Result<HalfMatch, usize>, RetryFailError> {
56	use crate::dfa::{accel, Automaton};
57
58	let mut mat = None;
59	let mut sid = dfa.start_state_forward(input)?;
60	let mut at = input.start();
61	while at < input.end() {
62	sid = dfa.next_state(sid, input.haystack()[at]);
63	if dfa.is_special_state(sid) {
64	if dfa.is_match_state(sid) {
65	let pattern = dfa.match_pattern(sid, `0`);
66	mat = Some(HalfMatch::new(pattern, at));
67	if input.get_earliest() {
68	return Ok(mat.ok_or(at));
69	}
70	if dfa.is_accel_state(sid) {
71	let needs = dfa.accelerator(sid);
72	at = accel::find_fwd(needs, input.haystack(), at)
73	.unwrap_or(input.end());
74	continue;
75	}
76	} else if dfa.is_accel_state(sid) {
77	let needs = dfa.accelerator(sid);
78	at = accel::find_fwd(needs, input.haystack(), at)
79	.unwrap_or(input.end());
80	continue;
81	} else if dfa.is_dead_state(sid) {
82	return Ok(mat.ok_or(at));
83	} else if dfa.is_quit_state(sid) {
84	return Err(MatchError::quit(input.haystack()[at], at).into());
85	} else {
86	// Ideally we wouldn't use a DFA that specialized start states
87	// and thus 'is_start_state()' could never be true here, but in
88	// practice we reuse the DFA created for the full regex which
89	// will specialize start states whenever there is a prefilter.
90	debug_assert!(dfa.is_start_state(sid));
91	}
92	}
93	at += `1`;
94	}
95	dfa_eoi_fwd(dfa, input, &mut sid, &mut mat)?;
96	Ok(mat.ok_or(at))
97	}
98
99	#[cfg(feature = "hybrid")]
100	pub(crate) fn hybrid_try_search_half_fwd(
101	dfa: &crate::hybrid::dfa::DFA,
102	cache: &mut crate::hybrid::dfa::Cache,
103	input: &Input<'_>,
104	) -> Result<Result<HalfMatch, usize>, RetryFailError> {
105	let mut mat = None;
106	let mut sid = dfa.start_state_forward(cache, input)?;
107	let mut at = input.start();
108	while at < input.end() {
109	sid = dfa
110	.next_state(cache, sid, input.haystack()[at])
111	.map_err(\|_\| MatchError::gave_up(at))?;
112	if sid.is_tagged() {
113	if sid.is_match() {
114	let pattern = dfa.match_pattern(cache, sid, `0`);
115	mat = Some(HalfMatch::new(pattern, at));
116	if input.get_earliest() {
117	return Ok(mat.ok_or(at));
118	}
119	} else if sid.is_dead() {
120	return Ok(mat.ok_or(at));
121	} else if sid.is_quit() {
122	return Err(MatchError::quit(input.haystack()[at], at).into());
123	} else {
124	// We should NEVER get an unknown state ID back from
125	// dfa.next_state().
126	debug_assert!(!sid.is_unknown());
127	// Ideally we wouldn't use a lazy DFA that specialized start
128	// states and thus 'sid.is_start()' could never be true here,
129	// but in practice we reuse the lazy DFA created for the full
130	// regex which will specialize start states whenever there is
131	// a prefilter.
132	debug_assert!(sid.is_start());
133	}
134	}
135	at += `1`;
136	}
137	hybrid_eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
138	Ok(mat.ok_or(at))
139	}
140
141	#[cfg(feature = "dfa-build")]
142	#[cfg_attr(feature = "perf-inline", inline(always))]
143	fn dfa_eoi_fwd(
144	dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
145	input: &Input<'_>,
146	sid: &mut crate::util::primitives::StateID,
147	mat: &mut Option<HalfMatch>,
148	) -> Result<(), MatchError> {
149	use crate::dfa::Automaton;
150
151	let sp = input.get_span();
152	match input.haystack().get(sp.end) {
153	Some(&b) => {
154	sid = dfa.next_state(sid, b);
155	if dfa.is_match_state(*sid) {
156	let pattern = dfa.match_pattern(*sid, `0`);
157	*mat = Some(HalfMatch::new(pattern, sp.end));
158	} else if dfa.is_quit_state(*sid) {
159	return Err(MatchError::quit(b, sp.end));
160	}
161	}
162	None => {
163	sid = dfa.next_eoi_state(sid);
164	if dfa.is_match_state(*sid) {
165	let pattern = dfa.match_pattern(*sid, `0`);
166	*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
167	}
168	// N.B. We don't have to check 'is_quit' here because the EOI
169	// transition can never lead to a quit state.
170	debug_assert!(!dfa.is_quit_state(*sid));
171	}
172	}
173	Ok(())
174	}
175
176	#[cfg(feature = "hybrid")]
177	#[cfg_attr(feature = "perf-inline", inline(always))]
178	fn hybrid_eoi_fwd(
179	dfa: &crate::hybrid::dfa::DFA,
180	cache: &mut crate::hybrid::dfa::Cache,
181	input: &Input<'_>,
182	sid: &mut crate::hybrid::LazyStateID,
183	mat: &mut Option<HalfMatch>,
184	) -> Result<(), MatchError> {
185	let sp = input.get_span();
186	match input.haystack().get(sp.end) {
187	Some(&b) => {
188	*sid = dfa
189	.next_state(cache, *sid, b)
190	.map_err(\|_\| MatchError::gave_up(sp.end))?;
191	if sid.is_match() {
192	let pattern = dfa.match_pattern(cache, *sid, `0`);
193	*mat = Some(HalfMatch::new(pattern, sp.end));
194	} else if sid.is_quit() {
195	return Err(MatchError::quit(b, sp.end));
196	}
197	}
198	None => {
199	*sid = dfa
200	.next_eoi_state(cache, *sid)
201	.map_err(\|_\| MatchError::gave_up(input.haystack().len()))?;
202	if sid.is_match() {
203	let pattern = dfa.match_pattern(cache, *sid, `0`);
204	*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
205	}
206	// N.B. We don't have to check 'is_quit' here because the EOI
207	// transition can never lead to a quit state.
208	debug_assert!(!sid.is_quit());
209	}
210	}
211	Ok(())
212	}
213