search.rs source code [crates/regex_automata/src/dfa/search.rs]

1	use crate::{
2	dfa::{
3	accel,
4	automaton::{Automaton, OverlappingState, StateMatch},
5	},
6	util::{
7	id::{PatternID, StateID},
8	matchtypes::HalfMatch,
9	prefilter, MATCH_OFFSET,
10	},
11	MatchError,
12	};
13
14	#[inline(never)]
15	pub fn find_earliest_fwd<A: Automaton + ?Sized>(
16	pre: Option<&mut prefilter::Scanner>,
17	dfa: &A,
18	pattern_id: Option<PatternID>,
19	bytes: &[u8],
20	start: usize,
21	end: usize,
22	) -> Result<Option<HalfMatch>, MatchError> {
23	// Searching with a pattern ID is always anchored, so we should never use
24	// a prefilter.
25	if pre.is_some() && pattern_id.is_none() {
26	find_fwd(pre, earliest:`true`, dfa, pattern_id, haystack:bytes, start, end)
27	} else {
28	find_fwd(pre:None, earliest:`true`, dfa, pattern_id, haystack:bytes, start, end)
29	}
30	}
31
32	#[inline(never)]
33	pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
34	pre: Option<&mut prefilter::Scanner>,
35	dfa: &A,
36	pattern_id: Option<PatternID>,
37	bytes: &[u8],
38	start: usize,
39	end: usize,
40	) -> Result<Option<HalfMatch>, MatchError> {
41	// Searching with a pattern ID is always anchored, so we should never use
42	// a prefilter.
43	if pre.is_some() && pattern_id.is_none() {
44	find_fwd(pre, earliest:`false`, dfa, pattern_id, haystack:bytes, start, end)
45	} else {
46	find_fwd(pre:None, earliest:`false`, dfa, pattern_id, haystack:bytes, start, end)
47	}
48	}
49
50	/// This is marked as `inline(always)` specifically because it supports
51	/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters
52	/// getting inlined eliminate some critical branches. To avoid bloating binary
53	/// size, we only call this function in a fixed number of places.
54	#[inline(always)]
55	fn find_fwd<A: Automaton + ?Sized>(
56	mut pre: Option<&mut prefilter::Scanner>,
57	earliest: bool,
58	dfa: &A,
59	pattern_id: Option<PatternID>,
60	haystack: &[u8],
61	start: usize,
62	end: usize,
63	) -> Result<Option<HalfMatch>, MatchError> {
64	assert!(start <= end);
65	assert!(start <= haystack.len());
66	assert!(end <= haystack.len());
67
68	// Why do this? This lets 'bytes[at]' work without bounds checks below.
69	// It seems the assert on 'end <= haystack.len()' above is otherwise
70	// not enough. Why not just make 'bytes' scoped this way anyway? Well,
71	// 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
72	// for resolving look-ahead.
73	let bytes = &haystack[..end];
74
75	let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?;
76	let mut last_match = None;
77	let mut at = start;
78	if let Some(ref mut pre) = pre {
79	// If a prefilter doesn't report false positives, then we don't need to
80	// touch the DFA at all. However, since all matches include the pattern
81	// ID, and the prefilter infrastructure doesn't report pattern IDs, we
82	// limit this optimization to cases where there is exactly one pattern.
83	// In that case, any match must be the 0th pattern.
84	if dfa.pattern_count() == `1` && !pre.reports_false_positives() {
85	return Ok(pre.next_candidate(bytes, at).into_option().map(
86	\|offset\| HalfMatch { pattern: PatternID::ZERO, offset },
87	));
88	} else if pre.is_effective(at) {
89	match pre.next_candidate(bytes, at).into_option() {
90	None => return Ok(None),
91	Some(i) => {
92	at = i;
93	}
94	}
95	}
96	}
97	while at < end {
98	let byte = bytes[at];
99	state = dfa.next_state(state, byte);
100	at += `1`;
101	if dfa.is_special_state(state) {
102	if dfa.is_start_state(state) {
103	if let Some(ref mut pre) = pre {
104	if pre.is_effective(at) {
105	match pre.next_candidate(bytes, at).into_option() {
106	None => return Ok(None),
107	Some(i) => {
108	at = i;
109	}
110	}
111	}
112	} else if dfa.is_accel_state(state) {
113	let needles = dfa.accelerator(state);
114	at = accel::find_fwd(needles, bytes, at)
115	.unwrap_or(bytes.len());
116	}
117	} else if dfa.is_match_state(state) {
118	last_match = Some(HalfMatch {
119	pattern: dfa.match_pattern(state, `0`),
120	offset: at - MATCH_OFFSET,
121	});
122	if earliest {
123	return Ok(last_match);
124	}
125	if dfa.is_accel_state(state) {
126	let needles = dfa.accelerator(state);
127	at = accel::find_fwd(needles, bytes, at)
128	.unwrap_or(bytes.len());
129	}
130	} else if dfa.is_accel_state(state) {
131	let needs = dfa.accelerator(state);
132	at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
133	} else if dfa.is_dead_state(state) {
134	return Ok(last_match);
135	} else {
136	debug_assert!(dfa.is_quit_state(state));
137	if last_match.is_some() {
138	return Ok(last_match);
139	}
140	return Err(MatchError::Quit { byte, offset: at - `1` });
141	}
142	}
143	while at < end && dfa.next_state(state, bytes[at]) == state {
144	at += `1`;
145	}
146	}
147	Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match))
148	}
149
150	#[inline(never)]
151	pub fn find_earliest_rev<A: Automaton + ?Sized>(
152	dfa: &A,
153	pattern_id: Option<PatternID>,
154	bytes: &[u8],
155	start: usize,
156	end: usize,
157	) -> Result<Option<HalfMatch>, MatchError> {
158	find_rev(earliest:`true`, dfa, pattern_id, bytes, start, end)
159	}
160
161	#[inline(never)]
162	pub fn find_leftmost_rev<A: Automaton + ?Sized>(
163	dfa: &A,
164	pattern_id: Option<PatternID>,
165	bytes: &[u8],
166	start: usize,
167	end: usize,
168	) -> Result<Option<HalfMatch>, MatchError> {
169	find_rev(earliest:`false`, dfa, pattern_id, bytes, start, end)
170	}
171
172	/// This is marked as `inline(always)` specifically because it supports
173	/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
174	/// permits eliminating a few crucial branches.
175	#[inline(always)]
176	fn find_rev<A: Automaton + ?Sized>(
177	earliest: bool,
178	dfa: &A,
179	pattern_id: Option<PatternID>,
180	bytes: &[u8],
181	start: usize,
182	end: usize,
183	) -> Result<Option<HalfMatch>, MatchError> {
184	assert!(start <= end);
185	assert!(start <= bytes.len());
186	assert!(end <= bytes.len());
187
188	let mut state = init_rev(dfa, pattern_id, bytes, start, end)?;
189	let mut last_match = None;
190	let mut at = end;
191	while at > start {
192	at -= `1`;
193	while at > start && dfa.next_state(state, bytes[at]) == state {
194	at -= `1`;
195	}
196
197	let byte = bytes[at];
198	state = dfa.next_state(state, byte);
199	if dfa.is_special_state(state) {
200	if dfa.is_start_state(state) {
201	if dfa.is_accel_state(state) {
202	let needles = dfa.accelerator(state);
203	at = accel::find_rev(needles, bytes, at)
204	.map(\|i\| i + `1`)
205	.unwrap_or(`0`);
206	}
207	} else if dfa.is_match_state(state) {
208	last_match = Some(HalfMatch {
209	pattern: dfa.match_pattern(state, `0`),
210	offset: at + MATCH_OFFSET,
211	});
212	if earliest {
213	return Ok(last_match);
214	}
215	if dfa.is_accel_state(state) {
216	let needles = dfa.accelerator(state);
217	at = accel::find_rev(needles, bytes, at)
218	.map(\|i\| i + `1`)
219	.unwrap_or(`0`);
220	}
221	} else if dfa.is_accel_state(state) {
222	let needles = dfa.accelerator(state);
223	at = accel::find_rev(needles, bytes, at)
224	.map(\|i\| i + `1`)
225	.unwrap_or(`0`);
226	} else if dfa.is_dead_state(state) {
227	return Ok(last_match);
228	} else {
229	debug_assert!(dfa.is_quit_state(state));
230	if last_match.is_some() {
231	return Ok(last_match);
232	}
233	return Err(MatchError::Quit { byte, offset: at });
234	}
235	}
236	}
237	Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match))
238	}
239
240	#[inline(never)]
241	pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
242	pre: Option<&mut prefilter::Scanner>,
243	dfa: &A,
244	pattern_id: Option<PatternID>,
245	bytes: &[u8],
246	start: usize,
247	end: usize,
248	caller_state: &mut OverlappingState,
249	) -> Result<Option<HalfMatch>, MatchError> {
250	// Searching with a pattern ID is always anchored, so we should only ever
251	// use a prefilter when no pattern ID is given.
252	if pre.is_some() && pattern_id.is_none() {
253	find_overlapping_fwd_imp(
254	pre,
255	dfa,
256	pattern_id,
257	bytes,
258	start,
259	end,
260	caller_state,
261	)
262	} else {
263	find_overlapping_fwd_imp(
264	None,
265	dfa,
266	pattern_id,
267	bytes,
268	start,
269	end,
270	caller_state,
271	)
272	}
273	}
274
275	/// This is marked as `inline(always)` specifically because it supports
276	/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined
277	/// permits eliminating a few crucial branches and reduces code size when it is
278	/// not used.
279	#[inline(always)]
280	fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
281	mut pre: Option<&mut prefilter::Scanner>,
282	dfa: &A,
283	pattern_id: Option<PatternID>,
284	bytes: &[u8],
285	mut start: usize,
286	end: usize,
287	caller_state: &mut OverlappingState,
288	) -> Result<Option<HalfMatch>, MatchError> {
289	assert!(start <= end);
290	assert!(start <= bytes.len());
291	assert!(end <= bytes.len());
292
293	let mut state = match caller_state.id() {
294	None => init_fwd(dfa, pattern_id, bytes, start, end)?,
295	Some(id) => {
296	if let Some(last) = caller_state.last_match() {
297	let match_count = dfa.match_count(id);
298	if last.match_index < match_count {
299	let m = HalfMatch {
300	pattern: dfa.match_pattern(id, last.match_index),
301	offset: last.offset,
302	};
303	last.match_index += `1`;
304	return Ok(Some(m));
305	}
306	}
307
308	// This is a subtle but critical detail. If the caller provides a
309	// non-None state ID, then it must be the case that the state ID
310	// corresponds to one set by this function. The state ID therefore
311	// corresponds to a match state, a dead state or some other state.
312	// However, "some other" state _only_ occurs when the input has
313	// been exhausted because the only way to stop before then is to
314	// see a match or a dead/quit state.
315	//
316	// If the input is exhausted or if it's a dead state, then
317	// incrementing the starting position has no relevance on
318	// correctness, since the loop below will either not execute
319	// at all or will immediately stop due to being in a dead state.
320	// (Once in a dead state it is impossible to leave it.)
321	//
322	// Therefore, the only case we need to consider is when
323	// caller_state is a match state. In this case, since our machines
324	// support the ability to delay a match by a certain number of
325	// bytes (to support look-around), it follows that we actually
326	// consumed that many additional bytes on our previous search. When
327	// the caller resumes their search to find subsequent matches, they
328	// will use the ending location from the previous match as the next
329	// starting point, which is `MATCH_OFFSET` bytes PRIOR to where
330	// we scanned to on the previous search. Therefore, we need to
331	// compensate by bumping `start` up by `MATCH_OFFSET` bytes.
332	//
333	// Incidentally, since MATCH_OFFSET is non-zero, this also makes
334	// dealing with empty matches convenient. Namely, callers needn't
335	// special case them when implementing an iterator. Instead, this
336	// ensures that forward progress is always made.
337	start += MATCH_OFFSET;
338	id
339	}
340	};
341
342	let mut at = start;
343	while at < end {
344	let byte = bytes[at];
345	state = dfa.next_state(state, byte);
346	at += `1`;
347	if dfa.is_special_state(state) {
348	caller_state.set_id(state);
349	if dfa.is_start_state(state) {
350	if let Some(ref mut pre) = pre {
351	if pre.is_effective(at) {
352	match pre.next_candidate(bytes, at).into_option() {
353	None => return Ok(None),
354	Some(i) => {
355	at = i;
356	}
357	}
358	}
359	} else if dfa.is_accel_state(state) {
360	let needles = dfa.accelerator(state);
361	at = accel::find_fwd(needles, bytes, at)
362	.unwrap_or(bytes.len());
363	}
364	} else if dfa.is_match_state(state) {
365	let offset = at - MATCH_OFFSET;
366	caller_state
367	.set_last_match(StateMatch { match_index: `1`, offset });
368	return Ok(Some(HalfMatch {
369	pattern: dfa.match_pattern(state, `0`),
370	offset,
371	}));
372	} else if dfa.is_accel_state(state) {
373	let needs = dfa.accelerator(state);
374	at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
375	} else if dfa.is_dead_state(state) {
376	return Ok(None);
377	} else {
378	debug_assert!(dfa.is_quit_state(state));
379	return Err(MatchError::Quit { byte, offset: at - `1` });
380	}
381	}
382	}
383
384	let result = eoi_fwd(dfa, bytes, end, &mut state);
385	caller_state.set_id(state);
386	if let Ok(Some(ref last_match)) = result {
387	caller_state.set_last_match(StateMatch {
388	match_index: `1`,
389	offset: last_match.offset(),
390	});
391	}
392	result
393	}
394
395	fn init_fwd<A: Automaton + ?Sized>(
396	dfa: &A,
397	pattern_id: Option<PatternID>,
398	bytes: &[u8],
399	start: usize,
400	end: usize,
401	) -> Result<StateID, MatchError> {
402	let state: StateID = dfa.start_state_forward(pattern_id, bytes, start, end);
403	// Start states can never be match states, since all matches are delayed
404	// by 1 byte.
405	assert!(!dfa.is_match_state(state));
406	Ok(state)
407	}
408
409	fn init_rev<A: Automaton + ?Sized>(
410	dfa: &A,
411	pattern_id: Option<PatternID>,
412	bytes: &[u8],
413	start: usize,
414	end: usize,
415	) -> Result<StateID, MatchError> {
416	let state: StateID = dfa.start_state_reverse(pattern_id, bytes, start, end);
417	// Start states can never be match states, since all matches are delayed
418	// by 1 byte.
419	assert!(!dfa.is_match_state(state));
420	Ok(state)
421	}
422
423	fn eoi_fwd<A: Automaton + ?Sized>(
424	dfa: &A,
425	bytes: &[u8],
426	end: usize,
427	state: &mut StateID,
428	) -> Result<Option<HalfMatch>, MatchError> {
429	match bytes.get(end) {
430	Some(&b) => {
431	state = dfa.next_state(state, b);
432	if dfa.is_match_state(*state) {
433	Ok(Some(HalfMatch {
434	pattern: dfa.match_pattern(*state, `0`),
435	offset: end,
436	}))
437	} else {
438	Ok(None)
439	}
440	}
441	None => {
442	state = dfa.next_eoi_state(state);
443	if dfa.is_match_state(*state) {
444	Ok(Some(HalfMatch {
445	pattern: dfa.match_pattern(*state, `0`),
446	offset: bytes.len(),
447	}))
448	} else {
449	Ok(None)
450	}
451	}
452	}
453	}
454
455	fn eoi_rev<A: Automaton + ?Sized>(
456	dfa: &A,
457	bytes: &[u8],
458	start: usize,
459	state: StateID,
460	) -> Result<Option<HalfMatch>, MatchError> {
461	if start > `0` {
462	let state: StateID = dfa.next_state(current:state, input:bytes[start - `1`]);
463	if dfa.is_match_state(id:state) {
464	Ok(Some(HalfMatch {
465	pattern: dfa.match_pattern(id:state, index:`0`),
466	offset: start,
467	}))
468	} else {
469	Ok(None)
470	}
471	} else {
472	let state: StateID = dfa.next_eoi_state(current:state);
473	if dfa.is_match_state(id:state) {
474	Ok(Some(HalfMatch {
475	pattern: dfa.match_pattern(id:state, index:`0`),
476	offset: `0`,
477	}))
478	} else {
479	Ok(None)
480	}
481	}
482	}
483
484	// Currently unused, but is useful to keep around. This was originally used
485	// when the code above used raw pointers for its main loop.
486	// /// Returns the distance between the given pointer and the start of `bytes`.
487	// /// This assumes that the given pointer points to somewhere in the `bytes`
488	// /// slice given.
489	// fn offset(bytes: &[u8], p: const u8) -> usize {*
490	// debug_assert!(bytes.as_ptr() <= p);
491	// debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
492	// ((p as isize) - (bytes.as_ptr() as isize)) as usize
493	// }
494