1use crate::{
2 dfa::{
3 accel,
4 automaton::{Automaton, OverlappingState},
5 },
6 util::{
7 prefilter::Prefilter,
8 primitives::StateID,
9 search::{Anchored, HalfMatch, Input, Span},
10 },
11 MatchError,
12};
13
14#[inline(never)]
15pub fn find_fwd<A: Automaton + ?Sized>(
16 dfa: &A,
17 input: &Input<'_>,
18) -> Result<Option<HalfMatch>, MatchError> {
19 if input.is_done() {
20 return Ok(None);
21 }
22 let pre = if input.get_anchored().is_anchored() {
23 None
24 } else {
25 dfa.get_prefilter()
26 };
27 // Searching with a pattern ID is always anchored, so we should never use
28 // a prefilter.
29 if pre.is_some() {
30 if input.get_earliest() {
31 find_fwd_imp(dfa, input, pre, true)
32 } else {
33 find_fwd_imp(dfa, input, pre, false)
34 }
35 } else {
36 if input.get_earliest() {
37 find_fwd_imp(dfa, input, None, true)
38 } else {
39 find_fwd_imp(dfa, input, None, false)
40 }
41 }
42}
43
44#[cfg_attr(feature = "perf-inline", inline(always))]
45fn find_fwd_imp<A: Automaton + ?Sized>(
46 dfa: &A,
47 input: &Input<'_>,
48 pre: Option<&'_ Prefilter>,
49 earliest: bool,
50) -> Result<Option<HalfMatch>, MatchError> {
51 // See 'prefilter_restart' docs for explanation.
52 let universal_start = dfa.universal_start_state(Anchored::No).is_some();
53 let mut mat = None;
54 let mut sid = init_fwd(dfa, input)?;
55 let mut at = input.start();
56 // This could just be a closure, but then I think it would be unsound
57 // because it would need to be safe to invoke. This way, the lack of safety
58 // is clearer in the code below.
59 macro_rules! next_unchecked {
60 ($sid:expr, $at:expr) => {{
61 let byte = *input.haystack().get_unchecked($at);
62 dfa.next_state_unchecked($sid, byte)
63 }};
64 }
65
66 if let Some(ref pre) = pre {
67 let span = Span::from(at..input.end());
68 // If a prefilter doesn't report false positives, then we don't need to
69 // touch the DFA at all. However, since all matches include the pattern
70 // ID, and the prefilter infrastructure doesn't report pattern IDs, we
71 // limit this optimization to cases where there is exactly one pattern.
72 // In that case, any match must be the 0th pattern.
73 match pre.find(input.haystack(), span) {
74 None => return Ok(mat),
75 Some(ref span) => {
76 at = span.start;
77 if !universal_start {
78 sid = prefilter_restart(dfa, &input, at)?;
79 }
80 }
81 }
82 }
83 while at < input.end() {
84 // SAFETY: There are two safety invariants we need to uphold here in
85 // the loops below: that 'sid' and 'prev_sid' are valid state IDs
86 // for this DFA, and that 'at' is a valid index into 'haystack'.
87 // For the former, we rely on the invariant that next_state* and
88 // start_state_forward always returns a valid state ID (given a valid
89 // state ID in the former case). For the latter safety invariant, we
90 // always guard unchecked access with a check that 'at' is less than
91 // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
92 // ensure that 'at' is always in bounds.
93 //
94 // PERF: See a similar comment in src/hybrid/search.rs that justifies
95 // this extra work to make the search loop fast. The same reasoning and
96 // benchmarks apply here.
97 let mut prev_sid;
98 while at < input.end() {
99 prev_sid = unsafe { next_unchecked!(sid, at) };
100 if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
101 core::mem::swap(&mut prev_sid, &mut sid);
102 break;
103 }
104 at += 1;
105
106 sid = unsafe { next_unchecked!(prev_sid, at) };
107 if dfa.is_special_state(sid) {
108 break;
109 }
110 at += 1;
111
112 prev_sid = unsafe { next_unchecked!(sid, at) };
113 if dfa.is_special_state(prev_sid) {
114 core::mem::swap(&mut prev_sid, &mut sid);
115 break;
116 }
117 at += 1;
118
119 sid = unsafe { next_unchecked!(prev_sid, at) };
120 if dfa.is_special_state(sid) {
121 break;
122 }
123 at += 1;
124 }
125 if dfa.is_special_state(sid) {
126 if dfa.is_start_state(sid) {
127 if let Some(ref pre) = pre {
128 let span = Span::from(at..input.end());
129 match pre.find(input.haystack(), span) {
130 None => return Ok(mat),
131 Some(ref span) => {
132 // We want to skip any update to 'at' below
133 // at the end of this iteration and just
134 // jump immediately back to the next state
135 // transition at the leading position of the
136 // candidate match.
137 //
138 // ... but only if we actually made progress
139 // with our prefilter, otherwise if the start
140 // state has a self-loop, we can get stuck.
141 if span.start > at {
142 at = span.start;
143 if !universal_start {
144 sid = prefilter_restart(dfa, &input, at)?;
145 }
146 continue;
147 }
148 }
149 }
150 } else if dfa.is_accel_state(sid) {
151 let needles = dfa.accelerator(sid);
152 at = accel::find_fwd(needles, input.haystack(), at + 1)
153 .unwrap_or(input.end());
154 continue;
155 }
156 } else if dfa.is_match_state(sid) {
157 let pattern = dfa.match_pattern(sid, 0);
158 mat = Some(HalfMatch::new(pattern, at));
159 if earliest {
160 return Ok(mat);
161 }
162 if dfa.is_accel_state(sid) {
163 let needles = dfa.accelerator(sid);
164 at = accel::find_fwd(needles, input.haystack(), at + 1)
165 .unwrap_or(input.end());
166 continue;
167 }
168 } else if dfa.is_accel_state(sid) {
169 let needs = dfa.accelerator(sid);
170 at = accel::find_fwd(needs, input.haystack(), at + 1)
171 .unwrap_or(input.end());
172 continue;
173 } else if dfa.is_dead_state(sid) {
174 return Ok(mat);
175 } else {
176 // It's important that this is a debug_assert, since this can
177 // actually be tripped even if DFA::from_bytes succeeds and
178 // returns a supposedly valid DFA.
179 return Err(MatchError::quit(input.haystack()[at], at));
180 }
181 }
182 at += 1;
183 }
184 eoi_fwd(dfa, input, &mut sid, &mut mat)?;
185 Ok(mat)
186}
187
188#[inline(never)]
189pub fn find_rev<A: Automaton + ?Sized>(
190 dfa: &A,
191 input: &Input<'_>,
192) -> Result<Option<HalfMatch>, MatchError> {
193 if input.is_done() {
194 return Ok(None);
195 }
196 if input.get_earliest() {
197 find_rev_imp(dfa, input, true)
198 } else {
199 find_rev_imp(dfa, input, false)
200 }
201}
202
203#[cfg_attr(feature = "perf-inline", inline(always))]
204fn find_rev_imp<A: Automaton + ?Sized>(
205 dfa: &A,
206 input: &Input<'_>,
207 earliest: bool,
208) -> Result<Option<HalfMatch>, MatchError> {
209 let mut mat = None;
210 let mut sid = init_rev(dfa, input)?;
211 // In reverse search, the loop below can't handle the case of searching an
212 // empty slice. Ideally we could write something congruent to the forward
213 // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
214 // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
215 // this extra case handling by using a signed offset, but Rust makes it
216 // annoying to do. So... We just handle the empty case separately.
217 if input.start() == input.end() {
218 eoi_rev(dfa, input, &mut sid, &mut mat)?;
219 return Ok(mat);
220 }
221
222 let mut at = input.end() - 1;
223 macro_rules! next_unchecked {
224 ($sid:expr, $at:expr) => {{
225 let byte = *input.haystack().get_unchecked($at);
226 dfa.next_state_unchecked($sid, byte)
227 }};
228 }
229 loop {
230 // SAFETY: See comments in 'find_fwd' for a safety argument.
231 let mut prev_sid;
232 while at >= input.start() {
233 prev_sid = unsafe { next_unchecked!(sid, at) };
234 if dfa.is_special_state(prev_sid)
235 || at <= input.start().saturating_add(3)
236 {
237 core::mem::swap(&mut prev_sid, &mut sid);
238 break;
239 }
240 at -= 1;
241
242 sid = unsafe { next_unchecked!(prev_sid, at) };
243 if dfa.is_special_state(sid) {
244 break;
245 }
246 at -= 1;
247
248 prev_sid = unsafe { next_unchecked!(sid, at) };
249 if dfa.is_special_state(prev_sid) {
250 core::mem::swap(&mut prev_sid, &mut sid);
251 break;
252 }
253 at -= 1;
254
255 sid = unsafe { next_unchecked!(prev_sid, at) };
256 if dfa.is_special_state(sid) {
257 break;
258 }
259 at -= 1;
260 }
261 if dfa.is_special_state(sid) {
262 if dfa.is_start_state(sid) {
263 if dfa.is_accel_state(sid) {
264 let needles = dfa.accelerator(sid);
265 at = accel::find_rev(needles, input.haystack(), at)
266 .map(|i| i + 1)
267 .unwrap_or(input.start());
268 }
269 } else if dfa.is_match_state(sid) {
270 let pattern = dfa.match_pattern(sid, 0);
271 // Since reverse searches report the beginning of a match
272 // and the beginning is inclusive (not exclusive like the
273 // end of a match), we add 1 to make it inclusive.
274 mat = Some(HalfMatch::new(pattern, at + 1));
275 if earliest {
276 return Ok(mat);
277 }
278 if dfa.is_accel_state(sid) {
279 let needles = dfa.accelerator(sid);
280 at = accel::find_rev(needles, input.haystack(), at)
281 .map(|i| i + 1)
282 .unwrap_or(input.start());
283 }
284 } else if dfa.is_accel_state(sid) {
285 let needles = dfa.accelerator(sid);
286 // If the accelerator returns nothing, why don't we quit the
287 // search? Well, if the accelerator doesn't find anything, that
288 // doesn't mean we don't have a match. It just means that we
289 // can't leave the current state given one of the 255 possible
290 // byte values. However, there might be an EOI transition. So
291 // we set 'at' to the end of the haystack, which will cause
292 // this loop to stop and fall down into the EOI transition.
293 at = accel::find_rev(needles, input.haystack(), at)
294 .map(|i| i + 1)
295 .unwrap_or(input.start());
296 } else if dfa.is_dead_state(sid) {
297 return Ok(mat);
298 } else {
299 return Err(MatchError::quit(input.haystack()[at], at));
300 }
301 }
302 if at == input.start() {
303 break;
304 }
305 at -= 1;
306 }
307 eoi_rev(dfa, input, &mut sid, &mut mat)?;
308 Ok(mat)
309}
310
311#[inline(never)]
312pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
313 dfa: &A,
314 input: &Input<'_>,
315 state: &mut OverlappingState,
316) -> Result<(), MatchError> {
317 state.mat = None;
318 if input.is_done() {
319 return Ok(());
320 }
321 let pre = if input.get_anchored().is_anchored() {
322 None
323 } else {
324 dfa.get_prefilter()
325 };
326 if pre.is_some() {
327 find_overlapping_fwd_imp(dfa, input, pre, state)
328 } else {
329 find_overlapping_fwd_imp(dfa, input, None, state)
330 }
331}
332
333#[cfg_attr(feature = "perf-inline", inline(always))]
334fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
335 dfa: &A,
336 input: &Input<'_>,
337 pre: Option<&'_ Prefilter>,
338 state: &mut OverlappingState,
339) -> Result<(), MatchError> {
340 // See 'prefilter_restart' docs for explanation.
341 let universal_start = dfa.universal_start_state(Anchored::No).is_some();
342 let mut sid = match state.id {
343 None => {
344 state.at = input.start();
345 init_fwd(dfa, input)?
346 }
347 Some(sid) => {
348 if let Some(match_index) = state.next_match_index {
349 let match_len = dfa.match_len(sid);
350 if match_index < match_len {
351 state.next_match_index = Some(match_index + 1);
352 let pattern = dfa.match_pattern(sid, match_index);
353 state.mat = Some(HalfMatch::new(pattern, state.at));
354 return Ok(());
355 }
356 }
357 // Once we've reported all matches at a given position, we need to
358 // advance the search to the next position.
359 state.at += 1;
360 if state.at > input.end() {
361 return Ok(());
362 }
363 sid
364 }
365 };
366
367 // NOTE: We don't optimize the crap out of this routine primarily because
368 // it seems like most find_overlapping searches will have higher match
369 // counts, and thus, throughput is perhaps not as important. But if you
370 // have a use case for something faster, feel free to file an issue.
371 while state.at < input.end() {
372 sid = dfa.next_state(sid, input.haystack()[state.at]);
373 if dfa.is_special_state(sid) {
374 state.id = Some(sid);
375 if dfa.is_start_state(sid) {
376 if let Some(ref pre) = pre {
377 let span = Span::from(state.at..input.end());
378 match pre.find(input.haystack(), span) {
379 None => return Ok(()),
380 Some(ref span) => {
381 if span.start > state.at {
382 state.at = span.start;
383 if !universal_start {
384 sid = prefilter_restart(
385 dfa, &input, state.at,
386 )?;
387 }
388 continue;
389 }
390 }
391 }
392 } else if dfa.is_accel_state(sid) {
393 let needles = dfa.accelerator(sid);
394 state.at = accel::find_fwd(
395 needles,
396 input.haystack(),
397 state.at + 1,
398 )
399 .unwrap_or(input.end());
400 continue;
401 }
402 } else if dfa.is_match_state(sid) {
403 state.next_match_index = Some(1);
404 let pattern = dfa.match_pattern(sid, 0);
405 state.mat = Some(HalfMatch::new(pattern, state.at));
406 return Ok(());
407 } else if dfa.is_accel_state(sid) {
408 let needs = dfa.accelerator(sid);
409 // If the accelerator returns nothing, why don't we quit the
410 // search? Well, if the accelerator doesn't find anything, that
411 // doesn't mean we don't have a match. It just means that we
412 // can't leave the current state given one of the 255 possible
413 // byte values. However, there might be an EOI transition. So
414 // we set 'at' to the end of the haystack, which will cause
415 // this loop to stop and fall down into the EOI transition.
416 state.at =
417 accel::find_fwd(needs, input.haystack(), state.at + 1)
418 .unwrap_or(input.end());
419 continue;
420 } else if dfa.is_dead_state(sid) {
421 return Ok(());
422 } else {
423 return Err(MatchError::quit(
424 input.haystack()[state.at],
425 state.at,
426 ));
427 }
428 }
429 state.at += 1;
430 }
431
432 let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
433 state.id = Some(sid);
434 if state.mat.is_some() {
435 // '1' is always correct here since if we get to this point, this
436 // always corresponds to the first (index '0') match discovered at
437 // this position. So the next match to report at this position (if
438 // it exists) is at index '1'.
439 state.next_match_index = Some(1);
440 }
441 result
442}
443
444#[inline(never)]
445pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
446 dfa: &A,
447 input: &Input<'_>,
448 state: &mut OverlappingState,
449) -> Result<(), MatchError> {
450 state.mat = None;
451 if input.is_done() {
452 return Ok(());
453 }
454 let mut sid = match state.id {
455 None => {
456 let sid = init_rev(dfa, input)?;
457 state.id = Some(sid);
458 if input.start() == input.end() {
459 state.rev_eoi = true;
460 } else {
461 state.at = input.end() - 1;
462 }
463 sid
464 }
465 Some(sid) => {
466 if let Some(match_index) = state.next_match_index {
467 let match_len = dfa.match_len(sid);
468 if match_index < match_len {
469 state.next_match_index = Some(match_index + 1);
470 let pattern = dfa.match_pattern(sid, match_index);
471 state.mat = Some(HalfMatch::new(pattern, state.at));
472 return Ok(());
473 }
474 }
475 // Once we've reported all matches at a given position, we need
476 // to advance the search to the next position. However, if we've
477 // already followed the EOI transition, then we know we're done
478 // with the search and there cannot be any more matches to report.
479 if state.rev_eoi {
480 return Ok(());
481 } else if state.at == input.start() {
482 // At this point, we should follow the EOI transition. This
483 // will cause us the skip the main loop below and fall through
484 // to the final 'eoi_rev' transition.
485 state.rev_eoi = true;
486 } else {
487 // We haven't hit the end of the search yet, so move on.
488 state.at -= 1;
489 }
490 sid
491 }
492 };
493 while !state.rev_eoi {
494 sid = dfa.next_state(sid, input.haystack()[state.at]);
495 if dfa.is_special_state(sid) {
496 state.id = Some(sid);
497 if dfa.is_start_state(sid) {
498 if dfa.is_accel_state(sid) {
499 let needles = dfa.accelerator(sid);
500 state.at =
501 accel::find_rev(needles, input.haystack(), state.at)
502 .map(|i| i + 1)
503 .unwrap_or(input.start());
504 }
505 } else if dfa.is_match_state(sid) {
506 state.next_match_index = Some(1);
507 let pattern = dfa.match_pattern(sid, 0);
508 state.mat = Some(HalfMatch::new(pattern, state.at + 1));
509 return Ok(());
510 } else if dfa.is_accel_state(sid) {
511 let needles = dfa.accelerator(sid);
512 // If the accelerator returns nothing, why don't we quit the
513 // search? Well, if the accelerator doesn't find anything, that
514 // doesn't mean we don't have a match. It just means that we
515 // can't leave the current state given one of the 255 possible
516 // byte values. However, there might be an EOI transition. So
517 // we set 'at' to the end of the haystack, which will cause
518 // this loop to stop and fall down into the EOI transition.
519 state.at =
520 accel::find_rev(needles, input.haystack(), state.at)
521 .map(|i| i + 1)
522 .unwrap_or(input.start());
523 } else if dfa.is_dead_state(sid) {
524 return Ok(());
525 } else {
526 return Err(MatchError::quit(
527 input.haystack()[state.at],
528 state.at,
529 ));
530 }
531 }
532 if state.at == input.start() {
533 break;
534 }
535 state.at -= 1;
536 }
537
538 let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
539 state.rev_eoi = true;
540 state.id = Some(sid);
541 if state.mat.is_some() {
542 // '1' is always correct here since if we get to this point, this
543 // always corresponds to the first (index '0') match discovered at
544 // this position. So the next match to report at this position (if
545 // it exists) is at index '1'.
546 state.next_match_index = Some(1);
547 }
548 result
549}
550
551#[cfg_attr(feature = "perf-inline", inline(always))]
552fn init_fwd<A: Automaton + ?Sized>(
553 dfa: &A,
554 input: &Input<'_>,
555) -> Result<StateID, MatchError> {
556 let sid = dfa.start_state_forward(input)?;
557 // Start states can never be match states, since all matches are delayed
558 // by 1 byte.
559 debug_assert!(!dfa.is_match_state(sid));
560 Ok(sid)
561}
562
563#[cfg_attr(feature = "perf-inline", inline(always))]
564fn init_rev<A: Automaton + ?Sized>(
565 dfa: &A,
566 input: &Input<'_>,
567) -> Result<StateID, MatchError> {
568 let sid = dfa.start_state_reverse(input)?;
569 // Start states can never be match states, since all matches are delayed
570 // by 1 byte.
571 debug_assert!(!dfa.is_match_state(sid));
572 Ok(sid)
573}
574
575#[cfg_attr(feature = "perf-inline", inline(always))]
576fn eoi_fwd<A: Automaton + ?Sized>(
577 dfa: &A,
578 input: &Input<'_>,
579 sid: &mut StateID,
580 mat: &mut Option<HalfMatch>,
581) -> Result<(), MatchError> {
582 let sp = input.get_span();
583 match input.haystack().get(sp.end) {
584 Some(&b) => {
585 *sid = dfa.next_state(*sid, b);
586 if dfa.is_match_state(*sid) {
587 let pattern = dfa.match_pattern(*sid, 0);
588 *mat = Some(HalfMatch::new(pattern, sp.end));
589 } else if dfa.is_quit_state(*sid) {
590 return Err(MatchError::quit(b, sp.end));
591 }
592 }
593 None => {
594 *sid = dfa.next_eoi_state(*sid);
595 if dfa.is_match_state(*sid) {
596 let pattern = dfa.match_pattern(*sid, 0);
597 *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
598 }
599 }
600 }
601 Ok(())
602}
603
604#[cfg_attr(feature = "perf-inline", inline(always))]
605fn eoi_rev<A: Automaton + ?Sized>(
606 dfa: &A,
607 input: &Input<'_>,
608 sid: &mut StateID,
609 mat: &mut Option<HalfMatch>,
610) -> Result<(), MatchError> {
611 let sp = input.get_span();
612 if sp.start > 0 {
613 let byte = input.haystack()[sp.start - 1];
614 *sid = dfa.next_state(*sid, byte);
615 if dfa.is_match_state(*sid) {
616 let pattern = dfa.match_pattern(*sid, 0);
617 *mat = Some(HalfMatch::new(pattern, sp.start));
618 } else if dfa.is_quit_state(*sid) {
619 return Err(MatchError::quit(byte, sp.start - 1));
620 }
621 } else {
622 *sid = dfa.next_eoi_state(*sid);
623 if dfa.is_match_state(*sid) {
624 let pattern = dfa.match_pattern(*sid, 0);
625 *mat = Some(HalfMatch::new(pattern, 0));
626 }
627 }
628 Ok(())
629}
630
631/// Re-compute the starting state that a DFA should be in after finding a
632/// prefilter candidate match at the position `at`.
633///
634/// The function with the same name has a bit more docs in hybrid/search.rs.
635#[cfg_attr(feature = "perf-inline", inline(always))]
636fn prefilter_restart<A: Automaton + ?Sized>(
637 dfa: &A,
638 input: &Input<'_>,
639 at: usize,
640) -> Result<StateID, MatchError> {
641 let mut input = input.clone();
642 input.set_start(at);
643 init_fwd(dfa, &input)
644}
645