1use crate::{
2 dfa::{
3 accel,
4 automaton::{Automaton, OverlappingState, StateMatch},
5 },
6 util::{
7 id::{PatternID, StateID},
8 matchtypes::HalfMatch,
9 prefilter, MATCH_OFFSET,
10 },
11 MatchError,
12};
13
14#[inline(never)]
15pub fn find_earliest_fwd<A: Automaton + ?Sized>(
16 pre: Option<&mut prefilter::Scanner>,
17 dfa: &A,
18 pattern_id: Option<PatternID>,
19 bytes: &[u8],
20 start: usize,
21 end: usize,
22) -> Result<Option<HalfMatch>, MatchError> {
23 // Searching with a pattern ID is always anchored, so we should never use
24 // a prefilter.
25 if pre.is_some() && pattern_id.is_none() {
26 find_fwd(pre, earliest:true, dfa, pattern_id, haystack:bytes, start, end)
27 } else {
28 find_fwd(pre:None, earliest:true, dfa, pattern_id, haystack:bytes, start, end)
29 }
30}
31
32#[inline(never)]
33pub fn find_leftmost_fwd<A: Automaton + ?Sized>(
34 pre: Option<&mut prefilter::Scanner>,
35 dfa: &A,
36 pattern_id: Option<PatternID>,
37 bytes: &[u8],
38 start: usize,
39 end: usize,
40) -> Result<Option<HalfMatch>, MatchError> {
41 // Searching with a pattern ID is always anchored, so we should never use
42 // a prefilter.
43 if pre.is_some() && pattern_id.is_none() {
44 find_fwd(pre, earliest:false, dfa, pattern_id, haystack:bytes, start, end)
45 } else {
46 find_fwd(pre:None, earliest:false, dfa, pattern_id, haystack:bytes, start, end)
47 }
48}
49
50/// This is marked as `inline(always)` specifically because it supports
51/// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters
52/// getting inlined eliminate some critical branches. To avoid bloating binary
53/// size, we only call this function in a fixed number of places.
54#[inline(always)]
55fn find_fwd<A: Automaton + ?Sized>(
56 mut pre: Option<&mut prefilter::Scanner>,
57 earliest: bool,
58 dfa: &A,
59 pattern_id: Option<PatternID>,
60 haystack: &[u8],
61 start: usize,
62 end: usize,
63) -> Result<Option<HalfMatch>, MatchError> {
64 assert!(start <= end);
65 assert!(start <= haystack.len());
66 assert!(end <= haystack.len());
67
68 // Why do this? This lets 'bytes[at]' work without bounds checks below.
69 // It seems the assert on 'end <= haystack.len()' above is otherwise
70 // not enough. Why not just make 'bytes' scoped this way anyway? Well,
71 // 'eoi_fwd' (below) might actually want to try to access the byte at 'end'
72 // for resolving look-ahead.
73 let bytes = &haystack[..end];
74
75 let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?;
76 let mut last_match = None;
77 let mut at = start;
78 if let Some(ref mut pre) = pre {
79 // If a prefilter doesn't report false positives, then we don't need to
80 // touch the DFA at all. However, since all matches include the pattern
81 // ID, and the prefilter infrastructure doesn't report pattern IDs, we
82 // limit this optimization to cases where there is exactly one pattern.
83 // In that case, any match must be the 0th pattern.
84 if dfa.pattern_count() == 1 && !pre.reports_false_positives() {
85 return Ok(pre.next_candidate(bytes, at).into_option().map(
86 |offset| HalfMatch { pattern: PatternID::ZERO, offset },
87 ));
88 } else if pre.is_effective(at) {
89 match pre.next_candidate(bytes, at).into_option() {
90 None => return Ok(None),
91 Some(i) => {
92 at = i;
93 }
94 }
95 }
96 }
97 while at < end {
98 let byte = bytes[at];
99 state = dfa.next_state(state, byte);
100 at += 1;
101 if dfa.is_special_state(state) {
102 if dfa.is_start_state(state) {
103 if let Some(ref mut pre) = pre {
104 if pre.is_effective(at) {
105 match pre.next_candidate(bytes, at).into_option() {
106 None => return Ok(None),
107 Some(i) => {
108 at = i;
109 }
110 }
111 }
112 } else if dfa.is_accel_state(state) {
113 let needles = dfa.accelerator(state);
114 at = accel::find_fwd(needles, bytes, at)
115 .unwrap_or(bytes.len());
116 }
117 } else if dfa.is_match_state(state) {
118 last_match = Some(HalfMatch {
119 pattern: dfa.match_pattern(state, 0),
120 offset: at - MATCH_OFFSET,
121 });
122 if earliest {
123 return Ok(last_match);
124 }
125 if dfa.is_accel_state(state) {
126 let needles = dfa.accelerator(state);
127 at = accel::find_fwd(needles, bytes, at)
128 .unwrap_or(bytes.len());
129 }
130 } else if dfa.is_accel_state(state) {
131 let needs = dfa.accelerator(state);
132 at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
133 } else if dfa.is_dead_state(state) {
134 return Ok(last_match);
135 } else {
136 debug_assert!(dfa.is_quit_state(state));
137 if last_match.is_some() {
138 return Ok(last_match);
139 }
140 return Err(MatchError::Quit { byte, offset: at - 1 });
141 }
142 }
143 while at < end && dfa.next_state(state, bytes[at]) == state {
144 at += 1;
145 }
146 }
147 Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match))
148}
149
150#[inline(never)]
151pub fn find_earliest_rev<A: Automaton + ?Sized>(
152 dfa: &A,
153 pattern_id: Option<PatternID>,
154 bytes: &[u8],
155 start: usize,
156 end: usize,
157) -> Result<Option<HalfMatch>, MatchError> {
158 find_rev(earliest:true, dfa, pattern_id, bytes, start, end)
159}
160
161#[inline(never)]
162pub fn find_leftmost_rev<A: Automaton + ?Sized>(
163 dfa: &A,
164 pattern_id: Option<PatternID>,
165 bytes: &[u8],
166 start: usize,
167 end: usize,
168) -> Result<Option<HalfMatch>, MatchError> {
169 find_rev(earliest:false, dfa, pattern_id, bytes, start, end)
170}
171
172/// This is marked as `inline(always)` specifically because it supports
173/// multiple modes of searching. Namely, the 'earliest' boolean getting inlined
174/// permits eliminating a few crucial branches.
175#[inline(always)]
176fn find_rev<A: Automaton + ?Sized>(
177 earliest: bool,
178 dfa: &A,
179 pattern_id: Option<PatternID>,
180 bytes: &[u8],
181 start: usize,
182 end: usize,
183) -> Result<Option<HalfMatch>, MatchError> {
184 assert!(start <= end);
185 assert!(start <= bytes.len());
186 assert!(end <= bytes.len());
187
188 let mut state = init_rev(dfa, pattern_id, bytes, start, end)?;
189 let mut last_match = None;
190 let mut at = end;
191 while at > start {
192 at -= 1;
193 while at > start && dfa.next_state(state, bytes[at]) == state {
194 at -= 1;
195 }
196
197 let byte = bytes[at];
198 state = dfa.next_state(state, byte);
199 if dfa.is_special_state(state) {
200 if dfa.is_start_state(state) {
201 if dfa.is_accel_state(state) {
202 let needles = dfa.accelerator(state);
203 at = accel::find_rev(needles, bytes, at)
204 .map(|i| i + 1)
205 .unwrap_or(0);
206 }
207 } else if dfa.is_match_state(state) {
208 last_match = Some(HalfMatch {
209 pattern: dfa.match_pattern(state, 0),
210 offset: at + MATCH_OFFSET,
211 });
212 if earliest {
213 return Ok(last_match);
214 }
215 if dfa.is_accel_state(state) {
216 let needles = dfa.accelerator(state);
217 at = accel::find_rev(needles, bytes, at)
218 .map(|i| i + 1)
219 .unwrap_or(0);
220 }
221 } else if dfa.is_accel_state(state) {
222 let needles = dfa.accelerator(state);
223 at = accel::find_rev(needles, bytes, at)
224 .map(|i| i + 1)
225 .unwrap_or(0);
226 } else if dfa.is_dead_state(state) {
227 return Ok(last_match);
228 } else {
229 debug_assert!(dfa.is_quit_state(state));
230 if last_match.is_some() {
231 return Ok(last_match);
232 }
233 return Err(MatchError::Quit { byte, offset: at });
234 }
235 }
236 }
237 Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match))
238}
239
240#[inline(never)]
241pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
242 pre: Option<&mut prefilter::Scanner>,
243 dfa: &A,
244 pattern_id: Option<PatternID>,
245 bytes: &[u8],
246 start: usize,
247 end: usize,
248 caller_state: &mut OverlappingState,
249) -> Result<Option<HalfMatch>, MatchError> {
250 // Searching with a pattern ID is always anchored, so we should only ever
251 // use a prefilter when no pattern ID is given.
252 if pre.is_some() && pattern_id.is_none() {
253 find_overlapping_fwd_imp(
254 pre,
255 dfa,
256 pattern_id,
257 bytes,
258 start,
259 end,
260 caller_state,
261 )
262 } else {
263 find_overlapping_fwd_imp(
264 None,
265 dfa,
266 pattern_id,
267 bytes,
268 start,
269 end,
270 caller_state,
271 )
272 }
273}
274
275/// This is marked as `inline(always)` specifically because it supports
276/// multiple modes of searching. Namely, the 'pre' prefilter getting inlined
277/// permits eliminating a few crucial branches and reduces code size when it is
278/// not used.
279#[inline(always)]
280fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
281 mut pre: Option<&mut prefilter::Scanner>,
282 dfa: &A,
283 pattern_id: Option<PatternID>,
284 bytes: &[u8],
285 mut start: usize,
286 end: usize,
287 caller_state: &mut OverlappingState,
288) -> Result<Option<HalfMatch>, MatchError> {
289 assert!(start <= end);
290 assert!(start <= bytes.len());
291 assert!(end <= bytes.len());
292
293 let mut state = match caller_state.id() {
294 None => init_fwd(dfa, pattern_id, bytes, start, end)?,
295 Some(id) => {
296 if let Some(last) = caller_state.last_match() {
297 let match_count = dfa.match_count(id);
298 if last.match_index < match_count {
299 let m = HalfMatch {
300 pattern: dfa.match_pattern(id, last.match_index),
301 offset: last.offset,
302 };
303 last.match_index += 1;
304 return Ok(Some(m));
305 }
306 }
307
308 // This is a subtle but critical detail. If the caller provides a
309 // non-None state ID, then it must be the case that the state ID
310 // corresponds to one set by this function. The state ID therefore
311 // corresponds to a match state, a dead state or some other state.
312 // However, "some other" state _only_ occurs when the input has
313 // been exhausted because the only way to stop before then is to
314 // see a match or a dead/quit state.
315 //
316 // If the input is exhausted or if it's a dead state, then
317 // incrementing the starting position has no relevance on
318 // correctness, since the loop below will either not execute
319 // at all or will immediately stop due to being in a dead state.
320 // (Once in a dead state it is impossible to leave it.)
321 //
322 // Therefore, the only case we need to consider is when
323 // caller_state is a match state. In this case, since our machines
324 // support the ability to delay a match by a certain number of
325 // bytes (to support look-around), it follows that we actually
326 // consumed that many additional bytes on our previous search. When
327 // the caller resumes their search to find subsequent matches, they
328 // will use the ending location from the previous match as the next
329 // starting point, which is `MATCH_OFFSET` bytes PRIOR to where
330 // we scanned to on the previous search. Therefore, we need to
331 // compensate by bumping `start` up by `MATCH_OFFSET` bytes.
332 //
333 // Incidentally, since MATCH_OFFSET is non-zero, this also makes
334 // dealing with empty matches convenient. Namely, callers needn't
335 // special case them when implementing an iterator. Instead, this
336 // ensures that forward progress is always made.
337 start += MATCH_OFFSET;
338 id
339 }
340 };
341
342 let mut at = start;
343 while at < end {
344 let byte = bytes[at];
345 state = dfa.next_state(state, byte);
346 at += 1;
347 if dfa.is_special_state(state) {
348 caller_state.set_id(state);
349 if dfa.is_start_state(state) {
350 if let Some(ref mut pre) = pre {
351 if pre.is_effective(at) {
352 match pre.next_candidate(bytes, at).into_option() {
353 None => return Ok(None),
354 Some(i) => {
355 at = i;
356 }
357 }
358 }
359 } else if dfa.is_accel_state(state) {
360 let needles = dfa.accelerator(state);
361 at = accel::find_fwd(needles, bytes, at)
362 .unwrap_or(bytes.len());
363 }
364 } else if dfa.is_match_state(state) {
365 let offset = at - MATCH_OFFSET;
366 caller_state
367 .set_last_match(StateMatch { match_index: 1, offset });
368 return Ok(Some(HalfMatch {
369 pattern: dfa.match_pattern(state, 0),
370 offset,
371 }));
372 } else if dfa.is_accel_state(state) {
373 let needs = dfa.accelerator(state);
374 at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len());
375 } else if dfa.is_dead_state(state) {
376 return Ok(None);
377 } else {
378 debug_assert!(dfa.is_quit_state(state));
379 return Err(MatchError::Quit { byte, offset: at - 1 });
380 }
381 }
382 }
383
384 let result = eoi_fwd(dfa, bytes, end, &mut state);
385 caller_state.set_id(state);
386 if let Ok(Some(ref last_match)) = result {
387 caller_state.set_last_match(StateMatch {
388 match_index: 1,
389 offset: last_match.offset(),
390 });
391 }
392 result
393}
394
395fn init_fwd<A: Automaton + ?Sized>(
396 dfa: &A,
397 pattern_id: Option<PatternID>,
398 bytes: &[u8],
399 start: usize,
400 end: usize,
401) -> Result<StateID, MatchError> {
402 let state: StateID = dfa.start_state_forward(pattern_id, bytes, start, end);
403 // Start states can never be match states, since all matches are delayed
404 // by 1 byte.
405 assert!(!dfa.is_match_state(state));
406 Ok(state)
407}
408
409fn init_rev<A: Automaton + ?Sized>(
410 dfa: &A,
411 pattern_id: Option<PatternID>,
412 bytes: &[u8],
413 start: usize,
414 end: usize,
415) -> Result<StateID, MatchError> {
416 let state: StateID = dfa.start_state_reverse(pattern_id, bytes, start, end);
417 // Start states can never be match states, since all matches are delayed
418 // by 1 byte.
419 assert!(!dfa.is_match_state(state));
420 Ok(state)
421}
422
423fn eoi_fwd<A: Automaton + ?Sized>(
424 dfa: &A,
425 bytes: &[u8],
426 end: usize,
427 state: &mut StateID,
428) -> Result<Option<HalfMatch>, MatchError> {
429 match bytes.get(end) {
430 Some(&b) => {
431 *state = dfa.next_state(*state, b);
432 if dfa.is_match_state(*state) {
433 Ok(Some(HalfMatch {
434 pattern: dfa.match_pattern(*state, 0),
435 offset: end,
436 }))
437 } else {
438 Ok(None)
439 }
440 }
441 None => {
442 *state = dfa.next_eoi_state(*state);
443 if dfa.is_match_state(*state) {
444 Ok(Some(HalfMatch {
445 pattern: dfa.match_pattern(*state, 0),
446 offset: bytes.len(),
447 }))
448 } else {
449 Ok(None)
450 }
451 }
452 }
453}
454
455fn eoi_rev<A: Automaton + ?Sized>(
456 dfa: &A,
457 bytes: &[u8],
458 start: usize,
459 state: StateID,
460) -> Result<Option<HalfMatch>, MatchError> {
461 if start > 0 {
462 let state: StateID = dfa.next_state(current:state, input:bytes[start - 1]);
463 if dfa.is_match_state(id:state) {
464 Ok(Some(HalfMatch {
465 pattern: dfa.match_pattern(id:state, index:0),
466 offset: start,
467 }))
468 } else {
469 Ok(None)
470 }
471 } else {
472 let state: StateID = dfa.next_eoi_state(current:state);
473 if dfa.is_match_state(id:state) {
474 Ok(Some(HalfMatch {
475 pattern: dfa.match_pattern(id:state, index:0),
476 offset: 0,
477 }))
478 } else {
479 Ok(None)
480 }
481 }
482}
483
484// Currently unused, but is useful to keep around. This was originally used
485// when the code above used raw pointers for its main loop.
486// /// Returns the distance between the given pointer and the start of `bytes`.
487// /// This assumes that the given pointer points to somewhere in the `bytes`
488// /// slice given.
489// fn offset(bytes: &[u8], p: *const u8) -> usize {
490// debug_assert!(bytes.as_ptr() <= p);
491// debug_assert!(bytes[bytes.len()..].as_ptr() >= p);
492// ((p as isize) - (bytes.as_ptr() as isize)) as usize
493// }
494