1 | use crate::{ |
2 | dfa::{ |
3 | accel, |
4 | automaton::{Automaton, OverlappingState, StateMatch}, |
5 | }, |
6 | util::{ |
7 | id::{PatternID, StateID}, |
8 | matchtypes::HalfMatch, |
9 | prefilter, MATCH_OFFSET, |
10 | }, |
11 | MatchError, |
12 | }; |
13 | |
14 | #[inline (never)] |
15 | pub fn find_earliest_fwd<A: Automaton + ?Sized>( |
16 | pre: Option<&mut prefilter::Scanner>, |
17 | dfa: &A, |
18 | pattern_id: Option<PatternID>, |
19 | bytes: &[u8], |
20 | start: usize, |
21 | end: usize, |
22 | ) -> Result<Option<HalfMatch>, MatchError> { |
23 | // Searching with a pattern ID is always anchored, so we should never use |
24 | // a prefilter. |
25 | if pre.is_some() && pattern_id.is_none() { |
26 | find_fwd(pre, earliest:true, dfa, pattern_id, haystack:bytes, start, end) |
27 | } else { |
28 | find_fwd(pre:None, earliest:true, dfa, pattern_id, haystack:bytes, start, end) |
29 | } |
30 | } |
31 | |
32 | #[inline (never)] |
33 | pub fn find_leftmost_fwd<A: Automaton + ?Sized>( |
34 | pre: Option<&mut prefilter::Scanner>, |
35 | dfa: &A, |
36 | pattern_id: Option<PatternID>, |
37 | bytes: &[u8], |
38 | start: usize, |
39 | end: usize, |
40 | ) -> Result<Option<HalfMatch>, MatchError> { |
41 | // Searching with a pattern ID is always anchored, so we should never use |
42 | // a prefilter. |
43 | if pre.is_some() && pattern_id.is_none() { |
44 | find_fwd(pre, earliest:false, dfa, pattern_id, haystack:bytes, start, end) |
45 | } else { |
46 | find_fwd(pre:None, earliest:false, dfa, pattern_id, haystack:bytes, start, end) |
47 | } |
48 | } |
49 | |
50 | /// This is marked as `inline(always)` specifically because it supports |
51 | /// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters |
52 | /// getting inlined eliminate some critical branches. To avoid bloating binary |
53 | /// size, we only call this function in a fixed number of places. |
54 | #[inline (always)] |
55 | fn find_fwd<A: Automaton + ?Sized>( |
56 | mut pre: Option<&mut prefilter::Scanner>, |
57 | earliest: bool, |
58 | dfa: &A, |
59 | pattern_id: Option<PatternID>, |
60 | haystack: &[u8], |
61 | start: usize, |
62 | end: usize, |
63 | ) -> Result<Option<HalfMatch>, MatchError> { |
64 | assert!(start <= end); |
65 | assert!(start <= haystack.len()); |
66 | assert!(end <= haystack.len()); |
67 | |
68 | // Why do this? This lets 'bytes[at]' work without bounds checks below. |
69 | // It seems the assert on 'end <= haystack.len()' above is otherwise |
70 | // not enough. Why not just make 'bytes' scoped this way anyway? Well, |
71 | // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' |
72 | // for resolving look-ahead. |
73 | let bytes = &haystack[..end]; |
74 | |
75 | let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?; |
76 | let mut last_match = None; |
77 | let mut at = start; |
78 | if let Some(ref mut pre) = pre { |
79 | // If a prefilter doesn't report false positives, then we don't need to |
80 | // touch the DFA at all. However, since all matches include the pattern |
81 | // ID, and the prefilter infrastructure doesn't report pattern IDs, we |
82 | // limit this optimization to cases where there is exactly one pattern. |
83 | // In that case, any match must be the 0th pattern. |
84 | if dfa.pattern_count() == 1 && !pre.reports_false_positives() { |
85 | return Ok(pre.next_candidate(bytes, at).into_option().map( |
86 | |offset| HalfMatch { pattern: PatternID::ZERO, offset }, |
87 | )); |
88 | } else if pre.is_effective(at) { |
89 | match pre.next_candidate(bytes, at).into_option() { |
90 | None => return Ok(None), |
91 | Some(i) => { |
92 | at = i; |
93 | } |
94 | } |
95 | } |
96 | } |
97 | while at < end { |
98 | let byte = bytes[at]; |
99 | state = dfa.next_state(state, byte); |
100 | at += 1; |
101 | if dfa.is_special_state(state) { |
102 | if dfa.is_start_state(state) { |
103 | if let Some(ref mut pre) = pre { |
104 | if pre.is_effective(at) { |
105 | match pre.next_candidate(bytes, at).into_option() { |
106 | None => return Ok(None), |
107 | Some(i) => { |
108 | at = i; |
109 | } |
110 | } |
111 | } |
112 | } else if dfa.is_accel_state(state) { |
113 | let needles = dfa.accelerator(state); |
114 | at = accel::find_fwd(needles, bytes, at) |
115 | .unwrap_or(bytes.len()); |
116 | } |
117 | } else if dfa.is_match_state(state) { |
118 | last_match = Some(HalfMatch { |
119 | pattern: dfa.match_pattern(state, 0), |
120 | offset: at - MATCH_OFFSET, |
121 | }); |
122 | if earliest { |
123 | return Ok(last_match); |
124 | } |
125 | if dfa.is_accel_state(state) { |
126 | let needles = dfa.accelerator(state); |
127 | at = accel::find_fwd(needles, bytes, at) |
128 | .unwrap_or(bytes.len()); |
129 | } |
130 | } else if dfa.is_accel_state(state) { |
131 | let needs = dfa.accelerator(state); |
132 | at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); |
133 | } else if dfa.is_dead_state(state) { |
134 | return Ok(last_match); |
135 | } else { |
136 | debug_assert!(dfa.is_quit_state(state)); |
137 | if last_match.is_some() { |
138 | return Ok(last_match); |
139 | } |
140 | return Err(MatchError::Quit { byte, offset: at - 1 }); |
141 | } |
142 | } |
143 | while at < end && dfa.next_state(state, bytes[at]) == state { |
144 | at += 1; |
145 | } |
146 | } |
147 | Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match)) |
148 | } |
149 | |
150 | #[inline (never)] |
151 | pub fn find_earliest_rev<A: Automaton + ?Sized>( |
152 | dfa: &A, |
153 | pattern_id: Option<PatternID>, |
154 | bytes: &[u8], |
155 | start: usize, |
156 | end: usize, |
157 | ) -> Result<Option<HalfMatch>, MatchError> { |
158 | find_rev(earliest:true, dfa, pattern_id, bytes, start, end) |
159 | } |
160 | |
161 | #[inline (never)] |
162 | pub fn find_leftmost_rev<A: Automaton + ?Sized>( |
163 | dfa: &A, |
164 | pattern_id: Option<PatternID>, |
165 | bytes: &[u8], |
166 | start: usize, |
167 | end: usize, |
168 | ) -> Result<Option<HalfMatch>, MatchError> { |
169 | find_rev(earliest:false, dfa, pattern_id, bytes, start, end) |
170 | } |
171 | |
172 | /// This is marked as `inline(always)` specifically because it supports |
173 | /// multiple modes of searching. Namely, the 'earliest' boolean getting inlined |
174 | /// permits eliminating a few crucial branches. |
175 | #[inline (always)] |
176 | fn find_rev<A: Automaton + ?Sized>( |
177 | earliest: bool, |
178 | dfa: &A, |
179 | pattern_id: Option<PatternID>, |
180 | bytes: &[u8], |
181 | start: usize, |
182 | end: usize, |
183 | ) -> Result<Option<HalfMatch>, MatchError> { |
184 | assert!(start <= end); |
185 | assert!(start <= bytes.len()); |
186 | assert!(end <= bytes.len()); |
187 | |
188 | let mut state = init_rev(dfa, pattern_id, bytes, start, end)?; |
189 | let mut last_match = None; |
190 | let mut at = end; |
191 | while at > start { |
192 | at -= 1; |
193 | while at > start && dfa.next_state(state, bytes[at]) == state { |
194 | at -= 1; |
195 | } |
196 | |
197 | let byte = bytes[at]; |
198 | state = dfa.next_state(state, byte); |
199 | if dfa.is_special_state(state) { |
200 | if dfa.is_start_state(state) { |
201 | if dfa.is_accel_state(state) { |
202 | let needles = dfa.accelerator(state); |
203 | at = accel::find_rev(needles, bytes, at) |
204 | .map(|i| i + 1) |
205 | .unwrap_or(0); |
206 | } |
207 | } else if dfa.is_match_state(state) { |
208 | last_match = Some(HalfMatch { |
209 | pattern: dfa.match_pattern(state, 0), |
210 | offset: at + MATCH_OFFSET, |
211 | }); |
212 | if earliest { |
213 | return Ok(last_match); |
214 | } |
215 | if dfa.is_accel_state(state) { |
216 | let needles = dfa.accelerator(state); |
217 | at = accel::find_rev(needles, bytes, at) |
218 | .map(|i| i + 1) |
219 | .unwrap_or(0); |
220 | } |
221 | } else if dfa.is_accel_state(state) { |
222 | let needles = dfa.accelerator(state); |
223 | at = accel::find_rev(needles, bytes, at) |
224 | .map(|i| i + 1) |
225 | .unwrap_or(0); |
226 | } else if dfa.is_dead_state(state) { |
227 | return Ok(last_match); |
228 | } else { |
229 | debug_assert!(dfa.is_quit_state(state)); |
230 | if last_match.is_some() { |
231 | return Ok(last_match); |
232 | } |
233 | return Err(MatchError::Quit { byte, offset: at }); |
234 | } |
235 | } |
236 | } |
237 | Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match)) |
238 | } |
239 | |
240 | #[inline (never)] |
241 | pub fn find_overlapping_fwd<A: Automaton + ?Sized>( |
242 | pre: Option<&mut prefilter::Scanner>, |
243 | dfa: &A, |
244 | pattern_id: Option<PatternID>, |
245 | bytes: &[u8], |
246 | start: usize, |
247 | end: usize, |
248 | caller_state: &mut OverlappingState, |
249 | ) -> Result<Option<HalfMatch>, MatchError> { |
250 | // Searching with a pattern ID is always anchored, so we should only ever |
251 | // use a prefilter when no pattern ID is given. |
252 | if pre.is_some() && pattern_id.is_none() { |
253 | find_overlapping_fwd_imp( |
254 | pre, |
255 | dfa, |
256 | pattern_id, |
257 | bytes, |
258 | start, |
259 | end, |
260 | caller_state, |
261 | ) |
262 | } else { |
263 | find_overlapping_fwd_imp( |
264 | None, |
265 | dfa, |
266 | pattern_id, |
267 | bytes, |
268 | start, |
269 | end, |
270 | caller_state, |
271 | ) |
272 | } |
273 | } |
274 | |
275 | /// This is marked as `inline(always)` specifically because it supports |
276 | /// multiple modes of searching. Namely, the 'pre' prefilter getting inlined |
277 | /// permits eliminating a few crucial branches and reduces code size when it is |
278 | /// not used. |
279 | #[inline (always)] |
280 | fn find_overlapping_fwd_imp<A: Automaton + ?Sized>( |
281 | mut pre: Option<&mut prefilter::Scanner>, |
282 | dfa: &A, |
283 | pattern_id: Option<PatternID>, |
284 | bytes: &[u8], |
285 | mut start: usize, |
286 | end: usize, |
287 | caller_state: &mut OverlappingState, |
288 | ) -> Result<Option<HalfMatch>, MatchError> { |
289 | assert!(start <= end); |
290 | assert!(start <= bytes.len()); |
291 | assert!(end <= bytes.len()); |
292 | |
293 | let mut state = match caller_state.id() { |
294 | None => init_fwd(dfa, pattern_id, bytes, start, end)?, |
295 | Some(id) => { |
296 | if let Some(last) = caller_state.last_match() { |
297 | let match_count = dfa.match_count(id); |
298 | if last.match_index < match_count { |
299 | let m = HalfMatch { |
300 | pattern: dfa.match_pattern(id, last.match_index), |
301 | offset: last.offset, |
302 | }; |
303 | last.match_index += 1; |
304 | return Ok(Some(m)); |
305 | } |
306 | } |
307 | |
308 | // This is a subtle but critical detail. If the caller provides a |
309 | // non-None state ID, then it must be the case that the state ID |
310 | // corresponds to one set by this function. The state ID therefore |
311 | // corresponds to a match state, a dead state or some other state. |
312 | // However, "some other" state _only_ occurs when the input has |
313 | // been exhausted because the only way to stop before then is to |
314 | // see a match or a dead/quit state. |
315 | // |
316 | // If the input is exhausted or if it's a dead state, then |
317 | // incrementing the starting position has no relevance on |
318 | // correctness, since the loop below will either not execute |
319 | // at all or will immediately stop due to being in a dead state. |
320 | // (Once in a dead state it is impossible to leave it.) |
321 | // |
322 | // Therefore, the only case we need to consider is when |
323 | // caller_state is a match state. In this case, since our machines |
324 | // support the ability to delay a match by a certain number of |
325 | // bytes (to support look-around), it follows that we actually |
326 | // consumed that many additional bytes on our previous search. When |
327 | // the caller resumes their search to find subsequent matches, they |
328 | // will use the ending location from the previous match as the next |
329 | // starting point, which is `MATCH_OFFSET` bytes PRIOR to where |
330 | // we scanned to on the previous search. Therefore, we need to |
331 | // compensate by bumping `start` up by `MATCH_OFFSET` bytes. |
332 | // |
333 | // Incidentally, since MATCH_OFFSET is non-zero, this also makes |
334 | // dealing with empty matches convenient. Namely, callers needn't |
335 | // special case them when implementing an iterator. Instead, this |
336 | // ensures that forward progress is always made. |
337 | start += MATCH_OFFSET; |
338 | id |
339 | } |
340 | }; |
341 | |
342 | let mut at = start; |
343 | while at < end { |
344 | let byte = bytes[at]; |
345 | state = dfa.next_state(state, byte); |
346 | at += 1; |
347 | if dfa.is_special_state(state) { |
348 | caller_state.set_id(state); |
349 | if dfa.is_start_state(state) { |
350 | if let Some(ref mut pre) = pre { |
351 | if pre.is_effective(at) { |
352 | match pre.next_candidate(bytes, at).into_option() { |
353 | None => return Ok(None), |
354 | Some(i) => { |
355 | at = i; |
356 | } |
357 | } |
358 | } |
359 | } else if dfa.is_accel_state(state) { |
360 | let needles = dfa.accelerator(state); |
361 | at = accel::find_fwd(needles, bytes, at) |
362 | .unwrap_or(bytes.len()); |
363 | } |
364 | } else if dfa.is_match_state(state) { |
365 | let offset = at - MATCH_OFFSET; |
366 | caller_state |
367 | .set_last_match(StateMatch { match_index: 1, offset }); |
368 | return Ok(Some(HalfMatch { |
369 | pattern: dfa.match_pattern(state, 0), |
370 | offset, |
371 | })); |
372 | } else if dfa.is_accel_state(state) { |
373 | let needs = dfa.accelerator(state); |
374 | at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); |
375 | } else if dfa.is_dead_state(state) { |
376 | return Ok(None); |
377 | } else { |
378 | debug_assert!(dfa.is_quit_state(state)); |
379 | return Err(MatchError::Quit { byte, offset: at - 1 }); |
380 | } |
381 | } |
382 | } |
383 | |
384 | let result = eoi_fwd(dfa, bytes, end, &mut state); |
385 | caller_state.set_id(state); |
386 | if let Ok(Some(ref last_match)) = result { |
387 | caller_state.set_last_match(StateMatch { |
388 | match_index: 1, |
389 | offset: last_match.offset(), |
390 | }); |
391 | } |
392 | result |
393 | } |
394 | |
395 | fn init_fwd<A: Automaton + ?Sized>( |
396 | dfa: &A, |
397 | pattern_id: Option<PatternID>, |
398 | bytes: &[u8], |
399 | start: usize, |
400 | end: usize, |
401 | ) -> Result<StateID, MatchError> { |
402 | let state: StateID = dfa.start_state_forward(pattern_id, bytes, start, end); |
403 | // Start states can never be match states, since all matches are delayed |
404 | // by 1 byte. |
405 | assert!(!dfa.is_match_state(state)); |
406 | Ok(state) |
407 | } |
408 | |
409 | fn init_rev<A: Automaton + ?Sized>( |
410 | dfa: &A, |
411 | pattern_id: Option<PatternID>, |
412 | bytes: &[u8], |
413 | start: usize, |
414 | end: usize, |
415 | ) -> Result<StateID, MatchError> { |
416 | let state: StateID = dfa.start_state_reverse(pattern_id, bytes, start, end); |
417 | // Start states can never be match states, since all matches are delayed |
418 | // by 1 byte. |
419 | assert!(!dfa.is_match_state(state)); |
420 | Ok(state) |
421 | } |
422 | |
423 | fn eoi_fwd<A: Automaton + ?Sized>( |
424 | dfa: &A, |
425 | bytes: &[u8], |
426 | end: usize, |
427 | state: &mut StateID, |
428 | ) -> Result<Option<HalfMatch>, MatchError> { |
429 | match bytes.get(end) { |
430 | Some(&b) => { |
431 | *state = dfa.next_state(*state, b); |
432 | if dfa.is_match_state(*state) { |
433 | Ok(Some(HalfMatch { |
434 | pattern: dfa.match_pattern(*state, 0), |
435 | offset: end, |
436 | })) |
437 | } else { |
438 | Ok(None) |
439 | } |
440 | } |
441 | None => { |
442 | *state = dfa.next_eoi_state(*state); |
443 | if dfa.is_match_state(*state) { |
444 | Ok(Some(HalfMatch { |
445 | pattern: dfa.match_pattern(*state, 0), |
446 | offset: bytes.len(), |
447 | })) |
448 | } else { |
449 | Ok(None) |
450 | } |
451 | } |
452 | } |
453 | } |
454 | |
455 | fn eoi_rev<A: Automaton + ?Sized>( |
456 | dfa: &A, |
457 | bytes: &[u8], |
458 | start: usize, |
459 | state: StateID, |
460 | ) -> Result<Option<HalfMatch>, MatchError> { |
461 | if start > 0 { |
462 | let state: StateID = dfa.next_state(current:state, input:bytes[start - 1]); |
463 | if dfa.is_match_state(id:state) { |
464 | Ok(Some(HalfMatch { |
465 | pattern: dfa.match_pattern(id:state, index:0), |
466 | offset: start, |
467 | })) |
468 | } else { |
469 | Ok(None) |
470 | } |
471 | } else { |
472 | let state: StateID = dfa.next_eoi_state(current:state); |
473 | if dfa.is_match_state(id:state) { |
474 | Ok(Some(HalfMatch { |
475 | pattern: dfa.match_pattern(id:state, index:0), |
476 | offset: 0, |
477 | })) |
478 | } else { |
479 | Ok(None) |
480 | } |
481 | } |
482 | } |
483 | |
484 | // Currently unused, but is useful to keep around. This was originally used |
485 | // when the code above used raw pointers for its main loop. |
486 | // /// Returns the distance between the given pointer and the start of `bytes`. |
487 | // /// This assumes that the given pointer points to somewhere in the `bytes` |
488 | // /// slice given. |
489 | // fn offset(bytes: &[u8], p: *const u8) -> usize { |
490 | // debug_assert!(bytes.as_ptr() <= p); |
491 | // debug_assert!(bytes[bytes.len()..].as_ptr() >= p); |
492 | // ((p as isize) - (bytes.as_ptr() as isize)) as usize |
493 | // } |
494 | |