| 1 | use crate::{ |
| 2 | dfa::{ |
| 3 | accel, |
| 4 | automaton::{Automaton, OverlappingState, StateMatch}, |
| 5 | }, |
| 6 | util::{ |
| 7 | id::{PatternID, StateID}, |
| 8 | matchtypes::HalfMatch, |
| 9 | prefilter, MATCH_OFFSET, |
| 10 | }, |
| 11 | MatchError, |
| 12 | }; |
| 13 | |
| 14 | #[inline (never)] |
| 15 | pub fn find_earliest_fwd<A: Automaton + ?Sized>( |
| 16 | pre: Option<&mut prefilter::Scanner>, |
| 17 | dfa: &A, |
| 18 | pattern_id: Option<PatternID>, |
| 19 | bytes: &[u8], |
| 20 | start: usize, |
| 21 | end: usize, |
| 22 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 23 | // Searching with a pattern ID is always anchored, so we should never use |
| 24 | // a prefilter. |
| 25 | if pre.is_some() && pattern_id.is_none() { |
| 26 | find_fwd(pre, earliest:true, dfa, pattern_id, haystack:bytes, start, end) |
| 27 | } else { |
| 28 | find_fwd(pre:None, earliest:true, dfa, pattern_id, haystack:bytes, start, end) |
| 29 | } |
| 30 | } |
| 31 | |
| 32 | #[inline (never)] |
| 33 | pub fn find_leftmost_fwd<A: Automaton + ?Sized>( |
| 34 | pre: Option<&mut prefilter::Scanner>, |
| 35 | dfa: &A, |
| 36 | pattern_id: Option<PatternID>, |
| 37 | bytes: &[u8], |
| 38 | start: usize, |
| 39 | end: usize, |
| 40 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 41 | // Searching with a pattern ID is always anchored, so we should never use |
| 42 | // a prefilter. |
| 43 | if pre.is_some() && pattern_id.is_none() { |
| 44 | find_fwd(pre, earliest:false, dfa, pattern_id, haystack:bytes, start, end) |
| 45 | } else { |
| 46 | find_fwd(pre:None, earliest:false, dfa, pattern_id, haystack:bytes, start, end) |
| 47 | } |
| 48 | } |
| 49 | |
| 50 | /// This is marked as `inline(always)` specifically because it supports |
| 51 | /// multiple modes of searching. Namely, the 'pre' and 'earliest' parameters |
| 52 | /// getting inlined eliminate some critical branches. To avoid bloating binary |
| 53 | /// size, we only call this function in a fixed number of places. |
| 54 | #[inline (always)] |
| 55 | fn find_fwd<A: Automaton + ?Sized>( |
| 56 | mut pre: Option<&mut prefilter::Scanner>, |
| 57 | earliest: bool, |
| 58 | dfa: &A, |
| 59 | pattern_id: Option<PatternID>, |
| 60 | haystack: &[u8], |
| 61 | start: usize, |
| 62 | end: usize, |
| 63 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 64 | assert!(start <= end); |
| 65 | assert!(start <= haystack.len()); |
| 66 | assert!(end <= haystack.len()); |
| 67 | |
| 68 | // Why do this? This lets 'bytes[at]' work without bounds checks below. |
| 69 | // It seems the assert on 'end <= haystack.len()' above is otherwise |
| 70 | // not enough. Why not just make 'bytes' scoped this way anyway? Well, |
| 71 | // 'eoi_fwd' (below) might actually want to try to access the byte at 'end' |
| 72 | // for resolving look-ahead. |
| 73 | let bytes = &haystack[..end]; |
| 74 | |
| 75 | let mut state = init_fwd(dfa, pattern_id, haystack, start, end)?; |
| 76 | let mut last_match = None; |
| 77 | let mut at = start; |
| 78 | if let Some(ref mut pre) = pre { |
| 79 | // If a prefilter doesn't report false positives, then we don't need to |
| 80 | // touch the DFA at all. However, since all matches include the pattern |
| 81 | // ID, and the prefilter infrastructure doesn't report pattern IDs, we |
| 82 | // limit this optimization to cases where there is exactly one pattern. |
| 83 | // In that case, any match must be the 0th pattern. |
| 84 | if dfa.pattern_count() == 1 && !pre.reports_false_positives() { |
| 85 | return Ok(pre.next_candidate(bytes, at).into_option().map( |
| 86 | |offset| HalfMatch { pattern: PatternID::ZERO, offset }, |
| 87 | )); |
| 88 | } else if pre.is_effective(at) { |
| 89 | match pre.next_candidate(bytes, at).into_option() { |
| 90 | None => return Ok(None), |
| 91 | Some(i) => { |
| 92 | at = i; |
| 93 | } |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | while at < end { |
| 98 | let byte = bytes[at]; |
| 99 | state = dfa.next_state(state, byte); |
| 100 | at += 1; |
| 101 | if dfa.is_special_state(state) { |
| 102 | if dfa.is_start_state(state) { |
| 103 | if let Some(ref mut pre) = pre { |
| 104 | if pre.is_effective(at) { |
| 105 | match pre.next_candidate(bytes, at).into_option() { |
| 106 | None => return Ok(None), |
| 107 | Some(i) => { |
| 108 | at = i; |
| 109 | } |
| 110 | } |
| 111 | } |
| 112 | } else if dfa.is_accel_state(state) { |
| 113 | let needles = dfa.accelerator(state); |
| 114 | at = accel::find_fwd(needles, bytes, at) |
| 115 | .unwrap_or(bytes.len()); |
| 116 | } |
| 117 | } else if dfa.is_match_state(state) { |
| 118 | last_match = Some(HalfMatch { |
| 119 | pattern: dfa.match_pattern(state, 0), |
| 120 | offset: at - MATCH_OFFSET, |
| 121 | }); |
| 122 | if earliest { |
| 123 | return Ok(last_match); |
| 124 | } |
| 125 | if dfa.is_accel_state(state) { |
| 126 | let needles = dfa.accelerator(state); |
| 127 | at = accel::find_fwd(needles, bytes, at) |
| 128 | .unwrap_or(bytes.len()); |
| 129 | } |
| 130 | } else if dfa.is_accel_state(state) { |
| 131 | let needs = dfa.accelerator(state); |
| 132 | at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); |
| 133 | } else if dfa.is_dead_state(state) { |
| 134 | return Ok(last_match); |
| 135 | } else { |
| 136 | debug_assert!(dfa.is_quit_state(state)); |
| 137 | if last_match.is_some() { |
| 138 | return Ok(last_match); |
| 139 | } |
| 140 | return Err(MatchError::Quit { byte, offset: at - 1 }); |
| 141 | } |
| 142 | } |
| 143 | while at < end && dfa.next_state(state, bytes[at]) == state { |
| 144 | at += 1; |
| 145 | } |
| 146 | } |
| 147 | Ok(eoi_fwd(dfa, haystack, end, &mut state)?.or(last_match)) |
| 148 | } |
| 149 | |
| 150 | #[inline (never)] |
| 151 | pub fn find_earliest_rev<A: Automaton + ?Sized>( |
| 152 | dfa: &A, |
| 153 | pattern_id: Option<PatternID>, |
| 154 | bytes: &[u8], |
| 155 | start: usize, |
| 156 | end: usize, |
| 157 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 158 | find_rev(earliest:true, dfa, pattern_id, bytes, start, end) |
| 159 | } |
| 160 | |
| 161 | #[inline (never)] |
| 162 | pub fn find_leftmost_rev<A: Automaton + ?Sized>( |
| 163 | dfa: &A, |
| 164 | pattern_id: Option<PatternID>, |
| 165 | bytes: &[u8], |
| 166 | start: usize, |
| 167 | end: usize, |
| 168 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 169 | find_rev(earliest:false, dfa, pattern_id, bytes, start, end) |
| 170 | } |
| 171 | |
| 172 | /// This is marked as `inline(always)` specifically because it supports |
| 173 | /// multiple modes of searching. Namely, the 'earliest' boolean getting inlined |
| 174 | /// permits eliminating a few crucial branches. |
| 175 | #[inline (always)] |
| 176 | fn find_rev<A: Automaton + ?Sized>( |
| 177 | earliest: bool, |
| 178 | dfa: &A, |
| 179 | pattern_id: Option<PatternID>, |
| 180 | bytes: &[u8], |
| 181 | start: usize, |
| 182 | end: usize, |
| 183 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 184 | assert!(start <= end); |
| 185 | assert!(start <= bytes.len()); |
| 186 | assert!(end <= bytes.len()); |
| 187 | |
| 188 | let mut state = init_rev(dfa, pattern_id, bytes, start, end)?; |
| 189 | let mut last_match = None; |
| 190 | let mut at = end; |
| 191 | while at > start { |
| 192 | at -= 1; |
| 193 | while at > start && dfa.next_state(state, bytes[at]) == state { |
| 194 | at -= 1; |
| 195 | } |
| 196 | |
| 197 | let byte = bytes[at]; |
| 198 | state = dfa.next_state(state, byte); |
| 199 | if dfa.is_special_state(state) { |
| 200 | if dfa.is_start_state(state) { |
| 201 | if dfa.is_accel_state(state) { |
| 202 | let needles = dfa.accelerator(state); |
| 203 | at = accel::find_rev(needles, bytes, at) |
| 204 | .map(|i| i + 1) |
| 205 | .unwrap_or(0); |
| 206 | } |
| 207 | } else if dfa.is_match_state(state) { |
| 208 | last_match = Some(HalfMatch { |
| 209 | pattern: dfa.match_pattern(state, 0), |
| 210 | offset: at + MATCH_OFFSET, |
| 211 | }); |
| 212 | if earliest { |
| 213 | return Ok(last_match); |
| 214 | } |
| 215 | if dfa.is_accel_state(state) { |
| 216 | let needles = dfa.accelerator(state); |
| 217 | at = accel::find_rev(needles, bytes, at) |
| 218 | .map(|i| i + 1) |
| 219 | .unwrap_or(0); |
| 220 | } |
| 221 | } else if dfa.is_accel_state(state) { |
| 222 | let needles = dfa.accelerator(state); |
| 223 | at = accel::find_rev(needles, bytes, at) |
| 224 | .map(|i| i + 1) |
| 225 | .unwrap_or(0); |
| 226 | } else if dfa.is_dead_state(state) { |
| 227 | return Ok(last_match); |
| 228 | } else { |
| 229 | debug_assert!(dfa.is_quit_state(state)); |
| 230 | if last_match.is_some() { |
| 231 | return Ok(last_match); |
| 232 | } |
| 233 | return Err(MatchError::Quit { byte, offset: at }); |
| 234 | } |
| 235 | } |
| 236 | } |
| 237 | Ok(eoi_rev(dfa, bytes, start, state)?.or(last_match)) |
| 238 | } |
| 239 | |
| 240 | #[inline (never)] |
| 241 | pub fn find_overlapping_fwd<A: Automaton + ?Sized>( |
| 242 | pre: Option<&mut prefilter::Scanner>, |
| 243 | dfa: &A, |
| 244 | pattern_id: Option<PatternID>, |
| 245 | bytes: &[u8], |
| 246 | start: usize, |
| 247 | end: usize, |
| 248 | caller_state: &mut OverlappingState, |
| 249 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 250 | // Searching with a pattern ID is always anchored, so we should only ever |
| 251 | // use a prefilter when no pattern ID is given. |
| 252 | if pre.is_some() && pattern_id.is_none() { |
| 253 | find_overlapping_fwd_imp( |
| 254 | pre, |
| 255 | dfa, |
| 256 | pattern_id, |
| 257 | bytes, |
| 258 | start, |
| 259 | end, |
| 260 | caller_state, |
| 261 | ) |
| 262 | } else { |
| 263 | find_overlapping_fwd_imp( |
| 264 | None, |
| 265 | dfa, |
| 266 | pattern_id, |
| 267 | bytes, |
| 268 | start, |
| 269 | end, |
| 270 | caller_state, |
| 271 | ) |
| 272 | } |
| 273 | } |
| 274 | |
| 275 | /// This is marked as `inline(always)` specifically because it supports |
| 276 | /// multiple modes of searching. Namely, the 'pre' prefilter getting inlined |
| 277 | /// permits eliminating a few crucial branches and reduces code size when it is |
| 278 | /// not used. |
| 279 | #[inline (always)] |
| 280 | fn find_overlapping_fwd_imp<A: Automaton + ?Sized>( |
| 281 | mut pre: Option<&mut prefilter::Scanner>, |
| 282 | dfa: &A, |
| 283 | pattern_id: Option<PatternID>, |
| 284 | bytes: &[u8], |
| 285 | mut start: usize, |
| 286 | end: usize, |
| 287 | caller_state: &mut OverlappingState, |
| 288 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 289 | assert!(start <= end); |
| 290 | assert!(start <= bytes.len()); |
| 291 | assert!(end <= bytes.len()); |
| 292 | |
| 293 | let mut state = match caller_state.id() { |
| 294 | None => init_fwd(dfa, pattern_id, bytes, start, end)?, |
| 295 | Some(id) => { |
| 296 | if let Some(last) = caller_state.last_match() { |
| 297 | let match_count = dfa.match_count(id); |
| 298 | if last.match_index < match_count { |
| 299 | let m = HalfMatch { |
| 300 | pattern: dfa.match_pattern(id, last.match_index), |
| 301 | offset: last.offset, |
| 302 | }; |
| 303 | last.match_index += 1; |
| 304 | return Ok(Some(m)); |
| 305 | } |
| 306 | } |
| 307 | |
| 308 | // This is a subtle but critical detail. If the caller provides a |
| 309 | // non-None state ID, then it must be the case that the state ID |
| 310 | // corresponds to one set by this function. The state ID therefore |
| 311 | // corresponds to a match state, a dead state or some other state. |
| 312 | // However, "some other" state _only_ occurs when the input has |
| 313 | // been exhausted because the only way to stop before then is to |
| 314 | // see a match or a dead/quit state. |
| 315 | // |
| 316 | // If the input is exhausted or if it's a dead state, then |
| 317 | // incrementing the starting position has no relevance on |
| 318 | // correctness, since the loop below will either not execute |
| 319 | // at all or will immediately stop due to being in a dead state. |
| 320 | // (Once in a dead state it is impossible to leave it.) |
| 321 | // |
| 322 | // Therefore, the only case we need to consider is when |
| 323 | // caller_state is a match state. In this case, since our machines |
| 324 | // support the ability to delay a match by a certain number of |
| 325 | // bytes (to support look-around), it follows that we actually |
| 326 | // consumed that many additional bytes on our previous search. When |
| 327 | // the caller resumes their search to find subsequent matches, they |
| 328 | // will use the ending location from the previous match as the next |
| 329 | // starting point, which is `MATCH_OFFSET` bytes PRIOR to where |
| 330 | // we scanned to on the previous search. Therefore, we need to |
| 331 | // compensate by bumping `start` up by `MATCH_OFFSET` bytes. |
| 332 | // |
| 333 | // Incidentally, since MATCH_OFFSET is non-zero, this also makes |
| 334 | // dealing with empty matches convenient. Namely, callers needn't |
| 335 | // special case them when implementing an iterator. Instead, this |
| 336 | // ensures that forward progress is always made. |
| 337 | start += MATCH_OFFSET; |
| 338 | id |
| 339 | } |
| 340 | }; |
| 341 | |
| 342 | let mut at = start; |
| 343 | while at < end { |
| 344 | let byte = bytes[at]; |
| 345 | state = dfa.next_state(state, byte); |
| 346 | at += 1; |
| 347 | if dfa.is_special_state(state) { |
| 348 | caller_state.set_id(state); |
| 349 | if dfa.is_start_state(state) { |
| 350 | if let Some(ref mut pre) = pre { |
| 351 | if pre.is_effective(at) { |
| 352 | match pre.next_candidate(bytes, at).into_option() { |
| 353 | None => return Ok(None), |
| 354 | Some(i) => { |
| 355 | at = i; |
| 356 | } |
| 357 | } |
| 358 | } |
| 359 | } else if dfa.is_accel_state(state) { |
| 360 | let needles = dfa.accelerator(state); |
| 361 | at = accel::find_fwd(needles, bytes, at) |
| 362 | .unwrap_or(bytes.len()); |
| 363 | } |
| 364 | } else if dfa.is_match_state(state) { |
| 365 | let offset = at - MATCH_OFFSET; |
| 366 | caller_state |
| 367 | .set_last_match(StateMatch { match_index: 1, offset }); |
| 368 | return Ok(Some(HalfMatch { |
| 369 | pattern: dfa.match_pattern(state, 0), |
| 370 | offset, |
| 371 | })); |
| 372 | } else if dfa.is_accel_state(state) { |
| 373 | let needs = dfa.accelerator(state); |
| 374 | at = accel::find_fwd(needs, bytes, at).unwrap_or(bytes.len()); |
| 375 | } else if dfa.is_dead_state(state) { |
| 376 | return Ok(None); |
| 377 | } else { |
| 378 | debug_assert!(dfa.is_quit_state(state)); |
| 379 | return Err(MatchError::Quit { byte, offset: at - 1 }); |
| 380 | } |
| 381 | } |
| 382 | } |
| 383 | |
| 384 | let result = eoi_fwd(dfa, bytes, end, &mut state); |
| 385 | caller_state.set_id(state); |
| 386 | if let Ok(Some(ref last_match)) = result { |
| 387 | caller_state.set_last_match(StateMatch { |
| 388 | match_index: 1, |
| 389 | offset: last_match.offset(), |
| 390 | }); |
| 391 | } |
| 392 | result |
| 393 | } |
| 394 | |
| 395 | fn init_fwd<A: Automaton + ?Sized>( |
| 396 | dfa: &A, |
| 397 | pattern_id: Option<PatternID>, |
| 398 | bytes: &[u8], |
| 399 | start: usize, |
| 400 | end: usize, |
| 401 | ) -> Result<StateID, MatchError> { |
| 402 | let state: StateID = dfa.start_state_forward(pattern_id, bytes, start, end); |
| 403 | // Start states can never be match states, since all matches are delayed |
| 404 | // by 1 byte. |
| 405 | assert!(!dfa.is_match_state(state)); |
| 406 | Ok(state) |
| 407 | } |
| 408 | |
| 409 | fn init_rev<A: Automaton + ?Sized>( |
| 410 | dfa: &A, |
| 411 | pattern_id: Option<PatternID>, |
| 412 | bytes: &[u8], |
| 413 | start: usize, |
| 414 | end: usize, |
| 415 | ) -> Result<StateID, MatchError> { |
| 416 | let state: StateID = dfa.start_state_reverse(pattern_id, bytes, start, end); |
| 417 | // Start states can never be match states, since all matches are delayed |
| 418 | // by 1 byte. |
| 419 | assert!(!dfa.is_match_state(state)); |
| 420 | Ok(state) |
| 421 | } |
| 422 | |
| 423 | fn eoi_fwd<A: Automaton + ?Sized>( |
| 424 | dfa: &A, |
| 425 | bytes: &[u8], |
| 426 | end: usize, |
| 427 | state: &mut StateID, |
| 428 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 429 | match bytes.get(end) { |
| 430 | Some(&b) => { |
| 431 | *state = dfa.next_state(*state, b); |
| 432 | if dfa.is_match_state(*state) { |
| 433 | Ok(Some(HalfMatch { |
| 434 | pattern: dfa.match_pattern(*state, 0), |
| 435 | offset: end, |
| 436 | })) |
| 437 | } else { |
| 438 | Ok(None) |
| 439 | } |
| 440 | } |
| 441 | None => { |
| 442 | *state = dfa.next_eoi_state(*state); |
| 443 | if dfa.is_match_state(*state) { |
| 444 | Ok(Some(HalfMatch { |
| 445 | pattern: dfa.match_pattern(*state, 0), |
| 446 | offset: bytes.len(), |
| 447 | })) |
| 448 | } else { |
| 449 | Ok(None) |
| 450 | } |
| 451 | } |
| 452 | } |
| 453 | } |
| 454 | |
| 455 | fn eoi_rev<A: Automaton + ?Sized>( |
| 456 | dfa: &A, |
| 457 | bytes: &[u8], |
| 458 | start: usize, |
| 459 | state: StateID, |
| 460 | ) -> Result<Option<HalfMatch>, MatchError> { |
| 461 | if start > 0 { |
| 462 | let state: StateID = dfa.next_state(current:state, input:bytes[start - 1]); |
| 463 | if dfa.is_match_state(id:state) { |
| 464 | Ok(Some(HalfMatch { |
| 465 | pattern: dfa.match_pattern(id:state, index:0), |
| 466 | offset: start, |
| 467 | })) |
| 468 | } else { |
| 469 | Ok(None) |
| 470 | } |
| 471 | } else { |
| 472 | let state: StateID = dfa.next_eoi_state(current:state); |
| 473 | if dfa.is_match_state(id:state) { |
| 474 | Ok(Some(HalfMatch { |
| 475 | pattern: dfa.match_pattern(id:state, index:0), |
| 476 | offset: 0, |
| 477 | })) |
| 478 | } else { |
| 479 | Ok(None) |
| 480 | } |
| 481 | } |
| 482 | } |
| 483 | |
| 484 | // Currently unused, but is useful to keep around. This was originally used |
| 485 | // when the code above used raw pointers for its main loop. |
| 486 | // /// Returns the distance between the given pointer and the start of `bytes`. |
| 487 | // /// This assumes that the given pointer points to somewhere in the `bytes` |
| 488 | // /// slice given. |
| 489 | // fn offset(bytes: &[u8], p: *const u8) -> usize { |
| 490 | // debug_assert!(bytes.as_ptr() <= p); |
| 491 | // debug_assert!(bytes[bytes.len()..].as_ptr() >= p); |
| 492 | // ((p as isize) - (bytes.as_ptr() as isize)) as usize |
| 493 | // } |
| 494 | |