| 1 | use alloc::{vec, vec::Vec}; |
| 2 | |
| 3 | use crate::{ |
| 4 | int::{NonMaxUsize, U32}, |
| 5 | nfa::{State, StateID, NFA}, |
| 6 | pool::CachePoolGuard, |
| 7 | utf8, |
| 8 | }; |
| 9 | |
/// A PikeVM searcher.
///
/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but
/// augmented to support tracking the offsets of matching capture groups.
#[derive(Clone, Debug)]
pub(crate) struct PikeVM {
    /// The NFA whose transitions this searcher simulates.
    nfa: NFA,
}
| 18 | |
| 19 | impl PikeVM { |
| 20 | /// Create a new PikeVM searcher that uses the given NFA. |
| 21 | pub(crate) fn new(nfa: NFA) -> PikeVM { |
| 22 | PikeVM { nfa } |
| 23 | } |
| 24 | |
| 25 | /// Return the underlying NFA used by this PikeVM. |
| 26 | pub(crate) fn nfa(&self) -> &NFA { |
| 27 | &self.nfa |
| 28 | } |
| 29 | |
| 30 | /// Returns an iterator of non-overlapping matches in the given haystack. |
| 31 | pub(crate) fn find_iter<'r, 'h>( |
| 32 | &'r self, |
| 33 | cache: CachePoolGuard<'r>, |
| 34 | haystack: &'h [u8], |
| 35 | ) -> FindMatches<'r, 'h> { |
| 36 | FindMatches { |
| 37 | pikevm: self, |
| 38 | cache, |
| 39 | haystack, |
| 40 | at: 0, |
| 41 | slots: vec![None, None], |
| 42 | last_match_end: None, |
| 43 | } |
| 44 | } |
| 45 | |
| 46 | /// Returns an iterator of non-overlapping capture matches in the given |
| 47 | /// haystack. |
| 48 | pub(crate) fn captures_iter<'r, 'h>( |
| 49 | &'r self, |
| 50 | cache: CachePoolGuard<'r>, |
| 51 | haystack: &'h [u8], |
| 52 | ) -> CapturesMatches<'r, 'h> { |
| 53 | // OK because the NFA wouldn't have compiled if this could overflow. |
| 54 | let len = self.nfa().group_len().checked_mul(2).unwrap(); |
| 55 | CapturesMatches { |
| 56 | it: FindMatches { |
| 57 | pikevm: self, |
| 58 | cache, |
| 59 | haystack, |
| 60 | at: 0, |
| 61 | slots: vec![None; len], |
| 62 | last_match_end: None, |
| 63 | }, |
| 64 | } |
| 65 | } |
| 66 | |
    /// The implementation of standard leftmost search.
    ///
    /// Capturing group spans are written to `slots`, but only if requested.
    /// `slots` can be any length. Any slot in the NFA that is activated but
    /// which is out of bounds for the given `slots` is ignored.
    ///
    /// The search considers the region of `haystack` beginning at `start`
    /// and bounded by `end`. When `earliest` is true, the search stops as
    /// soon as any match is known instead of continuing to find the longest
    /// leftmost match.
    ///
    /// Returns true if and only if a match was found.
    pub(crate) fn search(
        &self,
        cache: &mut Cache,
        haystack: &[u8],
        start: usize,
        end: usize,
        earliest: bool,
        slots: &mut [Option<NonMaxUsize>],
    ) -> bool {
        cache.setup_search(slots.len());
        if start > end {
            return false;
        }
        // Why do we even care about this? Well, in our `slots` representation,
        // we use usize::MAX as a sentinel to indicate "no match." This isn't
        // problematic so long as our haystack doesn't have a maximal length.
        // Byte slices are guaranteed by Rust to have a length that fits into
        // isize, and so this assert should always pass. But we put it here to
        // make our assumption explicit.
        assert!(
            haystack.len() < core::usize::MAX,
            "byte slice lengths must be less than usize MAX",
        );

        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
        let start_id = self.nfa().start();
        let anchored = self.nfa().is_start_anchored();
        let mut matched = false;
        // Yes, our search doesn't end at `end`, but includes it. This is
        // necessary because matches are delayed by one byte. The delay is used
        // to handle look-behind assertions. In the case of the PikeVM, the
        // delay is implemented by not considering a match to exist until it
        // is visited in `nexts`. Technically, we know a match exists in the
        // previous iteration via `epsilon_closure`.
        let mut at = start;
        while at <= end {
            // If we have no states left to visit, then there are some cases
            // where we know we can quit early or even skip ahead.
            if curr.set.is_empty() {
                // We have a match so we can quit.
                if matched {
                    break;
                }
                // If we're running an anchored search and we've advanced
                // beyond the start position with no other states to try, then
                // we will never observe a match and thus can stop.
                if anchored && at > start {
                    break;
                }
            }
            // Instead of using a hypothetical unanchored start state in the
            // NFA (which doesn't exist, but we could add it), we actually
            // always use its anchored starting state. As a result, when doing
            // an unanchored search, we need to simulate our own '(?s:.)*?'
            // prefix, to permit a match to appear anywhere.
            //
            // Now, we don't *have* to do things this way. We could create
            // a proper unanchored start state in the NFA and do one
            // `epsilon_closure` call from that starting state before the main
            // loop here. And that is just as correct. However, it turns out to
            // be slower than our approach here because it slightly increases
            // the cost of processing each byte by requiring us to visit
            // more NFA states to deal with the additional NFA states in the
            // unanchored prefix. By simulating it explicitly here, we lower
            // those costs substantially. The cost is itself small, but it adds
            // up for large haystacks.
            //
            // In order to simulate the '(?s:.)*?' prefix---which is not
            // greedy---we are careful not to perform an epsilon closure on
            // the start state if we already have a match. Namely, if we
            // did otherwise, we would never reach a terminating condition
            // because there would always be additional states to process.
            if !matched {
                // Since we are adding to the 'curr' active states and since
                // this is for the start ID, we use a slots slice that is
                // guaranteed to have the right length but where every element
                // is absent. This is exactly what we want, because this
                // epsilon closure is responsible for simulating an unanchored
                // '(?s:.)*?' prefix. It is specifically outside of any
                // capturing groups, and thus, using slots that are always
                // absent is correct.
                //
                // Note though that we can't just use `&mut []` here, since
                // this epsilon closure may traverse through `Capture` states
                // transitions, and thus must be able to write offsets to the
                // slots given which are later copied to slot values in `curr`.
                let slots = next.slot_table.all_absent();
                self.epsilon_closure(
                    stack, slots, curr, haystack, at, start_id,
                );
            }
            // Decode the codepoint at `at`. (A reported length of 0 is what
            // terminates this loop below, which is expected to occur when
            // `at` has reached the end of the haystack.)
            let (ch, len) = utf8::decode_lossy(&haystack[at..]);
            if self.nexts(stack, curr, next, haystack, at, ch, len, slots) {
                matched = true;
            }
            // Unless the caller asked us to return early, we need to mush
            // on to see if we can extend our match. (But note that 'nexts'
            // will quit right after seeing a match, as is consistent with
            // leftmost-first match priority.)
            if (earliest && matched) || len == 0 {
                break;
            }
            core::mem::swap(curr, next);
            next.set.clear();
            at += len;
        }
        matched
    }
| 180 | |
| 181 | /// Process the active states in 'curr' to find the states (written to |
| 182 | /// 'next') we should process for the next byte in the haystack. |
| 183 | /// |
| 184 | /// 'stack' is used to perform a depth first traversal of the NFA when |
| 185 | /// computing an epsilon closure. |
| 186 | /// |
| 187 | /// When a match is found, the slots for that match state (in 'curr') are |
| 188 | /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' |
| 189 | /// stops (unless the PikeVM was configured with MatchKind::All semantics). |
| 190 | /// |
| 191 | /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at` |
| 192 | /// in `haystack`. |
| 193 | /// |
| 194 | /// `at_len` is the number of bytes consumed by `at_ch`. This is usually |
| 195 | /// equal to `at_ch.len_utf8()`, but not always. For example, in the case |
| 196 | /// where `at_ch` is the replacement codepoint that results from decoding |
| 197 | /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3. |
| 198 | fn nexts( |
| 199 | &self, |
| 200 | stack: &mut Vec<FollowEpsilon>, |
| 201 | curr: &mut ActiveStates, |
| 202 | next: &mut ActiveStates, |
| 203 | haystack: &[u8], |
| 204 | at: usize, |
| 205 | at_ch: char, |
| 206 | at_len: usize, |
| 207 | slots: &mut [Option<NonMaxUsize>], |
| 208 | ) -> bool { |
| 209 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
| 210 | for sid in set.iter() { |
| 211 | if self.next( |
| 212 | stack, slot_table, next, haystack, at, at_ch, at_len, sid, |
| 213 | ) { |
| 214 | slots.copy_from_slice(slot_table.for_state(sid)); |
| 215 | return true; |
| 216 | } |
| 217 | } |
| 218 | false |
| 219 | } |
| 220 | |
    /// Starting from `sid`, if the position `at` in the `haystack` has a
    /// transition defined out of `sid`, then add the state transitioned to and
    /// its epsilon closure to the `next` set of states to explore.
    ///
    /// `stack` is used by the epsilon closure computation to perform a depth
    /// first traversal of the NFA.
    ///
    /// `curr_slot_table` should be the table of slots for the current set of
    /// states being explored. If there is a transition out of `sid`, then
    /// sid's row in the slot table is used to perform the epsilon closure.
    ///
    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
    /// in `haystack`. The caller provides it so that this routine doesn't
    /// need to re-decode it. (Since it's expected that this routine is called
    /// multiple times for each position.)
    ///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
    /// where `at_ch` is the replacement codepoint that results from decoding
    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
    ///
    /// Returns true if and only if `sid` is a match state.
    fn next(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slot_table: &mut SlotTable,
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        at_ch: char,
        at_len: usize,
        sid: StateID,
    ) -> bool {
        match *self.nfa.state(sid) {
            // These are all epsilon states. They define no transitions on a
            // haystack codepoint, and were already handled by the epsilon
            // closure that put `sid` into the current active set.
            State::Fail
            | State::Goto { .. }
            | State::Splits { .. }
            | State::Capture { .. } => false,
            State::Char { target, ch } => {
                if at_ch == ch && at_len > 0 {
                    let slots = curr_slot_table.for_state(sid);
                    // OK because `at_len` is always derived from the number
                    // of bytes read from `at` that make up `at_ch`. So this
                    // will never wrap.
                    let at = at.wrapping_add(at_len);
                    self.epsilon_closure(
                        stack, slots, next, haystack, at, target,
                    );
                }
                false
            }
            State::Ranges { target, ref ranges } => {
                // N.B. The early `break` below assumes `ranges` is sorted by
                // each range's start codepoint.
                for (start, end) in ranges.iter().copied() {
                    if start > at_ch {
                        break;
                    } else if start <= at_ch && at_ch <= end {
                        if at_len == 0 {
                            return false;
                        }
                        let slots = curr_slot_table.for_state(sid);
                        // OK because `at_len` is always derived from the
                        // number of bytes read from `at` that make up `at_ch`.
                        // So this will never wrap.
                        let at = at.wrapping_add(at_len);
                        self.epsilon_closure(
                            stack, slots, next, haystack, at, target,
                        );
                    }
                }
                false
            }
            // An active match state means the search has found a match.
            State::Match => true,
        }
    }
| 293 | |
| 294 | /// Compute the epsilon closure of `sid`, writing the closure into `next` |
| 295 | /// while copying slot values from `curr_slots` into corresponding states |
| 296 | /// in `next`. `curr_slots` should be the slot values corresponding to |
| 297 | /// `sid`. |
| 298 | /// |
| 299 | /// The given `stack` is used to perform a depth first traversal of the |
| 300 | /// NFA by recursively following all epsilon transitions out of `sid`. |
| 301 | /// Conditional epsilon transitions are followed if and only if they are |
| 302 | /// satisfied for the position `at` in the `input` haystack. |
| 303 | /// |
| 304 | /// While this routine may write to `curr_slots`, once it returns, any |
| 305 | /// writes are undone and the original values (even if absent) are |
| 306 | /// restored. |
| 307 | fn epsilon_closure( |
| 308 | &self, |
| 309 | stack: &mut Vec<FollowEpsilon>, |
| 310 | curr_slots: &mut [Option<NonMaxUsize>], |
| 311 | next: &mut ActiveStates, |
| 312 | haystack: &[u8], |
| 313 | at: usize, |
| 314 | sid: StateID, |
| 315 | ) { |
| 316 | stack.push(FollowEpsilon::Explore(sid)); |
| 317 | while let Some(frame) = stack.pop() { |
| 318 | match frame { |
| 319 | FollowEpsilon::RestoreCapture { slot, offset } => { |
| 320 | curr_slots[slot.as_usize()] = offset; |
| 321 | } |
| 322 | FollowEpsilon::Explore(sid) => { |
| 323 | self.epsilon_closure_explore( |
| 324 | stack, curr_slots, next, haystack, at, sid, |
| 325 | ); |
| 326 | } |
| 327 | } |
| 328 | } |
| 329 | } |
| 330 | |
    /// Explore all of the epsilon transitions out of `sid`. This is mostly
    /// split out from `epsilon_closure` in order to clearly delineate
    /// the actual work of computing an epsilon closure from the stack
    /// book-keeping.
    ///
    /// This will push any additional explorations needed on to `stack`.
    ///
    /// `curr_slots` should refer to the slots for the currently active NFA
    /// state. That is, the current state we are stepping through. These
    /// slots are mutated in place as new `Captures` states are traversed
    /// during epsilon closure, but the slots are restored to their original
    /// values once the full epsilon closure is completed. The ultimate use of
    /// `curr_slots` is to copy them to the corresponding `next_slots`, so that
    /// the capturing group spans are forwarded from the currently active state
    /// to the next.
    ///
    /// `next` refers to the next set of active states. Computing an epsilon
    /// closure may increase the next set of active states.
    ///
    /// `haystack` refers to the what we're searching and `at` refers to the
    /// current position in the haystack. These are used to check whether
    /// conditional epsilon transitions (like look-around) are satisfied at
    /// the current position. If they aren't, then the epsilon closure won't
    /// include them.
    fn epsilon_closure_explore(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slots: &mut [Option<NonMaxUsize>],
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        mut sid: StateID,
    ) {
        // We can avoid pushing some state IDs on to our stack in precisely
        // the cases where a 'push(x)' would be immediately followed by a 'x
        // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
        // to be the next state ID we want to explore once we're done with
        // our initial exploration. In practice, this avoids a lot of stack
        // thrashing.
        loop {
            // Record this state as part of our next set of active states. If
            // we've already explored it, then no need to do it again.
            if !next.set.insert(sid) {
                return;
            }
            match *self.nfa.state(sid) {
                // A state with no epsilon transitions (or `Fail`): the
                // closure stops here. Copy this thread's capture slots over
                // to the newly activated state.
                State::Fail
                | State::Match { .. }
                | State::Char { .. }
                | State::Ranges { .. } => {
                    next.slot_table.for_state(sid).copy_from_slice(curr_slots);
                    return;
                }
                // An unconditional epsilon transition.
                State::Goto { target, look: None } => {
                    sid = target;
                }
                // A conditional epsilon transition, followed only when the
                // look-around assertion holds at `at`.
                State::Goto { target, look: Some(look) } => {
                    if !look.is_match(haystack, at) {
                        return;
                    }
                    sid = target;
                }
                // A split: explore the first target immediately and push the
                // remainder in reverse, so they pop off in order.
                State::Splits { ref targets, reverse: false } => {
                    sid = match targets.get(0) {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[1..]
                            .iter()
                            .copied()
                            .rev()
                            .map(FollowEpsilon::Explore),
                    );
                }
                // A reversed split: the last target is explored immediately
                // and the remainder are pushed in their original order.
                State::Splits { ref targets, reverse: true } => {
                    sid = match targets.last() {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[..targets.len() - 1]
                            .iter()
                            .copied()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Capture { target, slot } => {
                    // There's no need to do anything with slots that
                    // ultimately won't be copied into the caller-provided
                    // 'Captures' value. So we just skip dealing with them at
                    // all.
                    if slot.as_usize() < curr_slots.len() {
                        stack.push(FollowEpsilon::RestoreCapture {
                            slot,
                            offset: curr_slots[slot.as_usize()],
                        });
                        // OK because length of a slice must fit into an isize.
                        curr_slots[slot.as_usize()] =
                            Some(NonMaxUsize::new(at).unwrap());
                    }
                    sid = target;
                }
            }
        }
    }
| 437 | } |
| 438 | |
/// An iterator over all successive non-overlapping matches in a particular
/// haystack. `'r` represents the lifetime of the regex (and its cache guard)
/// and `'h` represents the lifetime of the haystack.
#[derive(Debug)]
pub(crate) struct FindMatches<'r, 'h> {
    /// The PikeVM that executes each search.
    pikevm: &'r PikeVM,
    /// Mutable scratch state used by the PikeVM during each search.
    cache: CachePoolGuard<'r>,
    /// The haystack being searched.
    haystack: &'h [u8],
    /// The offset at which the next search begins.
    at: usize,
    /// The capture slots written by each search. For plain match iteration
    /// this holds just the overall match span (two slots).
    slots: Vec<Option<NonMaxUsize>>,
    /// The end offset of the most recent match, used to prevent empty
    /// matches from overlapping a previous match.
    last_match_end: Option<usize>,
}
| 451 | |
| 452 | impl<'r, 'h> Iterator for FindMatches<'r, 'h> { |
| 453 | type Item = (usize, usize); |
| 454 | |
| 455 | fn next(&mut self) -> Option<(usize, usize)> { |
| 456 | if !self.pikevm.search( |
| 457 | &mut self.cache, |
| 458 | self.haystack, |
| 459 | self.at, |
| 460 | self.haystack.len(), |
| 461 | earliest:false, |
| 462 | &mut self.slots, |
| 463 | ) { |
| 464 | return None; |
| 465 | } |
| 466 | let mut m: (usize, usize) = |
| 467 | (self.slots[0].unwrap().get(), self.slots[1].unwrap().get()); |
| 468 | if m.0 >= m.1 { |
| 469 | m = self.handle_overlapping_empty_match(m)?; |
| 470 | } |
| 471 | self.at = m.1; |
| 472 | self.last_match_end = Some(m.1); |
| 473 | Some(m) |
| 474 | } |
| 475 | } |
| 476 | |
impl<'r, 'h> FindMatches<'r, 'h> {
    /// Handles the special case of an empty match by ensuring that 1) the
    /// iterator always advances and 2) empty matches never overlap with other
    /// matches.
    ///
    /// Returns `None` when skipping past the empty match yields no further
    /// match in the haystack.
    ///
    /// Note that we mark this cold and forcefully prevent inlining because
    /// handling empty matches like this is extremely rare and does require a
    /// bit of code, comparatively. Keeping this code out of the main iterator
    /// function keeps it smaller and more amenable to inlining itself.
    #[cold]
    #[inline(never)]
    fn handle_overlapping_empty_match(
        &mut self,
        mut m: (usize, usize),
    ) -> Option<(usize, usize)> {
        // Precondition: only empty matches are handled here.
        assert!(m.0 >= m.1);
        if Some(m.1) == self.last_match_end {
            // The empty match ends exactly where the previous match ended,
            // so advance past one full codepoint (at least one byte) and
            // search again from the new position.
            let len =
                core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1);
            self.at = self.at.checked_add(len).unwrap();
            if !self.pikevm.search(
                &mut self.cache,
                self.haystack,
                self.at,
                self.haystack.len(),
                false,
                &mut self.slots,
            ) {
                return None;
            }
            m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
        }
        Some(m)
    }
}
| 512 | |
/// An iterator over all successive non-overlapping capture matches in a
/// particular haystack. `'r` represents the lifetime of the regex (and its
/// cache guard) and `'h` represents the lifetime of the haystack.
#[derive(Debug)]
pub(crate) struct CapturesMatches<'r, 'h> {
    /// The underlying match iterator; its `slots` hold the capturing group
    /// offsets for the most recent match.
    it: FindMatches<'r, 'h>,
}
| 520 | |
| 521 | impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { |
| 522 | type Item = Vec<Option<NonMaxUsize>>; |
| 523 | |
| 524 | fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> { |
| 525 | self.it.next()?; |
| 526 | Some(self.it.slots.clone()) |
| 527 | } |
| 528 | } |
| 529 | |
/// A cache represents mutable state that a `PikeVM` requires during a search.
///
/// For a given `PikeVM`, its corresponding cache may be created either via
/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every
/// way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the `PikeVM` from which it was
/// created. It may only be used with that `PikeVM`. A cache and its
/// allocations may be re-purposed via `Cache::reset`, in which case, it can
/// only be used with the new `PikeVM` (and not the old one).
///
/// Per-search initialization is done via `setup_search`, which is called at
/// the start of every `PikeVM::search`.
#[derive(Clone, Debug)]
pub(crate) struct Cache {
    /// Stack used while computing epsilon closure. This effectively lets us
    /// move what is more naturally expressed through recursion to a stack
    /// on the heap.
    stack: Vec<FollowEpsilon>,
    /// The current active states being explored for the current byte in the
    /// haystack.
    curr: ActiveStates,
    /// The next set of states we're building that will be explored for the
    /// next byte in the haystack.
    next: ActiveStates,
}
| 553 | |
| 554 | impl Cache { |
| 555 | /// Create a new `PikeVM` cache. |
| 556 | /// |
| 557 | /// A potentially more convenient routine to create a cache is |
| 558 | /// `PikeVM::create_cache`, as it does not require also importing the |
| 559 | /// `Cache` type. |
| 560 | /// |
| 561 | /// If you want to reuse the returned `Cache` with some other `PikeVM`, |
| 562 | /// then you must call `Cache::reset` with the desired `PikeVM`. |
| 563 | pub(crate) fn new(re: &PikeVM) -> Cache { |
| 564 | Cache { |
| 565 | stack: vec![], |
| 566 | curr: ActiveStates::new(re), |
| 567 | next: ActiveStates::new(re), |
| 568 | } |
| 569 | } |
| 570 | |
| 571 | /// Clears this cache. This should be called at the start of every search |
| 572 | /// to ensure we start with a clean slate. |
| 573 | /// |
| 574 | /// This also sets the length of the capturing groups used in the current |
| 575 | /// search. This permits an optimization where by 'SlotTable::for_state' |
| 576 | /// only returns the number of slots equivalent to the number of slots |
| 577 | /// given in the 'Captures' value. This may be less than the total number |
| 578 | /// of possible slots, e.g., when one only wants to track overall match |
| 579 | /// offsets. This in turn permits less copying of capturing group spans |
| 580 | /// in the PikeVM. |
| 581 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 582 | self.stack.clear(); |
| 583 | self.curr.setup_search(captures_slot_len); |
| 584 | self.next.setup_search(captures_slot_len); |
| 585 | } |
| 586 | } |
| 587 | |
/// A set of active states used to "simulate" the execution of an NFA via the
/// PikeVM.
///
/// There are two sets of these used during NFA simulation. One set corresponds
/// to the "current" set of states being traversed for the current position
/// in a haystack. The other set corresponds to the "next" set of states being
/// built, which will become the new "current" set for the next position in the
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
///
/// In addition to representing a set of NFA states, this also maintains slot
/// values for each state. These slot values are what turn the NFA simulation
/// into the "Pike VM." Namely, they track capturing group values for each
/// state. During the computation of epsilon closure, we copy slot values from
/// states in the "current" set to the "next" set. Eventually, once a match
/// is found, the slot values for that match state are what we write to the
/// caller provided slots.
#[derive(Clone, Debug)]
struct ActiveStates {
    /// The set of active NFA states. This set preserves insertion order, which
    /// is critical for simulating the match semantics of backtracking regex
    /// engines.
    set: SparseSet,
    /// The slots for every NFA state, where each slot stores a (possibly
    /// absent) offset. Every capturing group has two slots. One for a start
    /// offset and one for an end offset.
    slot_table: SlotTable,
}
| 616 | |
| 617 | impl ActiveStates { |
| 618 | /// Create a new set of active states for the given PikeVM. The active |
| 619 | /// states returned may only be used with the given PikeVM. (Use 'reset' |
| 620 | /// to re-purpose the allocation for a different PikeVM.) |
| 621 | fn new(re: &PikeVM) -> ActiveStates { |
| 622 | let mut active = ActiveStates { |
| 623 | set: SparseSet::new(0), |
| 624 | slot_table: SlotTable::new(), |
| 625 | }; |
| 626 | active.reset(re); |
| 627 | active |
| 628 | } |
| 629 | |
| 630 | /// Reset this set of active states such that it can be used with the given |
| 631 | /// PikeVM (and only that PikeVM). |
| 632 | fn reset(&mut self, re: &PikeVM) { |
| 633 | self.set.resize(re.nfa().len()); |
| 634 | self.slot_table.reset(re); |
| 635 | } |
| 636 | |
| 637 | /// Setup this set of active states for a new search. The given slot |
| 638 | /// length should be the number of slots in a caller provided 'Captures' |
| 639 | /// (and may be zero). |
| 640 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 641 | self.set.clear(); |
| 642 | self.slot_table.setup_search(captures_slot_len); |
| 643 | } |
| 644 | } |
| 645 | |
/// A table of slots, where each row represent a state in an NFA. Thus, the
/// table has room for storing slots for every single state in an NFA.
///
/// This table is represented with a single contiguous allocation. In general,
/// the notion of "capturing group" doesn't really exist at this level of
/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
/// maps to a pair of slots, one for the start offset and one for the end
/// offset.) Slots are indexed by the `Captures` NFA state.
#[derive(Clone, Debug)]
struct SlotTable {
    /// The actual table of offsets, stored row-major: one row per NFA state.
    table: Vec<Option<NonMaxUsize>>,
    /// The number of slots per state, i.e., the table's stride or the length
    /// of each row.
    slots_per_state: usize,
    /// The number of slots in the caller-provided `Captures` value for the
    /// current search. Setting this to `slots_per_state` is always correct,
    /// but may be wasteful.
    slots_for_captures: usize,
}
| 666 | |
| 667 | impl SlotTable { |
| 668 | /// Create a new slot table. |
| 669 | /// |
| 670 | /// One should call 'reset' with the corresponding PikeVM before use. |
| 671 | fn new() -> SlotTable { |
| 672 | SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } |
| 673 | } |
| 674 | |
    /// Reset this slot table such that it can be used with the given PikeVM
    /// (and only that PikeVM).
    fn reset(&mut self, re: &PikeVM) {
        let nfa = re.nfa();
        // Two slots (start and end) per capturing group.
        // OK because NFA construction would have failed if this overflowed.
        self.slots_per_state = nfa.group_len().checked_mul(2).unwrap();
        // This is always correct, but may be reduced for a particular search
        // if fewer slots were given by the caller, e.g., none at all or only
        // slots for tracking the overall match instead of all slots for every
        // group.
        self.slots_for_captures = self.slots_per_state;
        let len = nfa
            .len()
            // We add 1 so that our last row is always empty. We use it as
            // "scratch" space for computing the epsilon closure off of the
            // starting state.
            .checked_add(1)
            .and_then(|x| x.checked_mul(self.slots_per_state))
            // It seems like this could actually panic on legitimate inputs
            // on 32-bit targets. Should we somehow convert this to an error?
            // What about something similar for the lazy DFA cache? If you're
            // tripping this assert, please file a bug.
            .expect("slot table length doesn't overflow");
        // Grow (or shrink) to one row per NFA state, plus the scratch row.
        self.table.resize(len, None);
    }
| 700 | |
    /// Perform any per-search setup for this slot table.
    ///
    /// In particular, this sets the length of the number of slots used in the
    /// slots given by the caller (if any at all). This number may be smaller
    /// than the total number of slots available, e.g., when the caller is only
    /// interested in tracking the overall match and not the spans of every
    /// matching capturing group. Only tracking the overall match can save a
    /// substantial amount of time copying capturing spans during a search.
    fn setup_search(&mut self, captures_slot_len: usize) {
        // Row slices handed out by `for_state`/`all_absent` are truncated to
        // this length for the duration of the search.
        self.slots_for_captures = captures_slot_len;
    }
| 712 | |
| 713 | /// Return a mutable slice of the slots for the given state. |
| 714 | /// |
| 715 | /// Note that the length of the slice returned may be less than the total |
| 716 | /// number of slots available for this state. In particular, the length |
| 717 | /// always matches the number of slots indicated via `setup_search`. |
| 718 | fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { |
| 719 | let i = sid.as_usize() * self.slots_per_state; |
| 720 | &mut self.table[i..i + self.slots_for_captures] |
| 721 | } |
| 722 | |
| 723 | /// Return a slice of slots of appropriate length where every slot offset |
| 724 | /// is guaranteed to be absent. This is useful in cases where you need to |
| 725 | /// compute an epsilon closure outside of the user supplied regex, and thus |
| 726 | /// never want it to have any capturing slots set. |
| 727 | fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { |
| 728 | let i = self.table.len() - self.slots_per_state; |
| 729 | &mut self.table[i..i + self.slots_for_captures] |
| 730 | } |
| 731 | } |
| 732 | |
| 733 | /// Represents a stack frame for use while computing an epsilon closure. |
| 734 | /// |
| 735 | /// (An "epsilon closure" refers to the set of reachable NFA states from a |
| 736 | /// single state without consuming any input. That is, the set of all epsilon |
| 737 | /// transitions not only from that single state, but from every other state |
| 738 | /// reachable by an epsilon transition as well. This is why it's called a |
| 739 | /// "closure.") |
| 740 | /// |
| 741 | /// Computing the epsilon closure in a Thompson NFA proceeds via a depth |
| 742 | /// first traversal over all epsilon transitions from a particular state. |
| 743 | /// (A depth first traversal is important because it emulates the same priority |
| 744 | /// of matches that is typically found in backtracking regex engines.) This |
| 745 | /// depth first traversal is naturally expressed using recursion, but to avoid |
| 746 | /// a call stack size proportional to the size of a regex, we put our stack on |
| 747 | /// the heap instead. |
| 748 | /// |
| 749 | /// This stack thus consists of call frames. The typical call frame is |
| 750 | /// `Explore`, which instructs epsilon closure to explore the epsilon |
| 751 | /// transitions from that state. (Subsequent epsilon transitions are then |
| 752 | /// pushed on to the stack as more `Explore` frames.) If the state ID being |
| 753 | /// explored has no epsilon transitions, then the capturing group slots are |
| 754 | /// copied from the original state that sparked the epsilon closure (from the |
| 755 | /// 'step' routine) to the state ID being explored. This way, capturing group |
| 756 | /// slots are forwarded from the previous state to the next. |
| 757 | /// |
| 758 | /// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to |
| 759 | /// set the position for a particular slot back to some particular offset. This |
| 760 | /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will |
| 761 | /// set the offset of the slot indicated in `Capture` to the current offset, |
| 762 | /// and then push the old offset on to the stack as a `RestoreCapture` frame. |
| 763 | /// Thus, the new offset is only used until the epsilon closure reverts back to |
| 764 | /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon |
| 765 | /// transition its "scope" to only states that come "after" it during depth |
| 766 | /// first traversal. |
| 767 | #[derive (Clone, Debug)] |
| 768 | enum FollowEpsilon { |
| 769 | /// Explore the epsilon transitions from a state ID. |
| 770 | Explore(StateID), |
| 771 | /// Reset the given `slot` to the given `offset` (which might be `None`). |
| 772 | RestoreCapture { slot: u32, offset: Option<NonMaxUsize> }, |
| 773 | } |
| 774 | |
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: <https://research.swtch.com/sparse>
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive (Clone)]
struct SparseSet {
    /// The number of elements currently in this set.
    len: usize,
    /// Dense contains the ids in the order in which they were inserted.
    /// Only the first `len` entries are meaningful; the rest are stale.
    dense: Vec<StateID>,
    /// Sparse maps ids to their location in dense.
    ///
    /// A state ID is in the set if and only if
    /// sparse[id] < len && id == dense[sparse[id]].
    ///
    /// Note that these are indices into 'dense'. It's a little weird to use
    /// StateID here, but we know our length can never exceed the bounds of
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
    /// where as a usize is likely double that in most cases.
    sparse: Vec<StateID>,
}
| 802 | |
| 803 | impl SparseSet { |
| 804 | /// Create a new sparse set with the given capacity. |
| 805 | /// |
| 806 | /// Sparse sets have a fixed size and they cannot grow. Attempting to |
| 807 | /// insert more distinct elements than the total capacity of the set will |
| 808 | /// result in a panic. |
| 809 | /// |
| 810 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| 811 | fn new(capacity: usize) -> SparseSet { |
| 812 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |
| 813 | set.resize(capacity); |
| 814 | set |
| 815 | } |
| 816 | |
| 817 | /// Resizes this sparse set to have the new capacity given. |
| 818 | /// |
| 819 | /// This set is automatically cleared. |
| 820 | /// |
| 821 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| 822 | fn resize(&mut self, new_capacity: usize) { |
| 823 | assert!( |
| 824 | new_capacity <= u32::MAX.as_usize(), |
| 825 | "sparse set capacity cannot excced {:?}" , |
| 826 | u32::MAX, |
| 827 | ); |
| 828 | self.clear(); |
| 829 | self.dense.resize(new_capacity, 0); |
| 830 | self.sparse.resize(new_capacity, 0); |
| 831 | } |
| 832 | |
| 833 | /// Returns the capacity of this set. |
| 834 | /// |
| 835 | /// The capacity represents a fixed limit on the number of distinct |
| 836 | /// elements that are allowed in this set. The capacity cannot be changed. |
| 837 | fn capacity(&self) -> usize { |
| 838 | self.dense.len() |
| 839 | } |
| 840 | |
| 841 | /// Returns the number of elements in this set. |
| 842 | fn len(&self) -> usize { |
| 843 | self.len |
| 844 | } |
| 845 | |
| 846 | /// Returns true if and only if this set is empty. |
| 847 | fn is_empty(&self) -> bool { |
| 848 | self.len() == 0 |
| 849 | } |
| 850 | |
| 851 | /// Insert the state ID value into this set and return true if the given |
| 852 | /// state ID was not previously in this set. |
| 853 | /// |
| 854 | /// This operation is idempotent. If the given value is already in this |
| 855 | /// set, then this is a no-op. |
| 856 | /// |
| 857 | /// If more than `capacity` ids are inserted, then this panics. |
| 858 | fn insert(&mut self, id: StateID) -> bool { |
| 859 | if self.contains(id) { |
| 860 | return false; |
| 861 | } |
| 862 | |
| 863 | let index = self.len(); |
| 864 | assert!( |
| 865 | index < self.capacity(), |
| 866 | " {:?} exceeds capacity of {:?} when inserting {:?}" , |
| 867 | index, |
| 868 | self.capacity(), |
| 869 | id, |
| 870 | ); |
| 871 | self.dense[index] = id; |
| 872 | // OK because we don't permit the capacity to be set higher than |
| 873 | // u32::MAX. |
| 874 | self.sparse[id.as_usize()] = u32::try_from(index).unwrap(); |
| 875 | self.len += 1; |
| 876 | true |
| 877 | } |
| 878 | |
| 879 | /// Returns true if and only if this set contains the given value. |
| 880 | fn contains(&self, id: StateID) -> bool { |
| 881 | let index = self.sparse[id.as_usize()]; |
| 882 | index.as_usize() < self.len() && self.dense[index.as_usize()] == id |
| 883 | } |
| 884 | |
| 885 | /// Clear this set such that it has no members. |
| 886 | fn clear(&mut self) { |
| 887 | self.len = 0; |
| 888 | } |
| 889 | |
| 890 | /// Returns an iterator over all the state IDs in this set in the order in |
| 891 | /// which they were inserted. |
| 892 | fn iter(&self) -> SparseSetIter<'_> { |
| 893 | SparseSetIter(self.dense[..self.len()].iter()) |
| 894 | } |
| 895 | } |
| 896 | |
| 897 | impl core::fmt::Debug for SparseSet { |
| 898 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 899 | let elements: Vec<StateID> = self.iter().collect(); |
| 900 | f.debug_tuple(name:"SparseSet" ).field(&elements).finish() |
| 901 | } |
| 902 | } |
| 903 | |
/// An iterator over all elements in a sparse set.
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
///
/// This wraps a slice iterator over the live prefix of the set's `dense`
/// vector, so elements are yielded in insertion order.
#[derive (Debug)]
struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
| 909 | |
| 910 | impl<'a> Iterator for SparseSetIter<'a> { |
| 911 | type Item = StateID; |
| 912 | |
| 913 | fn next(&mut self) -> Option<StateID> { |
| 914 | self.0.next().map(|&id: u32| id) |
| 915 | } |
| 916 | } |
| 917 | |