1 | use alloc::{vec, vec::Vec}; |
2 | |
3 | use crate::{ |
4 | int::{NonMaxUsize, U32}, |
5 | nfa::{State, StateID, NFA}, |
6 | pool::CachePoolGuard, |
7 | utf8, |
8 | }; |
9 | |
/// A PikeVM searcher.
///
/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but
/// augmented to support tracking the offsets of matching capture groups.
#[derive (Clone, Debug)]
pub(crate) struct PikeVM {
    /// The NFA whose states this searcher simulates. All `StateID`s used
    /// during a search refer to states in this NFA.
    nfa: NFA,
}
18 | |
19 | impl PikeVM { |
20 | /// Create a new PikeVM searcher that uses the given NFA. |
21 | pub(crate) fn new(nfa: NFA) -> PikeVM { |
22 | PikeVM { nfa } |
23 | } |
24 | |
    /// Return the underlying NFA used by this PikeVM.
    ///
    /// This is a cheap borrowing accessor; nothing is copied.
    pub(crate) fn nfa(&self) -> &NFA {
        &self.nfa
    }
29 | |
30 | /// Returns an iterator of non-overlapping matches in the given haystack. |
31 | pub(crate) fn find_iter<'r, 'h>( |
32 | &'r self, |
33 | cache: CachePoolGuard<'r>, |
34 | haystack: &'h [u8], |
35 | ) -> FindMatches<'r, 'h> { |
36 | FindMatches { |
37 | pikevm: self, |
38 | cache, |
39 | haystack, |
40 | at: 0, |
41 | slots: vec![None, None], |
42 | last_match_end: None, |
43 | } |
44 | } |
45 | |
46 | /// Returns an iterator of non-overlapping capture matches in the given |
47 | /// haystack. |
48 | pub(crate) fn captures_iter<'r, 'h>( |
49 | &'r self, |
50 | cache: CachePoolGuard<'r>, |
51 | haystack: &'h [u8], |
52 | ) -> CapturesMatches<'r, 'h> { |
53 | // OK because the NFA wouldn't have compiled if this could overflow. |
54 | let len = self.nfa().group_len().checked_mul(2).unwrap(); |
55 | CapturesMatches { |
56 | it: FindMatches { |
57 | pikevm: self, |
58 | cache, |
59 | haystack, |
60 | at: 0, |
61 | slots: vec![None; len], |
62 | last_match_end: None, |
63 | }, |
64 | } |
65 | } |
66 | |
67 | /// The implementation of standard leftmost search. |
68 | /// |
69 | /// Capturing group spans are written to `slots`, but only if requested. |
70 | /// `slots` can be any length. Any slot in the NFA that is activated but |
71 | /// which is out of bounds for the given `slots` is ignored. |
72 | pub(crate) fn search( |
73 | &self, |
74 | cache: &mut Cache, |
75 | haystack: &[u8], |
76 | start: usize, |
77 | end: usize, |
78 | earliest: bool, |
79 | slots: &mut [Option<NonMaxUsize>], |
80 | ) -> bool { |
81 | cache.setup_search(slots.len()); |
82 | if start > end { |
83 | return false; |
84 | } |
85 | // Why do we even care about this? Well, in our `slots` representation, |
86 | // we use usize::MAX as a sentinel to indicate "no match." This isn't |
87 | // problematic so long as our haystack doesn't have a maximal length. |
88 | // Byte slices are guaranteed by Rust to have a length that fits into |
89 | // isize, and so this assert should always pass. But we put it here to |
90 | // make our assumption explicit. |
91 | assert!( |
92 | haystack.len() < core::usize::MAX, |
93 | "byte slice lengths must be less than usize MAX" , |
94 | ); |
95 | |
96 | let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
97 | let start_id = self.nfa().start(); |
98 | let anchored = self.nfa().is_start_anchored(); |
99 | let mut matched = false; |
100 | // Yes, our search doesn't end at `end`, but includes it. This is |
101 | // necessary because matches are delayed by one byte. The delay is used |
102 | // to handle look-behind assertions. In the case of the PikeVM, the |
103 | // delay is implemented by not considering a match to exist until it |
104 | // is visited in `nexts`. Technically, we know a match exists in the |
105 | // previous iteration via `epsilon_closure`. |
106 | let mut at = start; |
107 | while at <= end { |
108 | // If we have no states left to visit, then there are some cases |
109 | // where we know we can quit early or even skip ahead. |
110 | if curr.set.is_empty() { |
111 | // We have a match so we can quit. |
112 | if matched { |
113 | break; |
114 | } |
115 | // If we're running an anchored search and we've advanced |
116 | // beyond the start position with no other states to try, then |
117 | // we will never observe a match and thus can stop. |
118 | if anchored && at > start { |
119 | break; |
120 | } |
121 | } |
122 | // Instead of using a hypothetical unanchored start state in the |
123 | // NFA (which doesn't exist, but we could add it), we actually |
124 | // always use its anchored starting state. As a result, when doing |
125 | // an unanchored search, we need to simulate our own '(?s:.)*?' |
126 | // prefix, to permit a match to appear anywhere. |
127 | // |
128 | // Now, we don't *have* to do things this way. We could create |
129 | // a proper unanchored start state in the NFA and do one |
130 | // `epsilon_closure` call from that starting state before the main |
131 | // loop here. And that is just as correct. However, it turns out to |
132 | // be slower than our approach here because it slightly increases |
133 | // the cost of processing each byte by requiring us to visit |
134 | // more NFA states to deal with the additional NFA states in the |
135 | // unanchored prefix. By simulating it explicitly here, we lower |
136 | // those costs substantially. The cost is itself small, but it adds |
137 | // up for large haystacks. |
138 | // |
139 | // In order to simulate the '(?s:.)*?' prefix---which is not |
140 | // greedy---we are careful not to perform an epsilon closure on |
141 | // the start state if we already have a match. Namely, if we |
142 | // did otherwise, we would never reach a terminating condition |
143 | // because there would always be additional states to process. |
144 | if !matched { |
145 | // Since we are adding to the 'curr' active states and since |
146 | // this is for the start ID, we use a slots slice that is |
147 | // guaranteed to have the right length but where every element |
148 | // is absent. This is exactly what we want, because this |
149 | // epsilon closure is responsible for simulating an unanchored |
150 | // '(?s:.)*?' prefix. It is specifically outside of any |
151 | // capturing groups, and thus, using slots that are always |
152 | // absent is correct. |
153 | // |
154 | // Note though that we can't just use `&mut []` here, since |
155 | // this epsilon closure may traverse through `Capture` states |
156 | // transitions, and thus must be able to write offsets to the |
157 | // slots given which are later copied to slot values in `curr`. |
158 | let slots = next.slot_table.all_absent(); |
159 | self.epsilon_closure( |
160 | stack, slots, curr, haystack, at, start_id, |
161 | ); |
162 | } |
163 | let (ch, len) = utf8::decode_lossy(&haystack[at..]); |
164 | if self.nexts(stack, curr, next, haystack, at, ch, len, slots) { |
165 | matched = true; |
166 | } |
167 | // Unless the caller asked us to return early, we need to mush |
168 | // on to see if we can extend our match. (But note that 'nexts' |
169 | // will quit right after seeing a match, as is consistent with |
170 | // leftmost-first match priority.) |
171 | if (earliest && matched) || len == 0 { |
172 | break; |
173 | } |
174 | core::mem::swap(curr, next); |
175 | next.set.clear(); |
176 | at += len; |
177 | } |
178 | matched |
179 | } |
180 | |
181 | /// Process the active states in 'curr' to find the states (written to |
182 | /// 'next') we should process for the next byte in the haystack. |
183 | /// |
184 | /// 'stack' is used to perform a depth first traversal of the NFA when |
185 | /// computing an epsilon closure. |
186 | /// |
187 | /// When a match is found, the slots for that match state (in 'curr') are |
188 | /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr' |
189 | /// stops (unless the PikeVM was configured with MatchKind::All semantics). |
190 | /// |
191 | /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at` |
192 | /// in `haystack`. |
193 | /// |
194 | /// `at_len` is the number of bytes consumed by `at_ch`. This is usually |
195 | /// equal to `at_ch.len_utf8()`, but not always. For example, in the case |
196 | /// where `at_ch` is the replacement codepoint that results from decoding |
197 | /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3. |
198 | fn nexts( |
199 | &self, |
200 | stack: &mut Vec<FollowEpsilon>, |
201 | curr: &mut ActiveStates, |
202 | next: &mut ActiveStates, |
203 | haystack: &[u8], |
204 | at: usize, |
205 | at_ch: char, |
206 | at_len: usize, |
207 | slots: &mut [Option<NonMaxUsize>], |
208 | ) -> bool { |
209 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
210 | for sid in set.iter() { |
211 | if self.next( |
212 | stack, slot_table, next, haystack, at, at_ch, at_len, sid, |
213 | ) { |
214 | slots.copy_from_slice(slot_table.for_state(sid)); |
215 | return true; |
216 | } |
217 | } |
218 | false |
219 | } |
220 | |
    /// Starting from `sid`, if the position `at` in the `haystack` has a
    /// transition defined out of `sid`, then add the state transitioned to and
    /// its epsilon closure to the `next` set of states to explore.
    ///
    /// `stack` is used by the epsilon closure computation to perform a depth
    /// first traversal of the NFA.
    ///
    /// `curr_slot_table` should be the table of slots for the current set of
    /// states being explored. If there is a transition out of `sid`, then
    /// sid's row in the slot table is used to perform the epsilon closure.
    ///
    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
    /// in `haystack`. The caller provides it so that this routine doesn't
    /// need to re-decode it. (Since it's expected that this routine is called
    /// multiple times for each position.)
    ///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
    /// where `at_ch` is the replacement codepoint that results from decoding
    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
    ///
    /// Returns true if and only if `sid` is a match state.
    fn next(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slot_table: &mut SlotTable,
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        at_ch: char,
        at_len: usize,
        sid: StateID,
    ) -> bool {
        match *self.nfa.state(sid) {
            // These states have no byte transitions. Their epsilon
            // transitions were already followed by the closure that put `sid`
            // into the current set, so there is nothing left to do here.
            State::Fail
            | State::Goto { .. }
            | State::Splits { .. }
            | State::Capture { .. } => false,
            State::Char { target, ch } => {
                // `at_len > 0` rejects the case where there was no character
                // left to decode (i.e., the end of the haystack).
                if at_ch == ch && at_len > 0 {
                    let slots = curr_slot_table.for_state(sid);
                    // OK because `at_len` is always derived from the number
                    // of bytes read from `at` that make up `at_ch`. So this
                    // will never wrap.
                    let at = at.wrapping_add(at_len);
                    self.epsilon_closure(
                        stack, slots, next, haystack, at, target,
                    );
                }
                false
            }
            State::Ranges { target, ref ranges } => {
                for (start, end) in ranges.iter().copied() {
                    // NOTE: this early break relies on `ranges` being sorted
                    // by lower bound; once `start` exceeds `at_ch`, no later
                    // range can contain it.
                    if start > at_ch {
                        break;
                    } else if start <= at_ch && at_ch <= end {
                        // A zero `at_len` means there was no character to
                        // consume, so a byte transition cannot be taken.
                        if at_len == 0 {
                            return false;
                        }
                        let slots = curr_slot_table.for_state(sid);
                        // OK because `at_len` is always derived from the
                        // number of bytes read from `at` that make up `at_ch`.
                        // So this will never wrap.
                        let at = at.wrapping_add(at_len);
                        self.epsilon_closure(
                            stack, slots, next, haystack, at, target,
                        );
                    }
                }
                false
            }
            // A match state: report it to the caller (`nexts`), which copies
            // this state's slots out and stops processing.
            State::Match => true,
        }
    }
293 | |
294 | /// Compute the epsilon closure of `sid`, writing the closure into `next` |
295 | /// while copying slot values from `curr_slots` into corresponding states |
296 | /// in `next`. `curr_slots` should be the slot values corresponding to |
297 | /// `sid`. |
298 | /// |
299 | /// The given `stack` is used to perform a depth first traversal of the |
300 | /// NFA by recursively following all epsilon transitions out of `sid`. |
301 | /// Conditional epsilon transitions are followed if and only if they are |
302 | /// satisfied for the position `at` in the `input` haystack. |
303 | /// |
304 | /// While this routine may write to `curr_slots`, once it returns, any |
305 | /// writes are undone and the original values (even if absent) are |
306 | /// restored. |
307 | fn epsilon_closure( |
308 | &self, |
309 | stack: &mut Vec<FollowEpsilon>, |
310 | curr_slots: &mut [Option<NonMaxUsize>], |
311 | next: &mut ActiveStates, |
312 | haystack: &[u8], |
313 | at: usize, |
314 | sid: StateID, |
315 | ) { |
316 | stack.push(FollowEpsilon::Explore(sid)); |
317 | while let Some(frame) = stack.pop() { |
318 | match frame { |
319 | FollowEpsilon::RestoreCapture { slot, offset } => { |
320 | curr_slots[slot.as_usize()] = offset; |
321 | } |
322 | FollowEpsilon::Explore(sid) => { |
323 | self.epsilon_closure_explore( |
324 | stack, curr_slots, next, haystack, at, sid, |
325 | ); |
326 | } |
327 | } |
328 | } |
329 | } |
330 | |
    /// Explore all of the epsilon transitions out of `sid`. This is mostly
    /// split out from `epsilon_closure` in order to clearly delineate
    /// the actual work of computing an epsilon closure from the stack
    /// book-keeping.
    ///
    /// This will push any additional explorations needed on to `stack`.
    ///
    /// `curr_slots` should refer to the slots for the currently active NFA
    /// state. That is, the current state we are stepping through. These
    /// slots are mutated in place as new `Captures` states are traversed
    /// during epsilon closure, but the slots are restored to their original
    /// values once the full epsilon closure is completed. The ultimate use of
    /// `curr_slots` is to copy them to the corresponding `next_slots`, so that
    /// the capturing group spans are forwarded from the currently active state
    /// to the next.
    ///
    /// `next` refers to the next set of active states. Computing an epsilon
    /// closure may increase the next set of active states.
    ///
    /// `haystack` refers to the what we're searching and `at` refers to the
    /// current position in the haystack. These are used to check whether
    /// conditional epsilon transitions (like look-around) are satisfied at
    /// the current position. If they aren't, then the epsilon closure won't
    /// include them.
    fn epsilon_closure_explore(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slots: &mut [Option<NonMaxUsize>],
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        mut sid: StateID,
    ) {
        // We can avoid pushing some state IDs on to our stack in precisely
        // the cases where a 'push(x)' would be immediately followed by a 'x
        // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
        // to be the next state ID we want to explore once we're done with
        // our initial exploration. In practice, this avoids a lot of stack
        // thrashing.
        loop {
            // Record this state as part of our next set of active states. If
            // we've already explored it, then no need to do it again.
            if !next.set.insert(sid) {
                return;
            }
            match *self.nfa.state(sid) {
                // States with no outgoing epsilon transitions end this branch
                // of the traversal. Forward the current slot values into this
                // state's row so that a later byte step (or match) sees them.
                State::Fail
                | State::Match { .. }
                | State::Char { .. }
                | State::Ranges { .. } => {
                    next.slot_table.for_state(sid).copy_from_slice(curr_slots);
                    return;
                }
                // An unconditional epsilon transition: keep following inline,
                // no stack frame needed.
                State::Goto { target, look: None } => {
                    sid = target;
                }
                // A conditional epsilon transition: follow it only when the
                // look-around assertion holds at the current position.
                State::Goto { target, look: Some(look) } => {
                    if !look.is_match(haystack, at) {
                        return;
                    }
                    sid = target;
                }
                // An alternation: explore the first branch now and queue the
                // rest. Pushing the remainder in reverse makes the stack pop
                // them in their original (priority) order.
                State::Splits { ref targets, reverse: false } => {
                    sid = match targets.get(0) {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[1..]
                            .iter()
                            .copied()
                            .rev()
                            .map(FollowEpsilon::Explore),
                    );
                }
                // An alternation with reversed priority: explore the last
                // branch now and queue the preceding ones in forward order.
                State::Splits { ref targets, reverse: true } => {
                    sid = match targets.last() {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[..targets.len() - 1]
                            .iter()
                            .copied()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Capture { target, slot } => {
                    // There's no need to do anything with slots that
                    // ultimately won't be copied into the caller-provided
                    // 'Captures' value. So we just skip dealing with them at
                    // all.
                    if slot.as_usize() < curr_slots.len() {
                        // Save the old offset so the closure driver can
                        // restore it after this branch is fully explored.
                        stack.push(FollowEpsilon::RestoreCapture {
                            slot,
                            offset: curr_slots[slot.as_usize()],
                        });
                        // OK because length of a slice must fit into an isize.
                        curr_slots[slot.as_usize()] =
                            Some(NonMaxUsize::new(at).unwrap());
                    }
                    sid = target;
                }
            }
        }
    }
437 | } |
438 | |
/// An iterator over all successive non-overlapping matches in a particular
/// haystack. `'r` represents the lifetime of the regex (and of the cache
/// guard borrowed alongside it) and `'h` represents the lifetime of the
/// haystack.
#[derive (Debug)]
pub(crate) struct FindMatches<'r, 'h> {
    /// The PikeVM that executes each search.
    pikevm: &'r PikeVM,
    /// Guarded mutable scratch state used by the PikeVM during each search.
    cache: CachePoolGuard<'r>,
    /// The haystack being searched.
    haystack: &'h [u8],
    /// The byte offset at which the next search starts.
    at: usize,
    /// Capture slots written by each search. The length determines how many
    /// group spans are tracked; the first two slots always hold the overall
    /// match span.
    slots: Vec<Option<NonMaxUsize>>,
    /// The end offset of the most recently yielded match. Used to detect
    /// empty matches that would overlap the previous match.
    last_match_end: Option<usize>,
}
451 | |
452 | impl<'r, 'h> Iterator for FindMatches<'r, 'h> { |
453 | type Item = (usize, usize); |
454 | |
455 | fn next(&mut self) -> Option<(usize, usize)> { |
456 | if !self.pikevm.search( |
457 | &mut self.cache, |
458 | self.haystack, |
459 | self.at, |
460 | self.haystack.len(), |
461 | earliest:false, |
462 | &mut self.slots, |
463 | ) { |
464 | return None; |
465 | } |
466 | let mut m: (usize, usize) = |
467 | (self.slots[0].unwrap().get(), self.slots[1].unwrap().get()); |
468 | if m.0 >= m.1 { |
469 | m = self.handle_overlapping_empty_match(m)?; |
470 | } |
471 | self.at = m.1; |
472 | self.last_match_end = Some(m.1); |
473 | Some(m) |
474 | } |
475 | } |
476 | |
impl<'r, 'h> FindMatches<'r, 'h> {
    /// Handles the special case of an empty match by ensuring that 1) the
    /// iterator always advances and 2) empty matches never overlap with other
    /// matches.
    ///
    /// Note that we mark this cold and forcefully prevent inlining because
    /// handling empty matches like this is extremely rare and does require a
    /// bit of code, comparatively. Keeping this code out of the main iterator
    /// function keeps it smaller and more amenable to inlining itself.
    #[cold ]
    #[inline (never)]
    fn handle_overlapping_empty_match(
        &mut self,
        mut m: (usize, usize),
    ) -> Option<(usize, usize)> {
        // Only empty matches (start >= end) should be routed here.
        assert!(m.0 >= m.1);
        if Some(m.1) == self.last_match_end {
            // This empty match ends exactly where the previous match ended,
            // i.e., it overlaps. Advance past the character at `self.at`
            // (with a floor of one byte, so we always make progress) and
            // run the search again from the new position.
            let len =
                core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1);
            self.at = self.at.checked_add(len).unwrap();
            if !self.pikevm.search(
                &mut self.cache,
                self.haystack,
                self.at,
                self.haystack.len(),
                false,
                &mut self.slots,
            ) {
                return None;
            }
            // Re-read the overall match span produced by the new search.
            m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
        }
        Some(m)
    }
}
512 | |
/// An iterator over all successive non-overlapping capture matches in a
/// particular haystack. `'r` represents the lifetime of the regex (and of
/// the cache guard borrowed alongside it) and `'h` represents the lifetime
/// of the haystack.
#[derive (Debug)]
pub(crate) struct CapturesMatches<'r, 'h> {
    /// The underlying match iterator, configured with enough slots to hold
    /// the span of every capture group.
    it: FindMatches<'r, 'h>,
}
520 | |
521 | impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { |
522 | type Item = Vec<Option<NonMaxUsize>>; |
523 | |
524 | fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> { |
525 | self.it.next()?; |
526 | Some(self.it.slots.clone()) |
527 | } |
528 | } |
529 | |
/// A cache represents mutable state that a `PikeVM` requires during a search.
///
/// For a given `PikeVM`, its corresponding cache may be created either via
/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every
/// way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the `PikeVM` from which it was
/// created. It may only be used with that `PikeVM`. A cache and its
/// allocations may be re-purposed via `Cache::reset`, in which case, it can
/// only be used with the new `PikeVM` (and not the old one).
#[derive (Clone, Debug)]
pub(crate) struct Cache {
    /// Stack used while computing epsilon closure. This effectively lets us
    /// move what is more naturally expressed through recursion to a stack
    /// on the heap.
    stack: Vec<FollowEpsilon>,
    /// The current active states being explored for the current byte in the
    /// haystack.
    curr: ActiveStates,
    /// The next set of states we're building that will be explored for the
    /// next byte in the haystack. Swapped with `curr` after each position.
    next: ActiveStates,
}
553 | |
554 | impl Cache { |
555 | /// Create a new `PikeVM` cache. |
556 | /// |
557 | /// A potentially more convenient routine to create a cache is |
558 | /// `PikeVM::create_cache`, as it does not require also importing the |
559 | /// `Cache` type. |
560 | /// |
561 | /// If you want to reuse the returned `Cache` with some other `PikeVM`, |
562 | /// then you must call `Cache::reset` with the desired `PikeVM`. |
563 | pub(crate) fn new(re: &PikeVM) -> Cache { |
564 | Cache { |
565 | stack: vec![], |
566 | curr: ActiveStates::new(re), |
567 | next: ActiveStates::new(re), |
568 | } |
569 | } |
570 | |
571 | /// Clears this cache. This should be called at the start of every search |
572 | /// to ensure we start with a clean slate. |
573 | /// |
574 | /// This also sets the length of the capturing groups used in the current |
575 | /// search. This permits an optimization where by 'SlotTable::for_state' |
576 | /// only returns the number of slots equivalent to the number of slots |
577 | /// given in the 'Captures' value. This may be less than the total number |
578 | /// of possible slots, e.g., when one only wants to track overall match |
579 | /// offsets. This in turn permits less copying of capturing group spans |
580 | /// in the PikeVM. |
581 | fn setup_search(&mut self, captures_slot_len: usize) { |
582 | self.stack.clear(); |
583 | self.curr.setup_search(captures_slot_len); |
584 | self.next.setup_search(captures_slot_len); |
585 | } |
586 | } |
587 | |
/// A set of active states used to "simulate" the execution of an NFA via the
/// PikeVM.
///
/// There are two sets of these used during NFA simulation. One set corresponds
/// to the "current" set of states being traversed for the current position
/// in a haystack. The other set corresponds to the "next" set of states being
/// built, which will become the new "current" set for the next position in the
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
///
/// In addition to representing a set of NFA states, this also maintains slot
/// values for each state. These slot values are what turn the NFA simulation
/// into the "Pike VM." Namely, they track capturing group values for each
/// state. During the computation of epsilon closure, we copy slot values from
/// states in the "current" set to the "next" set. Eventually, once a match
/// is found, the slot values for that match state are what we write to the
/// caller provided slots.
#[derive (Clone, Debug)]
struct ActiveStates {
    /// The set of active NFA states. This set preserves insertion order, which
    /// is critical for simulating the match semantics of backtracking regex
    /// engines.
    set: SparseSet,
    /// The slots for every NFA state, where each slot stores a (possibly
    /// absent) offset. Every capturing group has two slots. One for a start
    /// offset and one for an end offset.
    slot_table: SlotTable,
}
616 | |
617 | impl ActiveStates { |
618 | /// Create a new set of active states for the given PikeVM. The active |
619 | /// states returned may only be used with the given PikeVM. (Use 'reset' |
620 | /// to re-purpose the allocation for a different PikeVM.) |
621 | fn new(re: &PikeVM) -> ActiveStates { |
622 | let mut active = ActiveStates { |
623 | set: SparseSet::new(0), |
624 | slot_table: SlotTable::new(), |
625 | }; |
626 | active.reset(re); |
627 | active |
628 | } |
629 | |
630 | /// Reset this set of active states such that it can be used with the given |
631 | /// PikeVM (and only that PikeVM). |
632 | fn reset(&mut self, re: &PikeVM) { |
633 | self.set.resize(re.nfa().len()); |
634 | self.slot_table.reset(re); |
635 | } |
636 | |
637 | /// Setup this set of active states for a new search. The given slot |
638 | /// length should be the number of slots in a caller provided 'Captures' |
639 | /// (and may be zero). |
640 | fn setup_search(&mut self, captures_slot_len: usize) { |
641 | self.set.clear(); |
642 | self.slot_table.setup_search(captures_slot_len); |
643 | } |
644 | } |
645 | |
/// A table of slots, where each row represent a state in an NFA. Thus, the
/// table has room for storing slots for every single state in an NFA.
///
/// This table is represented with a single contiguous allocation. In general,
/// the notion of "capturing group" doesn't really exist at this level of
/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
/// maps to a pair of slots, one for the start offset and one for the end
/// offset.) Slots are indexed by the `Captures` NFA state.
#[derive (Clone, Debug)]
struct SlotTable {
    /// The actual table of offsets. Laid out row-major: the row for state
    /// `sid` starts at `sid * slots_per_state`.
    table: Vec<Option<NonMaxUsize>>,
    /// The number of slots per state, i.e., the table's stride or the length
    /// of each row.
    slots_per_state: usize,
    /// The number of slots in the caller-provided `Captures` value for the
    /// current search. Setting this to `slots_per_state` is always correct,
    /// but may be wasteful.
    slots_for_captures: usize,
}
666 | |
667 | impl SlotTable { |
668 | /// Create a new slot table. |
669 | /// |
670 | /// One should call 'reset' with the corresponding PikeVM before use. |
671 | fn new() -> SlotTable { |
672 | SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } |
673 | } |
674 | |
675 | /// Reset this slot table such that it can be used with the given PikeVM |
676 | /// (and only that PikeVM). |
677 | fn reset(&mut self, re: &PikeVM) { |
678 | let nfa = re.nfa(); |
679 | // OK because NFA construction would have failed if this overflowed. |
680 | self.slots_per_state = nfa.group_len().checked_mul(2).unwrap(); |
681 | // This is always correct, but may be reduced for a particular search |
682 | // if fewer slots were given by the caller, e.g., none at all or only |
683 | // slots for tracking the overall match instead of all slots for every |
684 | // group. |
685 | self.slots_for_captures = self.slots_per_state; |
686 | let len = nfa |
687 | .len() |
688 | // We add 1 so that our last row is always empty. We use it as |
689 | // "scratch" space for computing the epsilon closure off of the |
690 | // starting state. |
691 | .checked_add(1) |
692 | .and_then(|x| x.checked_mul(self.slots_per_state)) |
693 | // It seems like this could actually panic on legitimate inputs |
694 | // on 32-bit targets. Should we somehow convert this to an error? |
695 | // What about something similar for the lazy DFA cache? If you're |
696 | // tripping this assert, please file a bug. |
697 | .expect("slot table length doesn't overflow" ); |
698 | self.table.resize(len, None); |
699 | } |
700 | |
701 | /// Perform any per-search setup for this slot table. |
702 | /// |
703 | /// In particular, this sets the length of the number of slots used in the |
704 | /// slots given by the caller (if any at all). This number may be smaller |
705 | /// than the total number of slots available, e.g., when the caller is only |
706 | /// interested in tracking the overall match and not the spans of every |
707 | /// matching capturing group. Only tracking the overall match can save a |
708 | /// substantial amount of time copying capturing spans during a search. |
709 | fn setup_search(&mut self, captures_slot_len: usize) { |
710 | self.slots_for_captures = captures_slot_len; |
711 | } |
712 | |
713 | /// Return a mutable slice of the slots for the given state. |
714 | /// |
715 | /// Note that the length of the slice returned may be less than the total |
716 | /// number of slots available for this state. In particular, the length |
717 | /// always matches the number of slots indicated via `setup_search`. |
718 | fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { |
719 | let i = sid.as_usize() * self.slots_per_state; |
720 | &mut self.table[i..i + self.slots_for_captures] |
721 | } |
722 | |
723 | /// Return a slice of slots of appropriate length where every slot offset |
724 | /// is guaranteed to be absent. This is useful in cases where you need to |
725 | /// compute an epsilon closure outside of the user supplied regex, and thus |
726 | /// never want it to have any capturing slots set. |
727 | fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { |
728 | let i = self.table.len() - self.slots_per_state; |
729 | &mut self.table[i..i + self.slots_for_captures] |
730 | } |
731 | } |
732 | |
/// Represents a stack frame for use while computing an epsilon closure.
///
/// (An "epsilon closure" refers to the set of reachable NFA states from a
/// single state without consuming any input. That is, the set of all epsilon
/// transitions not only from that single state, but from every other state
/// reachable by an epsilon transition as well. This is why it's called a
/// "closure.")
///
/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
/// first traversal over all epsilon transitions from a particular state.
/// (A depth first traversal is important because it emulates the same priority
/// of matches that is typically found in backtracking regex engines.) This
/// depth first traversal is naturally expressed using recursion, but to avoid
/// a call stack size proportional to the size of a regex, we put our stack on
/// the heap instead.
///
/// This stack thus consists of call frames. The typical call frame is
/// `Explore`, which instructs epsilon closure to explore the epsilon
/// transitions from that state. (Subsequent epsilon transitions are then
/// pushed on to the stack as more `Explore` frames.) If the state ID being
/// explored has no epsilon transitions, then the capturing group slots are
/// copied from the original state that sparked the epsilon closure (from the
/// 'next' routine) to the state ID being explored. This way, capturing group
/// slots are forwarded from the previous state to the next.
///
/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
/// set the position for a particular slot back to some particular offset. This
/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
/// set the offset of the slot indicated in `Capture` to the current offset,
/// and then push the old offset on to the stack as a `RestoreCapture` frame.
/// Thus, the new offset is only used until the epsilon closure reverts back to
/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
/// transition its "scope" to only states that come "after" it during depth
/// first traversal.
#[derive (Clone, Debug)]
enum FollowEpsilon {
    /// Explore the epsilon transitions from a state ID.
    Explore(StateID),
    /// Reset the given `slot` to the given `offset` (which might be `None`).
    RestoreCapture { slot: u32, offset: Option<NonMaxUsize> },
}
774 | |
775 | /// A sparse set used for representing ordered NFA states. |
776 | /// |
777 | /// This supports constant time addition and membership testing. Clearing an |
778 | /// entire set can also be done in constant time. Iteration yields elements |
779 | /// in the order in which they were inserted. |
780 | /// |
781 | /// The data structure is based on: https://research.swtch.com/sparse |
782 | /// Note though that we don't actually use uninitialized memory. We generally |
783 | /// reuse sparse sets, so the initial allocation cost is bareable. However, its |
784 | /// other properties listed above are extremely useful. |
785 | #[derive (Clone)] |
786 | struct SparseSet { |
787 | /// The number of elements currently in this set. |
788 | len: usize, |
789 | /// Dense contains the ids in the order in which they were inserted. |
790 | dense: Vec<StateID>, |
791 | /// Sparse maps ids to their location in dense. |
792 | /// |
793 | /// A state ID is in the set if and only if |
794 | /// sparse[id] < len && id == dense[sparse[id]]. |
795 | /// |
796 | /// Note that these are indices into 'dense'. It's a little weird to use |
797 | /// StateID here, but we know our length can never exceed the bounds of |
798 | /// StateID (enforced by 'resize') and StateID will be at most 4 bytes |
799 | /// where as a usize is likely double that in most cases. |
800 | sparse: Vec<StateID>, |
801 | } |
802 | |
803 | impl SparseSet { |
804 | /// Create a new sparse set with the given capacity. |
805 | /// |
806 | /// Sparse sets have a fixed size and they cannot grow. Attempting to |
807 | /// insert more distinct elements than the total capacity of the set will |
808 | /// result in a panic. |
809 | /// |
810 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
811 | fn new(capacity: usize) -> SparseSet { |
812 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |
813 | set.resize(capacity); |
814 | set |
815 | } |
816 | |
817 | /// Resizes this sparse set to have the new capacity given. |
818 | /// |
819 | /// This set is automatically cleared. |
820 | /// |
821 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
822 | fn resize(&mut self, new_capacity: usize) { |
823 | assert!( |
824 | new_capacity <= u32::MAX.as_usize(), |
825 | "sparse set capacity cannot excced {:?}" , |
826 | u32::MAX, |
827 | ); |
828 | self.clear(); |
829 | self.dense.resize(new_capacity, 0); |
830 | self.sparse.resize(new_capacity, 0); |
831 | } |
832 | |
833 | /// Returns the capacity of this set. |
834 | /// |
835 | /// The capacity represents a fixed limit on the number of distinct |
836 | /// elements that are allowed in this set. The capacity cannot be changed. |
837 | fn capacity(&self) -> usize { |
838 | self.dense.len() |
839 | } |
840 | |
841 | /// Returns the number of elements in this set. |
842 | fn len(&self) -> usize { |
843 | self.len |
844 | } |
845 | |
846 | /// Returns true if and only if this set is empty. |
847 | fn is_empty(&self) -> bool { |
848 | self.len() == 0 |
849 | } |
850 | |
851 | /// Insert the state ID value into this set and return true if the given |
852 | /// state ID was not previously in this set. |
853 | /// |
854 | /// This operation is idempotent. If the given value is already in this |
855 | /// set, then this is a no-op. |
856 | /// |
857 | /// If more than `capacity` ids are inserted, then this panics. |
858 | fn insert(&mut self, id: StateID) -> bool { |
859 | if self.contains(id) { |
860 | return false; |
861 | } |
862 | |
863 | let index = self.len(); |
864 | assert!( |
865 | index < self.capacity(), |
866 | " {:?} exceeds capacity of {:?} when inserting {:?}" , |
867 | index, |
868 | self.capacity(), |
869 | id, |
870 | ); |
871 | self.dense[index] = id; |
872 | // OK because we don't permit the capacity to be set higher than |
873 | // u32::MAX. |
874 | self.sparse[id.as_usize()] = u32::try_from(index).unwrap(); |
875 | self.len += 1; |
876 | true |
877 | } |
878 | |
879 | /// Returns true if and only if this set contains the given value. |
880 | fn contains(&self, id: StateID) -> bool { |
881 | let index = self.sparse[id.as_usize()]; |
882 | index.as_usize() < self.len() && self.dense[index.as_usize()] == id |
883 | } |
884 | |
885 | /// Clear this set such that it has no members. |
886 | fn clear(&mut self) { |
887 | self.len = 0; |
888 | } |
889 | |
890 | /// Returns an iterator over all the state IDs in this set in the order in |
891 | /// which they were inserted. |
892 | fn iter(&self) -> SparseSetIter<'_> { |
893 | SparseSetIter(self.dense[..self.len()].iter()) |
894 | } |
895 | } |
896 | |
897 | impl core::fmt::Debug for SparseSet { |
898 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
899 | let elements: Vec<StateID> = self.iter().collect(); |
900 | f.debug_tuple(name:"SparseSet" ).field(&elements).finish() |
901 | } |
902 | } |
903 | |
904 | /// An iterator over all elements in a sparse set. |
905 | /// |
906 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. |
907 | #[derive (Debug)] |
908 | struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); |
909 | |
910 | impl<'a> Iterator for SparseSetIter<'a> { |
911 | type Item = StateID; |
912 | |
913 | fn next(&mut self) -> Option<StateID> { |
914 | self.0.next().map(|&id: u32| id) |
915 | } |
916 | } |
917 | |