1/*!
2The DFA matching engine.
3
4A DFA provides faster matching because the engine is in exactly one state at
5any point in time. In the NFA, there may be multiple active states, and
6considerable CPU cycles are spent shuffling them around. In finite automata
7speak, the DFA follows epsilon transitions in the regex far less than the NFA.
8
9A DFA is a classic trade off between time and space. The NFA is slower, but
10its memory requirements are typically small and predictable. The DFA is faster,
11but given the right regex and the right input, the number of states in the
12DFA can grow exponentially. To mitigate this space problem, we do two things:
13
141. We implement an *online* DFA. That is, the DFA is constructed from the NFA
15 during a search. When a new state is computed, it is stored in a cache so
16 that it may be reused. An important consequence of this implementation
17 is that states that are never reached for a particular input are never
18 computed. (This is impossible in an "offline" DFA which needs to compute
19 all possible states up front.)
202. If the cache gets too big, we wipe it and continue matching.
21
22In pathological cases, a new state can be created for every byte of input.
23(e.g., The regex `(a|b)*a(a|b){20}` on a long sequence of a's and b's.)
24In this case, performance regresses to slightly slower than the full NFA
25simulation, in large part because the cache becomes useless. If the cache
26is wiped too frequently, the DFA quits and control falls back to one of the
27NFA simulations.
28
29Because of the "lazy" nature of this DFA, the inner matching loop is
30considerably more complex than one might expect out of a DFA. A number of
31tricks are employed to make it fast. Tread carefully.
32
33N.B. While this implementation is heavily commented, Russ Cox's series of
34articles on regexes is strongly recommended: <https://swtch.com/~rsc/regexp/>
35(As is the DFA implementation in RE2, which heavily influenced this
36implementation.)
37*/
38
39use std::collections::HashMap;
40use std::fmt;
41use std::iter::repeat;
42use std::mem;
43use std::sync::Arc;
44
45use crate::exec::ProgramCache;
46use crate::prog::{Inst, Program};
47use crate::sparse::SparseSet;
48
49/// Return true if and only if the given program can be executed by a DFA.
50///
51/// Generally, a DFA is always possible. A pathological case where it is not
52/// possible is if the number of NFA states exceeds `u32::MAX`, in which case,
53/// this function will return false.
54///
55/// This function will also return false if the given program has any Unicode
56/// instructions (Char or Ranges) since the DFA operates on bytes only.
57pub fn can_exec(insts: &Program) -> bool {
58 use crate::prog::Inst::*;
59 // If for some reason we manage to allocate a regex program with more
60 // than i32::MAX instructions, then we can't execute the DFA because we
61 // use 32 bit instruction pointer deltas for memory savings.
62 // If i32::MAX is the largest positive delta,
63 // then -i32::MAX == i32::MIN + 1 is the largest negative delta,
64 // and we are OK to use 32 bits.
65 if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
66 return false;
67 }
68 for inst: &Inst in insts {
69 match *inst {
70 Char(_) | Ranges(_) => return false,
71 EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
72 }
73 }
74 true
75}
76
/// A reusable cache of DFA states.
///
/// This cache is reused between multiple invocations of the same regex
/// program. (It is not shared simultaneously between threads. If there is
/// contention, then new caches are created.)
#[derive(Debug)]
pub struct Cache {
    /// Group persistent DFA related cache state together. The sparse sets
    /// listed below are used as scratch space while computing uncached states.
    inner: CacheInner,
    /// qcur and qnext are ordered sets with constant time
    /// addition/membership/clearing-whole-set and linear time iteration. They
    /// are used to manage the sets of NFA states in DFA states when computing
    /// cached DFA states. In particular, the order of the NFA states matters
    /// for leftmost-first style matching. Namely, when computing a cached
    /// state, the set of NFA states stops growing as soon as the first Match
    /// instruction is observed.
    ///
    /// They live outside of `inner` so that they can be mutably borrowed at
    /// the same time as `inner` (see e.g. `Fsm::forward`).
    qcur: SparseSet,
    qnext: SparseSet,
}
97
/// `CacheInner` is logically just a part of Cache, but groups together fields
/// that aren't passed as function parameters throughout search. (This split
/// is mostly an artifact of the borrow checker. It is happily paid.)
#[derive(Debug)]
struct CacheInner {
    /// A cache of pre-compiled DFA states, keyed by the set of NFA states
    /// and the set of empty-width flags set at the byte in the input when the
    /// state was observed.
    ///
    /// A StatePtr is effectively a `*State`, but to avoid various inconvenient
    /// things, we just pass indexes around manually. The performance impact of
    /// this is probably an instruction or two in the inner loop. However, on
    /// 64 bit, each StatePtr is half the size of a *State.
    compiled: StateMap,
    /// The transition table.
    ///
    /// The transition table is laid out in row-major order, where states are
    /// rows and the transitions for each state are columns. At a high level,
    /// given state `s` and byte `b`, the next state can be found at index
    /// `s * 256 + b`.
    ///
    /// This is, of course, a lie. A StatePtr is actually a pointer to the
    /// *start* of a row in this table. When indexing in the DFA's inner loop,
    /// this removes the need to multiply the StatePtr by the stride. Yes, it
    /// matters. This reduces the number of states we can store, but: the
    /// stride is rarely 256 since we define transitions in terms of
    /// *equivalence classes* of bytes. Each class corresponds to a set of
    /// bytes that never discriminate a distinct path through the DFA from each
    /// other.
    trans: Transitions,
    /// A set of cached start states, which are limited to the number of
    /// permutations of flags set just before the initial byte of input. (The
    /// index into this vec is a `EmptyFlags`.)
    ///
    /// N.B. A start state can be "dead" (i.e., no possible match), so we
    /// represent it with a StatePtr.
    start_states: Vec<StatePtr>,
    /// Stack scratch space used to follow epsilon transitions in the NFA.
    /// (This permits us to avoid recursion.)
    ///
    /// The maximum stack size is the number of NFA states.
    stack: Vec<InstPtr>,
    /// The total number of times this cache has been flushed by the DFA
    /// because of space constraints.
    flush_count: u64,
    /// The total heap size of the DFA's cache. We use this to determine when
    /// we should flush the cache.
    ///
    /// (Presumably compared against the program's `dfa_size_limit`, which
    /// `can_exec` requires to be non-zero — confirm against the flush logic.)
    size: usize,
    /// Scratch space used when building instruction pointer lists for new
    /// states. This helps amortize allocation.
    insts_scratch_space: Vec<u8>,
}
150
/// The transition table.
///
/// It is laid out in row-major order, with states as rows and byte class
/// transitions as columns.
///
/// The transition table is responsible for producing valid `StatePtrs`. A
/// `StatePtr` points to the start of a particular row in this table. When
/// indexing to find the next state this allows us to avoid a multiplication
/// when computing an index into the table.
#[derive(Clone)]
struct Transitions {
    /// The table.
    table: Vec<StatePtr>,
    /// The stride, i.e., the number of columns per row.
    ///
    /// (This count includes the special EOF pseudo-class; see the `+ 1` in
    /// `Cache::new`.)
    num_byte_classes: usize,
}
167
/// Fsm encapsulates the actual execution of the DFA.
#[derive(Debug)]
pub struct Fsm<'a> {
    /// prog contains the NFA instruction opcodes. DFA execution uses either
    /// the `dfa` instructions or the `dfa_reverse` instructions from
    /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
    /// Unicode opcodes that cannot be executed by the DFA.)
    prog: &'a Program,
    /// The start state. We record it here because the pointer may change
    /// when the cache is wiped.
    start: StatePtr,
    /// The current position in the input.
    ///
    /// (Updated during search so that cache-flush bookkeeping can measure
    /// progress; see `last_cache_flush`.)
    at: usize,
    /// Should we quit after seeing the first match? e.g., When the caller
    /// uses `is_match` or `shortest_match`.
    quit_after_match: bool,
    /// The last state that matched.
    ///
    /// When no match has occurred, this is set to STATE_UNKNOWN.
    ///
    /// This is only useful when matching regex sets. The last match state
    /// is useful because it contains all of the match instructions seen,
    /// thereby allowing us to enumerate which regexes in the set matched.
    last_match_si: StatePtr,
    /// The input position of the last cache flush. We use this to determine
    /// if we're thrashing in the cache too often. If so, the DFA quits so
    /// that we can fall back to the NFA algorithm.
    last_cache_flush: usize,
    /// All cached DFA information that is persisted between searches.
    cache: &'a mut CacheInner,
}
199
/// The result of running the DFA.
///
/// Generally, the result is either a match or not a match, but sometimes the
/// DFA runs too slowly because the cache size is too small. In that case, it
/// gives up with the intent of falling back to the NFA algorithm.
///
/// The DFA can also give up if it runs out of room to create new states, or if
/// it sees non-ASCII bytes in the presence of a Unicode word boundary.
#[derive(Clone, Debug)]
pub enum Result<T> {
    /// A match was found, with its associated payload (typically an offset).
    Match(T),
    /// No match was found; the `usize` is the position where the search
    /// ended.
    NoMatch(usize),
    /// The DFA gave up; the caller should fall back to an NFA engine.
    Quit,
}
214
215impl<T> Result<T> {
216 /// Returns true if this result corresponds to a match.
217 pub fn is_match(&self) -> bool {
218 match *self {
219 Result::Match(_) => true,
220 Result::NoMatch(_) | Result::Quit => false,
221 }
222 }
223
224 /// Maps the given function onto T and returns the result.
225 ///
226 /// If this isn't a match, then this is a no-op.
227 #[cfg(feature = "perf-literal")]
228 pub fn map<U, F: FnMut(T) -> U>(self, mut f: F) -> Result<U> {
229 match self {
230 Result::Match(t) => Result::Match(f(t)),
231 Result::NoMatch(x) => Result::NoMatch(x),
232 Result::Quit => Result::Quit,
233 }
234 }
235
236 /// Sets the non-match position.
237 ///
238 /// If this isn't a non-match, then this is a no-op.
239 fn set_non_match(self, at: usize) -> Result<T> {
240 match self {
241 Result::NoMatch(_) => Result::NoMatch(at),
242 r => r,
243 }
244 }
245}
246
/// `State` is a DFA state. It contains an ordered set of NFA states (not
/// necessarily complete) and a smattering of flags.
///
/// The flags are packed into the first byte of data.
///
/// States don't carry their transitions. Instead, transitions are stored in
/// a single row-major table.
///
/// Delta encoding is used to store the instruction pointers.
/// The first instruction pointer is stored directly starting
/// at data[1], and each following pointer is stored as an offset
/// to the previous one. If a delta is in the range -127..127,
/// it is packed into a single byte; otherwise the byte 128 (-128 as an i8)
/// is coded as a flag, followed by 4 bytes encoding the delta.
#[derive(Clone, Eq, Hash, PartialEq)]
struct State {
    // data[0] is the flag byte; data[1..] is the delta-encoded list of
    // NFA instruction pointers. `Arc<[u8]>` makes clones cheap (refcount
    // bump only).
    data: Arc<[u8]>,
}
265
/// `InstPtr` is a 32 bit pointer into a sequence of opcodes (i.e., it indexes
/// an NFA state).
///
/// Throughout this library, this is usually set to `usize`, but we force a
/// `u32` here for the DFA to save on space.
///
/// (Programs are capped at `i32::MAX` instructions — see `can_exec` — so
/// deltas between instruction pointers always fit in an `i32`.)
type InstPtr = u32;
272
273/// Adds ip to data using delta encoding with respect to prev.
274///
275/// After completion, `data` will contain `ip` and `prev` will be set to `ip`.
276fn push_inst_ptr(data: &mut Vec<u8>, prev: &mut InstPtr, ip: InstPtr) {
277 let delta: i32 = (ip as i32) - (*prev as i32);
278 write_vari32(data, n:delta);
279 *prev = ip;
280}
281
/// An iterator over the delta-encoded NFA instruction pointers stored in a
/// `State`'s data (see `State::inst_ptrs`).
struct InstPtrs<'a> {
    /// The most recently decoded instruction pointer, which the next delta
    /// is applied to.
    base: usize,
    /// The remaining, not-yet-decoded delta bytes.
    data: &'a [u8],
}
286
287impl<'a> Iterator for InstPtrs<'a> {
288 type Item = usize;
289
290 fn next(&mut self) -> Option<usize> {
291 if self.data.is_empty() {
292 return None;
293 }
294 let (delta: i32, nread: usize) = read_vari32(self.data);
295 let base: i32 = self.base as i32 + delta;
296 debug_assert!(base >= 0);
297 debug_assert!(nread > 0);
298 self.data = &self.data[nread..];
299 self.base = base as usize;
300 Some(self.base)
301 }
302}
303
impl State {
    /// Returns the flags packed into the first byte of this state's data.
    fn flags(&self) -> StateFlags {
        StateFlags(self.data[0])
    }

    /// Returns an iterator over the NFA instruction pointers stored
    /// (delta-encoded) after the flag byte.
    fn inst_ptrs(&self) -> InstPtrs<'_> {
        InstPtrs { base: 0, data: &self.data[1..] }
    }
}
313
/// `StatePtr` is a 32 bit pointer to the start of a row in the transition
/// table.
///
/// It has many special values. There are two types of special values:
/// sentinels and flags.
///
/// Sentinels correspond to special states that carry some kind of
/// significance. There are three such states: unknown, dead and quit states.
///
/// Unknown states are states that haven't been computed yet. They indicate
/// that a transition should be filled in that points to either an existing
/// cached state or a new state altogether. In general, an unknown state means
/// "follow the NFA's epsilon transitions."
///
/// Dead states are states that can never lead to a match, no matter what
/// subsequent input is observed. This means that the DFA should quit
/// immediately and return the longest match it has found thus far.
///
/// Quit states are states that imply the DFA is not capable of matching the
/// regex correctly. Currently, this is only used when a Unicode word boundary
/// exists in the regex *and* a non-ASCII byte is observed.
///
/// The other type of state pointer is a state pointer with special flag bits.
/// There are two flags: a start flag and a match flag. The lower bits of both
/// kinds always contain a "valid" `StatePtr` (indicated by the `STATE_MAX`
/// mask).
///
/// The start flag means that the state is a start state, and therefore may be
/// subject to special prefix scanning optimizations.
///
/// The match flag means that the state is a match state, and therefore the
/// current position in the input (while searching) should be recorded.
///
/// The above exists mostly in the service of making the inner loop fast.
/// In particular, the inner *inner* loop looks something like this:
///
/// ```ignore
/// while state <= STATE_MAX and i < len(text):
///     state = state.next[i]
/// ```
///
/// This is nice because it lets us execute a lazy DFA as if it were an
/// entirely offline DFA (i.e., with very few instructions). The loop will
/// quit only when we need to examine a case that needs special attention.
type StatePtr = u32;
359
/// An unknown state means that the state has not been computed yet, and that
/// the only way to progress is to compute it.
const STATE_UNKNOWN: StatePtr = 1 << 31;

/// A dead state means that the state has been computed and it is known that
/// once it is entered, no future match can ever occur.
const STATE_DEAD: StatePtr = STATE_UNKNOWN + 1;

/// A quit state means that the DFA came across some input that it doesn't
/// know how to process correctly. The DFA should quit and another matching
/// engine should be run in its place.
const STATE_QUIT: StatePtr = STATE_DEAD + 1;

/// A start state is a state that the DFA can start in.
///
/// Note that start states have their lower bits set to a state pointer.
const STATE_START: StatePtr = 1 << 30;

/// A match state means that the regex has successfully matched.
///
/// Note that match states have their lower bits set to a state pointer.
const STATE_MATCH: StatePtr = 1 << 29;

/// The maximum state pointer. This is useful to mask out the "valid" state
/// pointer from a state with the "start" or "match" bits set.
///
/// It doesn't make sense to use this with unknown, dead or quit state
/// pointers, since those pointers are sentinels and never have their lower
/// bits set to anything meaningful.
///
/// All special pointers (sentinels and flagged pointers) compare greater
/// than STATE_MAX, which is what lets the inner loop test `si <= STATE_MAX`
/// to detect the common case.
const STATE_MAX: StatePtr = STATE_MATCH - 1;
390
/// Byte is a u8 in spirit, but a u16 in practice so that we can represent the
/// special EOF sentinel value.
///
/// (Constructed via `Byte::byte` for real input bytes and `Byte::eof` for
/// the end-of-input sentinel.)
#[derive(Copy, Clone, Debug)]
struct Byte(u16);
395
/// A set of flags for zero-width assertions.
///
/// Each flag records whether the corresponding empty-width assertion holds
/// at a position in the input. (An `EmptyFlags` value also serves as the
/// index into the cache of start states; see `CacheInner::start_states`.)
#[derive(Clone, Copy, Eq, Debug, Default, Hash, PartialEq)]
struct EmptyFlags {
    start: bool,
    end: bool,
    start_line: bool,
    end_line: bool,
    word_boundary: bool,
    not_word_boundary: bool,
}
406
/// A set of flags describing various configurations of a DFA state. This is
/// represented by a `u8` so that it is compact.
///
/// (This is the flag byte stored at `data[0]` of a `State`; see
/// `State::flags`.)
#[derive(Clone, Copy, Eq, Default, Hash, PartialEq)]
struct StateFlags(u8);
411
412impl Cache {
413 /// Create new empty cache for the DFA engine.
414 pub fn new(prog: &Program) -> Self {
415 // We add 1 to account for the special EOF byte.
416 let num_byte_classes: usize = (prog.byte_classes[255] as usize + 1) + 1;
417 let starts: Vec = vec![STATE_UNKNOWN; 256];
418 let mut cache: Cache = Cache {
419 inner: CacheInner {
420 compiled: StateMap::new(num_byte_classes),
421 trans: Transitions::new(num_byte_classes),
422 start_states: starts,
423 stack: vec![],
424 flush_count: 0,
425 size: 0,
426 insts_scratch_space: vec![],
427 },
428 qcur: SparseSet::new(size:prog.insts.len()),
429 qnext: SparseSet::new(size:prog.insts.len()),
430 };
431 cache.inner.reset_size();
432 cache
433 }
434}
435
436impl CacheInner {
437 /// Resets the cache size to account for fixed costs, such as the program
438 /// and stack sizes.
439 fn reset_size(&mut self) {
440 self.size = (self.start_states.len() * mem::size_of::<StatePtr>())
441 + (self.stack.len() * mem::size_of::<InstPtr>());
442 }
443}
444
445impl<'a> Fsm<'a> {
446 #[cfg_attr(feature = "perf-inline", inline(always))]
447 pub fn forward(
448 prog: &'a Program,
449 cache: &ProgramCache,
450 quit_after_match: bool,
451 text: &[u8],
452 at: usize,
453 ) -> Result<usize> {
454 let mut cache = cache.borrow_mut();
455 let cache = &mut cache.dfa;
456 let mut dfa = Fsm {
457 prog,
458 start: 0, // filled in below
459 at,
460 quit_after_match,
461 last_match_si: STATE_UNKNOWN,
462 last_cache_flush: at,
463 cache: &mut cache.inner,
464 };
465 let (empty_flags, state_flags) = dfa.start_flags(text, at);
466 dfa.start =
467 match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
468 None => return Result::Quit,
469 Some(STATE_DEAD) => return Result::NoMatch(at),
470 Some(si) => si,
471 };
472 debug_assert!(dfa.start != STATE_UNKNOWN);
473 dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text)
474 }
475
476 #[cfg_attr(feature = "perf-inline", inline(always))]
477 pub fn reverse(
478 prog: &'a Program,
479 cache: &ProgramCache,
480 quit_after_match: bool,
481 text: &[u8],
482 at: usize,
483 ) -> Result<usize> {
484 let mut cache = cache.borrow_mut();
485 let cache = &mut cache.dfa_reverse;
486 let mut dfa = Fsm {
487 prog,
488 start: 0, // filled in below
489 at,
490 quit_after_match,
491 last_match_si: STATE_UNKNOWN,
492 last_cache_flush: at,
493 cache: &mut cache.inner,
494 };
495 let (empty_flags, state_flags) = dfa.start_flags_reverse(text, at);
496 dfa.start =
497 match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
498 None => return Result::Quit,
499 Some(STATE_DEAD) => return Result::NoMatch(at),
500 Some(si) => si,
501 };
502 debug_assert!(dfa.start != STATE_UNKNOWN);
503 dfa.exec_at_reverse(&mut cache.qcur, &mut cache.qnext, text)
504 }
505
506 #[cfg_attr(feature = "perf-inline", inline(always))]
507 pub fn forward_many(
508 prog: &'a Program,
509 cache: &ProgramCache,
510 matches: &mut [bool],
511 text: &[u8],
512 at: usize,
513 ) -> Result<usize> {
514 debug_assert!(matches.len() == prog.matches.len());
515 let mut cache = cache.borrow_mut();
516 let cache = &mut cache.dfa;
517 let mut dfa = Fsm {
518 prog,
519 start: 0, // filled in below
520 at,
521 quit_after_match: false,
522 last_match_si: STATE_UNKNOWN,
523 last_cache_flush: at,
524 cache: &mut cache.inner,
525 };
526 let (empty_flags, state_flags) = dfa.start_flags(text, at);
527 dfa.start =
528 match dfa.start_state(&mut cache.qcur, empty_flags, state_flags) {
529 None => return Result::Quit,
530 Some(STATE_DEAD) => return Result::NoMatch(at),
531 Some(si) => si,
532 };
533 debug_assert!(dfa.start != STATE_UNKNOWN);
534 let result = dfa.exec_at(&mut cache.qcur, &mut cache.qnext, text);
535 if result.is_match() {
536 if matches.len() == 1 {
537 matches[0] = true;
538 } else {
539 debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
540 debug_assert!(dfa.last_match_si != STATE_DEAD);
541 for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
542 if let Inst::Match(slot) = dfa.prog[ip] {
543 matches[slot] = true;
544 }
545 }
546 }
547 }
548 result
549 }
550
    /// Executes the DFA on a forward NFA.
    ///
    /// {qcur,qnext} are scratch ordered sets which may be non-empty.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn exec_at(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        text: &[u8],
    ) -> Result<usize> {
        // For the most part, the DFA is basically:
        //
        //   last_match = null
        //   while current_byte != EOF:
        //     si = current_state.next[current_byte]
        //     if si is match
        //       last_match = si
        //   return last_match
        //
        // However, we need to deal with a few things:
        //
        // 1. This is an *online* DFA, so the current state's next list
        //    may not point to anywhere yet, so we must go out and compute
        //    them. (They are then cached into the current state's next list
        //    to avoid re-computation.)
        // 2. If we come across a state that is known to be dead (i.e., never
        //    leads to a match), then we can quit early.
        // 3. If the caller just wants to know if a match occurs, then we
        //    can quit as soon as we know we have a match. (Full leftmost
        //    first semantics require continuing on.)
        // 4. If we're in the start state, then we can use a pre-computed set
        //    of prefix literals to skip quickly along the input.
        // 5. After the input is exhausted, we run the DFA on one symbol
        //    that stands for EOF. This is useful for handling empty width
        //    assertions.
        // 6. We can't actually do state.next[byte]. Instead, we have to do
        //    state.next[byte_classes[byte]], which permits us to keep the
        //    'next' list very small.
        //
        // Since there's a bunch of extra stuff we need to consider, we do some
        // pretty hairy tricks to get the inner loop to run as fast as
        // possible.
        debug_assert!(!self.prog.is_reverse);

        // The last match is the currently known ending match position. It is
        // reported as an index to the most recent byte that resulted in a
        // transition to a match state and is always stored in capture slot `1`
        // when searching forwards. Its maximum value is `text.len()`.
        let mut result = Result::NoMatch(self.at);
        let (mut prev_si, mut next_si) = (self.start, self.start);
        let mut at = self.at;
        while at < text.len() {
            // This is the real inner loop. We take advantage of special bits
            // set in the state pointer to determine whether a state is in the
            // "common" case or not. Specifically, the common case is a
            // non-match non-start non-dead state that has already been
            // computed. So long as we remain in the common case, this inner
            // loop will chew through the input.
            //
            // We also unroll the loop 4 times to amortize the cost of checking
            // whether we've consumed the entire input. We are also careful
            // to make sure that `prev_si` always represents the previous state
            // and `next_si` always represents the next state after the loop
            // exits, even if it isn't always true inside the loop.
            while next_si <= STATE_MAX && at < text.len() {
                // Argument for safety is in the definition of next_si.
                prev_si = unsafe { self.next_si(next_si, text, at) };
                at += 1;
                if prev_si > STATE_MAX || at + 2 >= text.len() {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                next_si = unsafe { self.next_si(prev_si, text, at) };
                at += 1;
                if next_si > STATE_MAX {
                    break;
                }
                prev_si = unsafe { self.next_si(next_si, text, at) };
                at += 1;
                if prev_si > STATE_MAX {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                next_si = unsafe { self.next_si(prev_si, text, at) };
                at += 1;
            }
            if next_si & STATE_MATCH > 0 {
                // A match state is outside of the common case because it needs
                // special case analysis. In particular, we need to record the
                // last position as having matched and possibly quit the DFA if
                // we don't need to keep matching.
                next_si &= !STATE_MATCH;
                result = Result::Match(at - 1);
                if self.quit_after_match {
                    return result;
                }
                self.last_match_si = next_si;
                prev_si = next_si;

                // This permits short-circuiting when matching a regex set.
                // In particular, if this DFA state contains only match states,
                // then it's impossible to extend the set of matches since
                // match states are final. Therefore, we can quit.
                if self.prog.matches.len() > 1 {
                    let state = self.state(next_si);
                    let just_matches =
                        state.inst_ptrs().all(|ip| self.prog[ip].is_match());
                    if just_matches {
                        return result;
                    }
                }

                // Another inner loop! If the DFA stays in this particular
                // match state, then we can rip through all of the input
                // very quickly, and only recording the match location once
                // we've left this particular state.
                let cur = at;
                while (next_si & !STATE_MATCH) == prev_si
                    && at + 2 < text.len()
                {
                    // Argument for safety is in the definition of next_si.
                    next_si = unsafe {
                        self.next_si(next_si & !STATE_MATCH, text, at)
                    };
                    at += 1;
                }
                if at > cur {
                    result = Result::Match(at - 2);
                }
            } else if next_si & STATE_START > 0 {
                // A start state isn't in the common case because we may
                // want to do quick prefix scanning. If the program doesn't
                // have a detected prefix, then start states are actually
                // considered common and this case is never reached.
                debug_assert!(self.has_prefix());
                next_si &= !STATE_START;
                prev_si = next_si;
                at = match self.prefix_at(text, at) {
                    None => return Result::NoMatch(text.len()),
                    Some(i) => i,
                };
            } else if next_si >= STATE_UNKNOWN {
                if next_si == STATE_QUIT {
                    return Result::Quit;
                }
                // Finally, this corresponds to the case where the transition
                // entered a state that can never lead to a match or a state
                // that hasn't been computed yet. The latter being the "slow"
                // path.
                let byte = Byte::byte(text[at - 1]);
                // We no longer care about the special bits in the state
                // pointer.
                prev_si &= STATE_MAX;
                // Record where we are. This is used to track progress for
                // determining whether we should quit if we've flushed the
                // cache too much.
                self.at = at;
                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                    None => return Result::Quit,
                    Some(STATE_DEAD) => return result.set_non_match(at),
                    Some(si) => si,
                };
                debug_assert!(next_si != STATE_UNKNOWN);
                if next_si & STATE_MATCH > 0 {
                    next_si &= !STATE_MATCH;
                    result = Result::Match(at - 1);
                    if self.quit_after_match {
                        return result;
                    }
                    self.last_match_si = next_si;
                }
                prev_si = next_si;
            } else {
                // Common case: an ordinary computed state. Keep going.
                prev_si = next_si;
            }
        }

        // Run the DFA once more on the special EOF sentinel value.
        // We don't care about the special bits in the state pointer any more,
        // so get rid of them.
        prev_si &= STATE_MAX;
        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return result.set_non_match(text.len()),
            Some(si) => si & !STATE_START,
        };
        debug_assert!(prev_si != STATE_UNKNOWN);
        if prev_si & STATE_MATCH > 0 {
            prev_si &= !STATE_MATCH;
            self.last_match_si = prev_si;
            result = Result::Match(text.len());
        }
        result
    }
745
    /// Executes the DFA on a reverse NFA.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn exec_at_reverse(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        text: &[u8],
    ) -> Result<usize> {
        // The comments in `exec_at` above mostly apply here too. The main
        // difference is that we move backwards over the input and we look for
        // the longest possible match instead of the leftmost-first match.
        //
        // N.B. The code duplication here is regrettable. Efforts to improve
        // it without sacrificing performance are welcome. ---AG
        debug_assert!(self.prog.is_reverse);
        let mut result = Result::NoMatch(self.at);
        let (mut prev_si, mut next_si) = (self.start, self.start);
        let mut at = self.at;
        while at > 0 {
            // The unrolled inner loop, mirroring `exec_at` but stepping
            // backwards through `text` (decrement before the lookup).
            while next_si <= STATE_MAX && at > 0 {
                // Argument for safety is in the definition of next_si.
                at -= 1;
                prev_si = unsafe { self.next_si(next_si, text, at) };
                if prev_si > STATE_MAX || at <= 4 {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                at -= 1;
                next_si = unsafe { self.next_si(prev_si, text, at) };
                if next_si > STATE_MAX {
                    break;
                }
                at -= 1;
                prev_si = unsafe { self.next_si(next_si, text, at) };
                if prev_si > STATE_MAX {
                    mem::swap(&mut prev_si, &mut next_si);
                    break;
                }
                at -= 1;
                next_si = unsafe { self.next_si(prev_si, text, at) };
            }
            if next_si & STATE_MATCH > 0 {
                // Record the match position and possibly quit early; see the
                // corresponding case in `exec_at`.
                next_si &= !STATE_MATCH;
                result = Result::Match(at + 1);
                if self.quit_after_match {
                    return result;
                }
                self.last_match_si = next_si;
                prev_si = next_si;
                // If the DFA stays in this match state, rip through the
                // input and record the match location only on leaving it.
                let cur = at;
                while (next_si & !STATE_MATCH) == prev_si && at >= 2 {
                    // Argument for safety is in the definition of next_si.
                    at -= 1;
                    next_si = unsafe {
                        self.next_si(next_si & !STATE_MATCH, text, at)
                    };
                }
                if at < cur {
                    result = Result::Match(at + 2);
                }
            } else if next_si >= STATE_UNKNOWN {
                if next_si == STATE_QUIT {
                    return Result::Quit;
                }
                // A dead or not-yet-computed state: take the slow path and
                // compute the transition.
                let byte = Byte::byte(text[at]);
                prev_si &= STATE_MAX;
                // Record progress for cache-flush bookkeeping.
                self.at = at;
                next_si = match self.next_state(qcur, qnext, prev_si, byte) {
                    None => return Result::Quit,
                    Some(STATE_DEAD) => return result.set_non_match(at),
                    Some(si) => si,
                };
                debug_assert!(next_si != STATE_UNKNOWN);
                if next_si & STATE_MATCH > 0 {
                    next_si &= !STATE_MATCH;
                    result = Result::Match(at + 1);
                    if self.quit_after_match {
                        return result;
                    }
                    self.last_match_si = next_si;
                }
                prev_si = next_si;
            } else {
                // Common case: an ordinary computed state. Keep going.
                prev_si = next_si;
            }
        }

        // Run the DFA once more on the special EOF sentinel value.
        prev_si = match self.next_state(qcur, qnext, prev_si, Byte::eof()) {
            None => return Result::Quit,
            Some(STATE_DEAD) => return result.set_non_match(0),
            Some(si) => si,
        };
        debug_assert!(prev_si != STATE_UNKNOWN);
        if prev_si & STATE_MATCH > 0 {
            prev_si &= !STATE_MATCH;
            self.last_match_si = prev_si;
            result = Result::Match(0);
        }
        result
    }
847
    /// next_si transitions to the next state, where the transition input
    /// corresponds to text[i].
    ///
    /// This elides bounds checks, and is therefore not safe.
    ///
    /// # Safety
    ///
    /// Callers must guarantee `i < text.len()` and that `si` was obtained
    /// from the transition table (see the argument below).
    #[cfg_attr(feature = "perf-inline", inline(always))]
    unsafe fn next_si(&self, si: StatePtr, text: &[u8], i: usize) -> StatePtr {
        // What is the argument for safety here?
        // We have three unchecked accesses that could possibly violate safety:
        //
        // 1. The given byte of input (`text[i]`).
        // 2. The class of the byte of input (`classes[text[i]]`).
        // 3. The transition for the class (`trans[si + cls]`).
        //
        // (1) is only safe when calling next_si is guarded by
        // `i < text.len()`.
        //
        // (2) is the easiest case to guarantee since `text[i]` is always a
        // `u8` and `self.prog.byte_classes` always has length `u8::MAX`.
        // (See `ByteClassSet.byte_classes` in `compile.rs`.)
        //
        // (3) is only safe if (1)+(2) are safe. Namely, the transitions
        // of every state are defined to have length equal to the number of
        // byte classes in the program. Therefore, a valid class leads to a
        // valid transition. (All possible transitions are valid lookups, even
        // if it points to a state that hasn't been computed yet.) (3) also
        // relies on `si` being correct, but StatePtrs should only ever be
        // retrieved from the transition table, which ensures they are correct.
        debug_assert!(i < text.len());
        let b = *text.get_unchecked(i);
        debug_assert!((b as usize) < self.prog.byte_classes.len());
        let cls = *self.prog.byte_classes.get_unchecked(b as usize);
        self.cache.trans.next_unchecked(si, cls as usize)
    }
881
    /// Computes the next state given the current state and the current input
    /// byte (which may be EOF).
    ///
    /// If STATE_DEAD is returned, then there is no valid state transition.
    /// This implies that no permutation of future input can lead to a match
    /// state.
    ///
    /// STATE_UNKNOWN can never be returned.
    ///
    /// `qcur` and `qnext` are scratch space for ordered sets of NFA states.
    /// `None` is returned only if the DFA has decided to give up (e.g., its
    /// cache is being flushed too often to be worthwhile).
    fn exec_byte(
        &mut self,
        qcur: &mut SparseSet,
        qnext: &mut SparseSet,
        mut si: StatePtr,
        b: Byte,
    ) -> Option<StatePtr> {
        use crate::prog::Inst::*;

        // Initialize a queue with the current DFA state's NFA states.
        qcur.clear();
        for ip in self.state(si).inst_ptrs() {
            qcur.insert(ip);
        }

        // Before inspecting the current byte, we may need to also inspect
        // whether the position immediately preceding the current byte
        // satisfies the empty assertions found in the current state.
        //
        // We only need to do this step if there are any empty assertions in
        // the current state.
        let is_word_last = self.state(si).flags().is_word();
        let is_word = b.is_ascii_word();
        if self.state(si).flags().has_empty() {
            // Compute the flags immediately preceding the current byte.
            // This means we only care about the "end" or "end line" flags.
            // (The "start" flags are computed immediately following the
            // current byte and are handled below.)
            let mut flags = EmptyFlags::default();
            if b.is_eof() {
                flags.end = true;
                flags.end_line = true;
            } else if b.as_byte().map_or(false, |b| b == b'\n') {
                flags.end_line = true;
            }
            // A word boundary exists precisely when exactly one of the two
            // surrounding positions is an ASCII word byte.
            if is_word_last == is_word {
                flags.not_word_boundary = true;
            } else {
                flags.word_boundary = true;
            }
            // Now follow epsilon transitions from every NFA state, but make
            // sure we only follow transitions that satisfy our flags.
            qnext.clear();
            for &ip in &*qcur {
                self.follow_epsilons(usize_to_u32(ip), qnext, flags);
            }
            mem::swap(qcur, qnext);
        }

        // Now we set flags for immediately after the current byte. Since start
        // states are processed separately, and are the only states that can
        // have the StartText flag set, we therefore only need to worry about
        // the StartLine flag here.
        //
        // We do also keep track of whether this DFA state contains a NFA state
        // that is a matching state. This is precisely how we delay the DFA
        // matching by one byte in order to process the special EOF sentinel
        // byte. Namely, if this DFA state containing a matching NFA state,
        // then it is the *next* DFA state that is marked as a match.
        let mut empty_flags = EmptyFlags::default();
        let mut state_flags = StateFlags::default();
        empty_flags.start_line = b.as_byte().map_or(false, |b| b == b'\n');
        if b.is_ascii_word() {
            state_flags.set_word();
        }
        // Now follow all epsilon transitions again, but only after consuming
        // the current byte.
        qnext.clear();
        for &ip in &*qcur {
            match self.prog[ip as usize] {
                // These states never happen in a byte-based program.
                Char(_) | Ranges(_) => unreachable!(),
                // These states are handled when following epsilon transitions.
                Save(_) | Split(_) | EmptyLook(_) => {}
                Match(_) => {
                    state_flags.set_match();
                    // Leftmost-first semantics: once a match is seen, lower
                    // priority NFA states are dropped by stopping here.
                    if !self.continue_past_first_match() {
                        break;
                    } else if self.prog.matches.len() > 1
                        && !qnext.contains(ip as usize)
                    {
                        // If we are continuing on to find other matches,
                        // then keep a record of the match states we've seen.
                        qnext.insert(ip);
                    }
                }
                Bytes(ref inst) => {
                    if b.as_byte().map_or(false, |b| inst.matches(b)) {
                        self.follow_epsilons(
                            inst.goto as InstPtr,
                            qnext,
                            empty_flags,
                        );
                    }
                }
            }
        }

        let cache = if b.is_eof() && self.prog.matches.len() > 1 {
            // If we're processing the last byte of the input and we're
            // matching a regex set, then make the next state contain the
            // previous states transitions. We do this so that the main
            // matching loop can extract all of the match instructions.
            mem::swap(qcur, qnext);
            // And don't cache this state because it's totally bunk.
            false
        } else {
            true
        };

        // We've now built up the set of NFA states that ought to comprise the
        // next DFA state, so try to find it in the cache, and if it doesn't
        // exist, cache it.
        //
        // N.B. We pass `&mut si` here because the cache may clear itself if
        // it has gotten too full. When that happens, the location of the
        // current state may change.
        let mut next =
            match self.cached_state(qnext, state_flags, Some(&mut si)) {
                None => return None,
                Some(next) => next,
            };
        if (self.start & !STATE_START) == next {
            // Start states can never be match states since all matches are
            // delayed by one byte.
            debug_assert!(!self.state(next).flags().is_match());
            next = self.start_ptr(next);
        }
        if next <= STATE_MAX && self.state(next).flags().is_match() {
            next |= STATE_MATCH;
        }
        debug_assert!(next != STATE_UNKNOWN);
        // And now store our state in the current state's next list.
        if cache {
            let cls = self.byte_class(b);
            self.cache.trans.set_next(si, cls, next);
        }
        Some(next)
    }
1029
    /// Follows the epsilon transitions starting at (and including) `ip`. The
    /// resulting states are inserted into the ordered set `q`.
    ///
    /// Conditional epsilon transitions (i.e., empty width assertions) are only
    /// followed if they are satisfied by the given flags, which should
    /// represent the flags set at the current location in the input.
    ///
    /// If the current location corresponds to the empty string, then only the
    /// end line and/or end text flags may be set. If the current location
    /// corresponds to a real byte in the input, then only the start line
    /// and/or start text flags may be set.
    ///
    /// As an exception to the above, when finding the initial state, any of
    /// the above flags may be set:
    ///
    /// If matching starts at the beginning of the input, then start text and
    /// start line should be set. If the input is empty, then end text and end
    /// line should also be set.
    ///
    /// If matching starts after the beginning of the input, then only start
    /// line should be set if the preceding byte is `\n`. End line should never
    /// be set in this case. (Even if the following byte is a `\n`, it will
    /// be handled in a subsequent DFA state.)
    fn follow_epsilons(
        &mut self,
        ip: InstPtr,
        q: &mut SparseSet,
        flags: EmptyFlags,
    ) {
        use crate::prog::EmptyLook::*;
        use crate::prog::Inst::*;

        // We need to traverse the NFA to follow epsilon transitions, so avoid
        // recursion with an explicit stack.
        self.cache.stack.push(ip);
        while let Some(mut ip) = self.cache.stack.pop() {
            // Try to munch through as many states as possible without
            // pushes/pops to the stack.
            loop {
                // Don't visit states we've already added.
                if q.contains(ip as usize) {
                    break;
                }
                q.insert(ip as usize);
                match self.prog[ip as usize] {
                    Char(_) | Ranges(_) => unreachable!(),
                    Match(_) | Bytes(_) => {
                        break;
                    }
                    EmptyLook(ref inst) => {
                        // Only follow empty assertion states if our flags
                        // satisfy the assertion.
                        match inst.look {
                            StartLine if flags.start_line => {
                                ip = inst.goto as InstPtr;
                            }
                            EndLine if flags.end_line => {
                                ip = inst.goto as InstPtr;
                            }
                            StartText if flags.start => {
                                ip = inst.goto as InstPtr;
                            }
                            EndText if flags.end => {
                                ip = inst.goto as InstPtr;
                            }
                            WordBoundaryAscii if flags.word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            NotWordBoundaryAscii
                                if flags.not_word_boundary =>
                            {
                                ip = inst.goto as InstPtr;
                            }
                            WordBoundary if flags.word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            NotWordBoundary if flags.not_word_boundary => {
                                ip = inst.goto as InstPtr;
                            }
                            // An unsatisfied assertion: this path dies here,
                            // so do not follow its goto transition.
                            StartLine | EndLine | StartText | EndText
                            | WordBoundaryAscii | NotWordBoundaryAscii
                            | WordBoundary | NotWordBoundary => {
                                break;
                            }
                        }
                    }
                    Save(ref inst) => {
                        ip = inst.goto as InstPtr;
                    }
                    Split(ref inst) => {
                        // Follow goto1 inline (preserving priority order) and
                        // queue goto2 for later.
                        self.cache.stack.push(inst.goto2 as InstPtr);
                        ip = inst.goto1 as InstPtr;
                    }
                }
            }
        }
    }
1127
    /// Find a previously computed state matching the given set of instructions
    /// and is_match bool.
    ///
    /// The given set of instructions should represent a single state in the
    /// NFA along with all states reachable without consuming any input.
    ///
    /// The is_match bool should be true if and only if the preceding DFA state
    /// contains an NFA matching state. The cached state produced here will
    /// then signify a match. (This enables us to delay a match by one byte,
    /// in order to account for the EOF sentinel byte.)
    ///
    /// If the cache is full, then it is wiped before caching a new state.
    ///
    /// The current state should be specified if it exists, since it will need
    /// to be preserved if the cache clears itself. (Start states are
    /// always saved, so they should not be passed here.) It takes a mutable
    /// pointer to the index because if the cache is cleared, the state's
    /// location may change.
    ///
    /// Returns `None` if the DFA has given up (cache cleared too often).
    fn cached_state(
        &mut self,
        q: &SparseSet,
        mut state_flags: StateFlags,
        current_state: Option<&mut StatePtr>,
    ) -> Option<StatePtr> {
        // If we couldn't come up with a non-empty key to represent this state,
        // then it is dead and can never lead to a match.
        //
        // Note that inst_flags represent the set of empty width assertions
        // in q. We use this as an optimization in exec_byte to determine when
        // we should follow epsilon transitions at the empty string preceding
        // the current byte.
        let key = match self.cached_state_key(q, &mut state_flags) {
            None => return Some(STATE_DEAD),
            Some(v) => v,
        };
        // In the cache? Cool. Done.
        if let Some(si) = self.cache.compiled.get_ptr(&key) {
            return Some(si);
        }
        // If the cache has gotten too big, wipe it.
        if self.approximate_size() > self.prog.dfa_size_limit
            && !self.clear_cache_and_save(current_state)
        {
            // Ooops. DFA is giving up.
            return None;
        }
        // Allocate room for our state and add it.
        self.add_state(key)
    }
1177
    /// Produces a key suitable for describing a state in the DFA cache.
    ///
    /// The key invariant here is that equivalent keys are produced for any two
    /// sets of ordered NFA states (and toggling of whether the previous NFA
    /// states contain a match state) that do not discriminate a match for any
    /// input.
    ///
    /// Specifically, q should be an ordered set of NFA states and is_match
    /// should be true if and only if the previous NFA states contained a match
    /// state.
    ///
    /// Returns `None` if the state is dead, i.e., it has no transitions and
    /// is not a match state.
    fn cached_state_key(
        &mut self,
        q: &SparseSet,
        state_flags: &mut StateFlags,
    ) -> Option<State> {
        use crate::prog::Inst::*;

        // We need to build up enough information to recognize pre-built states
        // in the DFA. Generally speaking, this includes every instruction
        // except for those which are purely epsilon transitions, e.g., the
        // Save and Split instructions.
        //
        // Empty width assertions are also epsilon transitions, but since they
        // are conditional, we need to make them part of a state's key in the
        // cache.

        // Reuse a scratch buffer to avoid an allocation per state key.
        let mut insts =
            mem::replace(&mut self.cache.insts_scratch_space, vec![]);
        insts.clear();
        // Reserve 1 byte for flags.
        insts.push(0);

        // Instruction pointers are delta-encoded as varints, so track the
        // previously pushed pointer.
        let mut prev = 0;
        for &ip in q {
            let ip = usize_to_u32(ip);
            match self.prog[ip as usize] {
                Char(_) | Ranges(_) => unreachable!(),
                Save(_) | Split(_) => {}
                Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
                EmptyLook(_) => {
                    state_flags.set_empty();
                    push_inst_ptr(&mut insts, &mut prev, ip)
                }
                Match(_) => {
                    push_inst_ptr(&mut insts, &mut prev, ip);
                    if !self.continue_past_first_match() {
                        break;
                    }
                }
            }
        }
        // If we couldn't transition to any other instructions and we didn't
        // see a match when expanding NFA states previously, then this is a
        // dead state and no amount of additional input can transition out
        // of this state.
        let opt_state = if insts.len() == 1 && !state_flags.is_match() {
            None
        } else {
            let StateFlags(f) = *state_flags;
            insts[0] = f;
            Some(State { data: Arc::from(&*insts) })
        };
        // Hand the scratch buffer back for the next call.
        self.cache.insts_scratch_space = insts;
        opt_state
    }
1243
1244 /// Clears the cache, but saves and restores current_state if it is not
1245 /// none.
1246 ///
1247 /// The current state must be provided here in case its location in the
1248 /// cache changes.
1249 ///
1250 /// This returns false if the cache is not cleared and the DFA should
1251 /// give up.
1252 fn clear_cache_and_save(
1253 &mut self,
1254 current_state: Option<&mut StatePtr>,
1255 ) -> bool {
1256 if self.cache.compiled.is_empty() {
1257 // Nothing to clear...
1258 return true;
1259 }
1260 match current_state {
1261 None => self.clear_cache(),
1262 Some(si) => {
1263 let cur = self.state(*si).clone();
1264 if !self.clear_cache() {
1265 return false;
1266 }
1267 // The unwrap is OK because we just cleared the cache and
1268 // therefore know that the next state pointer won't exceed
1269 // STATE_MAX.
1270 *si = self.restore_state(cur).unwrap();
1271 true
1272 }
1273 }
1274 }
1275
    /// Wipes the state cache, but saves and restores the current start state.
    ///
    /// This returns false if the cache is not cleared and the DFA should
    /// give up.
    fn clear_cache(&mut self) -> bool {
        // Bail out of the DFA if we're moving too "slowly."
        // A heuristic from RE2: assume the DFA is too slow if it is processing
        // 10 or fewer bytes per state.
        // Additionally, we permit the cache to be flushed a few times before
        // calling it quits.
        let nstates = self.cache.compiled.len();
        if self.cache.flush_count >= 3
            && self.at >= self.last_cache_flush
            && (self.at - self.last_cache_flush) <= 10 * nstates
        {
            return false;
        }
        // Update statistics tracking cache flushes.
        self.last_cache_flush = self.at;
        self.cache.flush_count += 1;

        // OK, actually flush the cache.
        // Save the start state (and the last-match state, if any) first, so
        // they can be re-inserted into the fresh cache below.
        let start = self.state(self.start & !STATE_START).clone();
        let last_match = if self.last_match_si <= STATE_MAX {
            Some(self.state(self.last_match_si).clone())
        } else {
            None
        };
        self.cache.reset_size();
        self.cache.trans.clear();
        self.cache.compiled.clear();
        for s in &mut self.cache.start_states {
            *s = STATE_UNKNOWN;
        }
        // The unwraps are OK because we just cleared the cache and therefore
        // know that the next state pointer won't exceed STATE_MAX.
        let start_ptr = self.restore_state(start).unwrap();
        self.start = self.start_ptr(start_ptr);
        if let Some(last_match) = last_match {
            self.last_match_si = self.restore_state(last_match).unwrap();
        }
        true
    }
1319
1320 /// Restores the given state back into the cache, and returns a pointer
1321 /// to it.
1322 fn restore_state(&mut self, state: State) -> Option<StatePtr> {
1323 // If we've already stored this state, just return a pointer to it.
1324 // None will be the wiser.
1325 if let Some(si) = self.cache.compiled.get_ptr(&state) {
1326 return Some(si);
1327 }
1328 self.add_state(state)
1329 }
1330
1331 /// Returns the next state given the current state si and current byte
1332 /// b. {qcur,qnext} are used as scratch space for storing ordered NFA
1333 /// states.
1334 ///
1335 /// This tries to fetch the next state from the cache, but if that fails,
1336 /// it computes the next state, caches it and returns a pointer to it.
1337 ///
1338 /// The pointer can be to a real state, or it can be STATE_DEAD.
1339 /// STATE_UNKNOWN cannot be returned.
1340 ///
1341 /// None is returned if a new state could not be allocated (i.e., the DFA
1342 /// ran out of space and thinks it's running too slowly).
1343 fn next_state(
1344 &mut self,
1345 qcur: &mut SparseSet,
1346 qnext: &mut SparseSet,
1347 si: StatePtr,
1348 b: Byte,
1349 ) -> Option<StatePtr> {
1350 if si == STATE_DEAD {
1351 return Some(STATE_DEAD);
1352 }
1353 match self.cache.trans.next(si, self.byte_class(b)) {
1354 STATE_UNKNOWN => self.exec_byte(qcur, qnext, si, b),
1355 STATE_QUIT => None,
1356 nsi => Some(nsi),
1357 }
1358 }
1359
    /// Computes and returns the start state, where searching begins at
    /// position `at` in `text`. If the state has already been computed,
    /// then it is pulled from the cache. If the state hasn't been cached,
    /// then it is computed, cached and a pointer to it is returned.
    ///
    /// This may return STATE_DEAD but never STATE_UNKNOWN.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn start_state(
        &mut self,
        q: &mut SparseSet,
        empty_flags: EmptyFlags,
        state_flags: StateFlags,
    ) -> Option<StatePtr> {
        // Compute an index into our cache of start states based on the set
        // of empty/state flags set at the current position in the input. We
        // don't use every flag since not all flags matter. For example, since
        // matches are delayed by one byte, start states can never be match
        // states.
        //
        // Each flag contributes one bit, so the index ranges over [0, 128).
        let flagi = {
            (((empty_flags.start as u8) << 0)
                | ((empty_flags.end as u8) << 1)
                | ((empty_flags.start_line as u8) << 2)
                | ((empty_flags.end_line as u8) << 3)
                | ((empty_flags.word_boundary as u8) << 4)
                | ((empty_flags.not_word_boundary as u8) << 5)
                | ((state_flags.is_word() as u8) << 6)) as usize
        };
        // STATE_UNKNOWN means this start-state slot hasn't been computed yet.
        match self.cache.start_states[flagi] {
            STATE_UNKNOWN => {}
            si => return Some(si),
        }
        q.clear();
        let start = usize_to_u32(self.prog.start);
        self.follow_epsilons(start, q, empty_flags);
        // Start states can never be match states because we delay every match
        // by one byte. Given an empty string and an empty match, the match
        // won't actually occur until the DFA processes the special EOF
        // sentinel byte.
        let sp = match self.cached_state(q, state_flags, None) {
            None => return None,
            Some(sp) => self.start_ptr(sp),
        };
        self.cache.start_states[flagi] = sp;
        Some(sp)
    }
1405
1406 /// Computes the set of starting flags for the given position in text.
1407 ///
1408 /// This should only be used when executing the DFA forwards over the
1409 /// input.
1410 fn start_flags(&self, text: &[u8], at: usize) -> (EmptyFlags, StateFlags) {
1411 let mut empty_flags = EmptyFlags::default();
1412 let mut state_flags = StateFlags::default();
1413 empty_flags.start = at == 0;
1414 empty_flags.end = text.is_empty();
1415 empty_flags.start_line = at == 0 || text[at - 1] == b'\n';
1416 empty_flags.end_line = text.is_empty();
1417
1418 let is_word_last = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1419 let is_word = at < text.len() && Byte::byte(text[at]).is_ascii_word();
1420 if is_word_last {
1421 state_flags.set_word();
1422 }
1423 if is_word == is_word_last {
1424 empty_flags.not_word_boundary = true;
1425 } else {
1426 empty_flags.word_boundary = true;
1427 }
1428 (empty_flags, state_flags)
1429 }
1430
1431 /// Computes the set of starting flags for the given position in text.
1432 ///
1433 /// This should only be used when executing the DFA in reverse over the
1434 /// input.
1435 fn start_flags_reverse(
1436 &self,
1437 text: &[u8],
1438 at: usize,
1439 ) -> (EmptyFlags, StateFlags) {
1440 let mut empty_flags = EmptyFlags::default();
1441 let mut state_flags = StateFlags::default();
1442 empty_flags.start = at == text.len();
1443 empty_flags.end = text.is_empty();
1444 empty_flags.start_line = at == text.len() || text[at] == b'\n';
1445 empty_flags.end_line = text.is_empty();
1446
1447 let is_word_last =
1448 at < text.len() && Byte::byte(text[at]).is_ascii_word();
1449 let is_word = at > 0 && Byte::byte(text[at - 1]).is_ascii_word();
1450 if is_word_last {
1451 state_flags.set_word();
1452 }
1453 if is_word == is_word_last {
1454 empty_flags.not_word_boundary = true;
1455 } else {
1456 empty_flags.word_boundary = true;
1457 }
1458 (empty_flags, state_flags)
1459 }
1460
    /// Returns a reference to a State given a pointer to it.
    ///
    /// Panics if `si` does not correspond to a state that was previously
    /// added to the cache.
    fn state(&self, si: StatePtr) -> &State {
        self.cache.compiled.get_state(si).unwrap()
    }
1465
    /// Adds the given state to the DFA.
    ///
    /// This allocates room for transitions out of this state in
    /// self.cache.trans. The transitions can be set with the returned
    /// StatePtr.
    ///
    /// If None is returned, then the state limit was reached and the DFA
    /// should quit.
    fn add_state(&mut self, state: State) -> Option<StatePtr> {
        // This will fail if the next state pointer exceeds STATE_PTR. In
        // practice, the cache limit will prevent us from ever getting here,
        // but maybe callers will set the cache size to something ridiculous...
        let si = match self.cache.trans.add() {
            None => return None,
            Some(si) => si,
        };
        // If the program has a Unicode word boundary, then set any transitions
        // for non-ASCII bytes to STATE_QUIT. If the DFA stumbles over such a
        // transition, then it will quit and an alternative matching engine
        // will take over.
        if self.prog.has_unicode_word_boundary {
            for b in 128..256 {
                let cls = self.byte_class(Byte::byte(b as u8));
                self.cache.trans.set_next(si, cls, STATE_QUIT);
            }
        }
        // Finally, put our actual state on to our heap of states and index it
        // so we can find it later.
        //
        // The size accounting here feeds `approximate_size`, which decides
        // when the cache must be wiped.
        self.cache.size += self.cache.trans.state_heap_size()
            + state.data.len()
            + (2 * mem::size_of::<State>())
            + mem::size_of::<StatePtr>();
        self.cache.compiled.insert(state, si);
        // Transition table and set of states and map should all be in sync.
        debug_assert!(
            self.cache.compiled.len() == self.cache.trans.num_states()
        );
        Some(si)
    }
1505
1506 /// Quickly finds the next occurrence of any literal prefixes in the regex.
1507 /// If there are no literal prefixes, then the current position is
1508 /// returned. If there are literal prefixes and one could not be found,
1509 /// then None is returned.
1510 ///
1511 /// This should only be called when the DFA is in a start state.
1512 fn prefix_at(&self, text: &[u8], at: usize) -> Option<usize> {
1513 self.prog.prefixes.find(&text[at..]).map(|(s, _)| at + s)
1514 }
1515
1516 /// Returns the number of byte classes required to discriminate transitions
1517 /// in each state.
1518 ///
1519 /// invariant: num_byte_classes() == len(State.next)
1520 fn num_byte_classes(&self) -> usize {
1521 // We add 1 to account for the special EOF byte.
1522 (self.prog.byte_classes[255] as usize + 1) + 1
1523 }
1524
1525 /// Given an input byte or the special EOF sentinel, return its
1526 /// corresponding byte class.
1527 #[cfg_attr(feature = "perf-inline", inline(always))]
1528 fn byte_class(&self, b: Byte) -> usize {
1529 match b.as_byte() {
1530 None => self.num_byte_classes() - 1,
1531 Some(b) => self.u8_class(b),
1532 }
1533 }
1534
1535 /// Like byte_class, but explicitly for u8s.
1536 #[cfg_attr(feature = "perf-inline", inline(always))]
1537 fn u8_class(&self, b: u8) -> usize {
1538 self.prog.byte_classes[b as usize] as usize
1539 }
1540
1541 /// Returns true if the DFA should continue searching past the first match.
1542 ///
1543 /// Leftmost first semantics in the DFA are preserved by not following NFA
1544 /// transitions after the first match is seen.
1545 ///
1546 /// On occasion, we want to avoid leftmost first semantics to find either
1547 /// the longest match (for reverse search) or all possible matches (for
1548 /// regex sets).
1549 fn continue_past_first_match(&self) -> bool {
1550 self.prog.is_reverse || self.prog.matches.len() > 1
1551 }
1552
1553 /// Returns true if there is a prefix we can quickly search for.
1554 fn has_prefix(&self) -> bool {
1555 !self.prog.is_reverse
1556 && !self.prog.prefixes.is_empty()
1557 && !self.prog.is_anchored_start
1558 }
1559
1560 /// Sets the STATE_START bit in the given state pointer if and only if
1561 /// we have a prefix to scan for.
1562 ///
1563 /// If there's no prefix, then it's a waste to treat the start state
1564 /// specially.
1565 fn start_ptr(&self, si: StatePtr) -> StatePtr {
1566 if self.has_prefix() {
1567 si | STATE_START
1568 } else {
1569 si
1570 }
1571 }
1572
    /// Approximate size returns the approximate heap space currently used by
    /// the DFA. It is used to determine whether the DFA's state cache needs to
    /// be wiped. Namely, it is possible that for certain regexes on certain
    /// inputs, a new state could be created for every byte of input. (This is
    /// bad for memory use, so we bound it with a cache.)
    ///
    /// The value is maintained incrementally by `add_state` and reset when
    /// the cache is cleared.
    fn approximate_size(&self) -> usize {
        self.cache.size
    }
1581}
1582
/// An abstraction for representing a map of states. The map supports two
/// different ways of state lookup. One is fast constant time access via a
/// state pointer. The other is a hashmap lookup based on the DFA's
/// constituent NFA states.
///
/// A DFA state internally uses an Arc such that we only need to store the
/// set of NFA states on the heap once, even though we support looking up
/// states by two different means. A more natural way to express this might
/// use raw pointers, but an Arc is safe and effectively achieves the same
/// thing.
#[derive(Debug)]
struct StateMap {
    /// The keys are not actually static but rely on always pointing to a
    /// buffer in `states` which will never be moved except when clearing
    /// the map or on drop, in which case the keys of this map will be
    /// removed before the buffer they point into is freed.
    map: HashMap<State, StatePtr>,
    /// Our set of states. Note that `StatePtr / num_byte_classes` indexes
    /// this Vec rather than just a `StatePtr`.
    states: Vec<State>,
    /// The number of byte classes in the DFA. Used to index `states`.
    num_byte_classes: usize,
}
1606
1607impl StateMap {
1608 fn new(num_byte_classes: usize) -> StateMap {
1609 StateMap { map: HashMap::new(), states: vec![], num_byte_classes }
1610 }
1611
1612 fn len(&self) -> usize {
1613 self.states.len()
1614 }
1615
1616 fn is_empty(&self) -> bool {
1617 self.states.is_empty()
1618 }
1619
1620 fn get_ptr(&self, state: &State) -> Option<StatePtr> {
1621 self.map.get(state).cloned()
1622 }
1623
1624 fn get_state(&self, si: StatePtr) -> Option<&State> {
1625 self.states.get(si as usize / self.num_byte_classes)
1626 }
1627
1628 fn insert(&mut self, state: State, si: StatePtr) {
1629 self.map.insert(state.clone(), si);
1630 self.states.push(state);
1631 }
1632
1633 fn clear(&mut self) {
1634 self.map.clear();
1635 self.states.clear();
1636 }
1637}
1638
impl Transitions {
    /// Create a new transition table.
    ///
    /// The number of byte classes corresponds to the stride. Every state will
    /// have `num_byte_classes` slots for transitions.
    fn new(num_byte_classes: usize) -> Transitions {
        Transitions { table: vec![], num_byte_classes }
    }

    /// Returns the total number of states currently in this table.
    fn num_states(&self) -> usize {
        self.table.len() / self.num_byte_classes
    }

    /// Allocates room for one additional state and returns a pointer to it.
    ///
    /// If there's no more room, None is returned.
    fn add(&mut self) -> Option<StatePtr> {
        let si = self.table.len();
        if si > STATE_MAX as usize {
            return None;
        }
        // New transitions start out unknown; they're filled in lazily.
        self.table.extend(repeat(STATE_UNKNOWN).take(self.num_byte_classes));
        Some(usize_to_u32(si))
    }

    /// Clears the table of all states.
    fn clear(&mut self) {
        self.table.clear();
    }

    /// Sets the transition from (si, cls) to next.
    ///
    /// Panics if `si + cls` is out of bounds of the table.
    fn set_next(&mut self, si: StatePtr, cls: usize, next: StatePtr) {
        self.table[si as usize + cls] = next;
    }

    /// Returns the transition corresponding to (si, cls).
    ///
    /// Panics if `si + cls` is out of bounds of the table.
    fn next(&self, si: StatePtr, cls: usize) -> StatePtr {
        self.table[si as usize + cls]
    }

    /// The heap size, in bytes, of a single state in the transition table.
    fn state_heap_size(&self) -> usize {
        self.num_byte_classes * mem::size_of::<StatePtr>()
    }

    /// Like `next`, but uses unchecked access and is therefore not safe.
    ///
    /// # Safety
    ///
    /// Callers must ensure `si` is a valid state pointer from this table and
    /// `cls < self.num_byte_classes`.
    unsafe fn next_unchecked(&self, si: StatePtr, cls: usize) -> StatePtr {
        debug_assert!((si as usize) < self.table.len());
        debug_assert!(cls < self.num_byte_classes);
        *self.table.get_unchecked(si as usize + cls)
    }
}
1692
1693impl StateFlags {
1694 fn is_match(&self) -> bool {
1695 self.0 & 0b0000_0001 > 0
1696 }
1697
1698 fn set_match(&mut self) {
1699 self.0 |= 0b0000_0001;
1700 }
1701
1702 fn is_word(&self) -> bool {
1703 self.0 & 0b0000_0010 > 0
1704 }
1705
1706 fn set_word(&mut self) {
1707 self.0 |= 0b0000_0010;
1708 }
1709
1710 fn has_empty(&self) -> bool {
1711 self.0 & 0b0000_0100 > 0
1712 }
1713
1714 fn set_empty(&mut self) {
1715 self.0 |= 0b0000_0100;
1716 }
1717}
1718
1719impl Byte {
1720 fn byte(b: u8) -> Self {
1721 Byte(b as u16)
1722 }
1723 fn eof() -> Self {
1724 Byte(256)
1725 }
1726 fn is_eof(&self) -> bool {
1727 self.0 == 256
1728 }
1729
1730 fn is_ascii_word(&self) -> bool {
1731 let b = match self.as_byte() {
1732 None => return false,
1733 Some(b) => b,
1734 };
1735 match b {
1736 b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => true,
1737 _ => false,
1738 }
1739 }
1740
1741 fn as_byte(&self) -> Option<u8> {
1742 if self.is_eof() {
1743 None
1744 } else {
1745 Some(self.0 as u8)
1746 }
1747 }
1748}
1749
1750impl fmt::Debug for State {
1751 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1752 let ips: Vec<usize> = self.inst_ptrs().collect();
1753 f&mut DebugStruct<'_, '_>.debug_struct("State")
1754 .field("flags", &self.flags())
1755 .field(name:"insts", &ips)
1756 .finish()
1757 }
1758}
1759
1760impl fmt::Debug for Transitions {
1761 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1762 let mut fmtd: DebugMap<'_, '_> = f.debug_map();
1763 for si: usize in 0..self.num_states() {
1764 let s: usize = si * self.num_byte_classes;
1765 let e: usize = s + self.num_byte_classes;
1766 fmtd.entry(&si.to_string(), &TransitionsRow(&self.table[s..e]));
1767 }
1768 fmtd.finish()
1769 }
1770}
1771
/// Helper for debug-formatting a single state's row of the transition table.
struct TransitionsRow<'a>(&'a [StatePtr]);
1773
1774impl<'a> fmt::Debug for TransitionsRow<'a> {
1775 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1776 let mut fmtd: DebugMap<'_, '_> = f.debug_map();
1777 for (b: usize, si: &u32) in self.0.iter().enumerate() {
1778 match *si {
1779 STATE_UNKNOWN => {}
1780 STATE_DEAD => {
1781 fmtd.entry(&vb(b as usize), &"DEAD");
1782 }
1783 si: u32 => {
1784 fmtd.entry(&vb(b as usize), &si.to_string());
1785 }
1786 }
1787 }
1788 fmtd.finish()
1789 }
1790}
1791
1792impl fmt::Debug for StateFlags {
1793 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1794 f&mut DebugStruct<'_, '_>.debug_struct("StateFlags")
1795 .field("is_match", &self.is_match())
1796 .field("is_word", &self.is_word())
1797 .field(name:"has_empty", &self.has_empty())
1798 .finish()
1799 }
1800}
1801
/// Helper function for formatting a byte as a nice-to-read escaped string.
///
/// Values above `u8::MAX` are interpreted as the EOF sentinel.
fn vb(b: usize) -> String {
    use std::ascii::escape_default;

    if b > ::std::u8::MAX as usize {
        "EOF".to_owned()
    } else {
        let escaped = escape_default(b as u8).collect::<Vec<u8>>();
        // `escape_default` only yields printable ASCII, so the lossy
        // conversion never actually replaces anything.
        String::from_utf8_lossy(&escaped).into_owned()
    }
}
1813
/// Converts `n` to a `u32`, panicking if it does not fit.
///
/// Overflow here indicates a bug elsewhere in the DFA, hence the panic
/// rather than a `Result`.
fn usize_to_u32(n: usize) -> u32 {
    if (n as u64) <= (::std::u32::MAX as u64) {
        return n as u32;
    }
    panic!("BUG: {} is too big to fit into u32", n)
}
1820
1821#[allow(dead_code)] // useful for debugging
1822fn show_state_ptr(si: StatePtr) -> String {
1823 let mut s: String = format!("{:?}", si & STATE_MAX);
1824 if si == STATE_UNKNOWN {
1825 s = format!("{} (unknown)", s);
1826 }
1827 if si == STATE_DEAD {
1828 s = format!("{} (dead)", s);
1829 }
1830 if si == STATE_QUIT {
1831 s = format!("{} (quit)", s);
1832 }
1833 if si & STATE_START > 0 {
1834 s = format!("{} (start)", s);
1835 }
1836 if si & STATE_MATCH > 0 {
1837 s = format!("{} (match)", s);
1838 }
1839 s
1840}
1841
1842/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1843fn write_vari32(data: &mut Vec<u8>, n: i32) {
1844 let mut un: u32 = (n as u32) << 1;
1845 if n < 0 {
1846 un = !un;
1847 }
1848 write_varu32(data, n:un)
1849}
1850
1851/// https://developers.google.com/protocol-buffers/docs/encoding#varints
1852fn read_vari32(data: &[u8]) -> (i32, usize) {
1853 let (un: u32, i: usize) = read_varu32(data);
1854 let mut n: i32 = (un >> 1) as i32;
1855 if un & 1 != 0 {
1856 n = !n;
1857 }
1858 (n, i)
1859}
1860
/// Writes `n` to `data` as an LEB128-style varint, seven bits per byte
/// with the high bit marking continuation.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
    loop {
        if n < 0b1000_0000 {
            data.push(n as u8);
            return;
        }
        data.push((n as u8) | 0b1000_0000);
        n >>= 7;
    }
}
1869
/// Reads a varint from `data`, returning the decoded value and the number
/// of bytes consumed. Returns `(0, 0)` if `data` ends mid-varint.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_varu32(data: &[u8]) -> (u32, usize) {
    let mut n: u32 = 0;
    let mut shift: u32 = 0;
    for (i, &b) in data.iter().enumerate() {
        if b < 0b1000_0000 {
            // High bit clear: this is the final byte of the varint.
            return (n | ((b as u32) << shift), i + 1);
        }
        n |= ((b as u32) & 0b0111_1111) << shift;
        shift += 7;
    }
    (0, 0)
}
1883
#[cfg(test)]
mod tests {

    use super::{
        push_inst_ptr, read_vari32, read_varu32, write_vari32, write_varu32,
        State, StateFlags,
    };
    use quickcheck::{quickcheck, Gen, QuickCheck};
    use std::sync::Arc;

    /// Encoding a state and decoding it again must round-trip both the
    /// instruction pointers and the flags byte.
    #[test]
    fn prop_state_encode_decode() {
        fn p(mut ips: Vec<u32>, flags: u8) -> bool {
            // It looks like our encoding scheme can't handle instruction
            // pointers at or above 2**31. We should fix that, but it seems
            // unlikely to occur in real code due to the amount of memory
            // required for such a state machine. So for now, we just clamp
            // our test data.
            for ip in ips.iter_mut() {
                if *ip >= 1 << 31 {
                    *ip = (1 << 31) - 1;
                }
            }
            let mut encoded = vec![flags];
            let mut prev = 0;
            for &ip in &ips {
                push_inst_ptr(&mut encoded, &mut prev, ip);
            }
            let state = State { data: Arc::from(&encoded[..]) };

            let want: Vec<usize> =
                ips.iter().map(|&ip| ip as usize).collect();
            state.inst_ptrs().collect::<Vec<usize>>() == want
                && state.flags() == StateFlags(flags)
        }
        QuickCheck::new()
            .gen(Gen::new(10_000))
            .quickcheck(p as fn(Vec<u32>, u8) -> bool);
    }

    /// Unsigned varints must round-trip and report the exact byte count.
    #[test]
    fn prop_read_write_u32() {
        fn p(n: u32) -> bool {
            let mut buf = Vec::new();
            write_varu32(&mut buf, n);
            let (got, nread) = read_varu32(&buf);
            got == n && nread == buf.len()
        }
        quickcheck(p as fn(u32) -> bool);
    }

    /// Signed (zig-zag) varints must round-trip and report the exact
    /// byte count.
    #[test]
    fn prop_read_write_i32() {
        fn p(n: i32) -> bool {
            let mut buf = Vec::new();
            write_vari32(&mut buf, n);
            let (got, nread) = read_vari32(&buf);
            got == n && nread == buf.len()
        }
        quickcheck(p as fn(i32) -> bool);
    }
}
1946