1// This module implements the Pike VM. That is, it guarantees linear time
2// search of a regex on any text with memory use proportional to the size of
3// the regex.
4//
5// It is equal in power to the backtracking engine in this crate, except the
6// backtracking engine is typically faster on small regexes/texts at the
7// expense of a bigger memory footprint.
8//
9// It can do more than the DFA can (specifically, record capture locations
10// and execute Unicode word boundary assertions), but at a slower speed.
11// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
12// epsilon transitions. That is, the Pike VM engine can be in multiple states
13// at once where as the DFA is only ever in one state at a time.
14//
15// Therefore, the Pike VM is generally treated as the fallback when the other
16// matching engines either aren't feasible to run or are insufficient.
17
18use std::mem;
19
20use crate::exec::ProgramCache;
21use crate::input::{Input, InputAt};
22use crate::prog::{InstPtr, Program};
23use crate::re_trait::Slot;
24use crate::sparse::SparseSet;
25
26/// An NFA simulation matching engine.
27#[derive(Debug)]
28pub struct Fsm<'r, I> {
29 /// The sequence of opcodes (among other things) that is actually executed.
30 ///
31 /// The program may be byte oriented or Unicode codepoint oriented.
32 prog: &'r Program,
33 /// An explicit stack used for following epsilon transitions. (This is
34 /// borrowed from the cache.)
35 stack: &'r mut Vec<FollowEpsilon>,
36 /// The input to search.
37 input: I,
38}
39
40/// A cached allocation that can be reused on each execution.
41#[derive(Clone, Debug)]
42pub struct Cache {
43 /// A pair of ordered sets for tracking NFA states.
44 clist: Threads,
45 nlist: Threads,
46 /// An explicit stack used for following epsilon transitions.
47 stack: Vec<FollowEpsilon>,
48}
49
50/// An ordered set of NFA states and their captures.
51#[derive(Clone, Debug)]
52struct Threads {
53 /// An ordered set of opcodes (each opcode is an NFA state).
54 set: SparseSet,
55 /// Captures for every NFA state.
56 ///
57 /// It is stored in row-major order, where the columns are the capture
58 /// slots and the rows are the states.
59 caps: Vec<Slot>,
60 /// The number of capture slots stored per thread. (Every capture has
61 /// two slots.)
62 slots_per_thread: usize,
63}
64
65/// A representation of an explicit stack frame when following epsilon
66/// transitions. This is used to avoid recursion.
67#[derive(Clone, Debug)]
68enum FollowEpsilon {
69 /// Follow transitions at the given instruction pointer.
70 IP(InstPtr),
71 /// Restore the capture slot with the given position in the input.
72 Capture { slot: usize, pos: Slot },
73}
74
75impl Cache {
76 /// Create a new allocation used by the NFA machine to record execution
77 /// and captures.
78 pub fn new(_prog: &Program) -> Self {
79 Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
80 }
81}
82
83impl<'r, I: Input> Fsm<'r, I> {
84 /// Execute the NFA matching engine.
85 ///
86 /// If there's a match, `exec` returns `true` and populates the given
87 /// captures accordingly.
88 pub fn exec(
89 prog: &'r Program,
90 cache: &ProgramCache,
91 matches: &mut [bool],
92 slots: &mut [Slot],
93 quit_after_match: bool,
94 input: I,
95 start: usize,
96 end: usize,
97 ) -> bool {
98 let mut cache = cache.borrow_mut();
99 let cache = &mut cache.pikevm;
100 cache.clist.resize(prog.len(), prog.captures.len());
101 cache.nlist.resize(prog.len(), prog.captures.len());
102 let at = input.at(start);
103 Fsm { prog, stack: &mut cache.stack, input }.exec_(
104 &mut cache.clist,
105 &mut cache.nlist,
106 matches,
107 slots,
108 quit_after_match,
109 at,
110 end,
111 )
112 }
113
114 fn exec_(
115 &mut self,
116 mut clist: &mut Threads,
117 mut nlist: &mut Threads,
118 matches: &mut [bool],
119 slots: &mut [Slot],
120 quit_after_match: bool,
121 mut at: InputAt,
122 end: usize,
123 ) -> bool {
124 let mut matched = false;
125 let mut all_matched = false;
126 clist.set.clear();
127 nlist.set.clear();
128 'LOOP: loop {
129 if clist.set.is_empty() {
130 // Three ways to bail out when our current set of threads is
131 // empty.
132 //
133 // 1. We have a match---so we're done exploring any possible
134 // alternatives. Time to quit. (We can't do this if we're
135 // looking for matches for multiple regexes, unless we know
136 // they all matched.)
137 //
138 // 2. If the expression starts with a '^' we can terminate as
139 // soon as the last thread dies.
140 if (matched && matches.len() <= 1)
141 || all_matched
142 || (!at.is_start() && self.prog.is_anchored_start)
143 {
144 break;
145 }
146
147 // 3. If there's a literal prefix for the program, try to
148 // jump ahead quickly. If it can't be found, then we can
149 // bail out early.
150 if !self.prog.prefixes.is_empty() {
151 at = match self.input.prefix_at(&self.prog.prefixes, at) {
152 None => break,
153 Some(at) => at,
154 };
155 }
156 }
157
158 // This simulates a preceding '.*?' for every regex by adding
159 // a state starting at the current position in the input for the
160 // beginning of the program only if we don't already have a match.
161 if clist.set.is_empty()
162 || (!self.prog.is_anchored_start && !all_matched)
163 {
164 self.add(&mut clist, slots, 0, at);
165 }
166 // The previous call to "add" actually inspects the position just
167 // before the current character. For stepping through the machine,
168 // we can to look at the current character, so we advance the
169 // input.
170 let at_next = self.input.at(at.next_pos());
171 for i in 0..clist.set.len() {
172 let ip = clist.set[i];
173 if self.step(
174 &mut nlist,
175 matches,
176 slots,
177 clist.caps(ip),
178 ip,
179 at,
180 at_next,
181 ) {
182 matched = true;
183 all_matched = all_matched || matches.iter().all(|&b| b);
184 if quit_after_match {
185 // If we only care if a match occurs (not its
186 // position), then we can quit right now.
187 break 'LOOP;
188 }
189 if self.prog.matches.len() == 1 {
190 // We don't need to check the rest of the threads
191 // in this set because we've matched something
192 // ("leftmost-first"). However, we still need to check
193 // threads in the next set to support things like
194 // greedy matching.
195 //
196 // This is only true on normal regexes. For regex sets,
197 // we need to mush on to observe other matches.
198 break;
199 }
200 }
201 }
202 if at.pos() >= end {
203 break;
204 }
205 at = at_next;
206 mem::swap(clist, nlist);
207 nlist.set.clear();
208 }
209 matched
210 }
211
212 /// Step through the input, one token (byte or codepoint) at a time.
213 ///
214 /// nlist is the set of states that will be processed on the next token
215 /// in the input.
216 ///
217 /// caps is the set of captures passed by the caller of the NFA. They are
218 /// written to only when a match state is visited.
219 ///
220 /// thread_caps is the set of captures set for the current NFA state, ip.
221 ///
222 /// at and at_next are the current and next positions in the input. at or
223 /// at_next may be EOF.
224 fn step(
225 &mut self,
226 nlist: &mut Threads,
227 matches: &mut [bool],
228 slots: &mut [Slot],
229 thread_caps: &mut [Option<usize>],
230 ip: usize,
231 at: InputAt,
232 at_next: InputAt,
233 ) -> bool {
234 use crate::prog::Inst::*;
235 match self.prog[ip] {
236 Match(match_slot) => {
237 if match_slot < matches.len() {
238 matches[match_slot] = true;
239 }
240 for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
241 *slot = *val;
242 }
243 true
244 }
245 Char(ref inst) => {
246 if inst.c == at.char() {
247 self.add(nlist, thread_caps, inst.goto, at_next);
248 }
249 false
250 }
251 Ranges(ref inst) => {
252 if inst.matches(at.char()) {
253 self.add(nlist, thread_caps, inst.goto, at_next);
254 }
255 false
256 }
257 Bytes(ref inst) => {
258 if let Some(b) = at.byte() {
259 if inst.matches(b) {
260 self.add(nlist, thread_caps, inst.goto, at_next);
261 }
262 }
263 false
264 }
265 EmptyLook(_) | Save(_) | Split(_) => false,
266 }
267 }
268
269 /// Follows epsilon transitions and adds them for processing to nlist,
270 /// starting at and including ip.
271 fn add(
272 &mut self,
273 nlist: &mut Threads,
274 thread_caps: &mut [Option<usize>],
275 ip: usize,
276 at: InputAt,
277 ) {
278 self.stack.push(FollowEpsilon::IP(ip));
279 while let Some(frame) = self.stack.pop() {
280 match frame {
281 FollowEpsilon::IP(ip) => {
282 self.add_step(nlist, thread_caps, ip, at);
283 }
284 FollowEpsilon::Capture { slot, pos } => {
285 thread_caps[slot] = pos;
286 }
287 }
288 }
289 }
290
291 /// A helper function for add that avoids excessive pushing to the stack.
292 fn add_step(
293 &mut self,
294 nlist: &mut Threads,
295 thread_caps: &mut [Option<usize>],
296 mut ip: usize,
297 at: InputAt,
298 ) {
299 // Instead of pushing and popping to the stack, we mutate ip as we
300 // traverse the set of states. We only push to the stack when we
301 // absolutely need recursion (restoring captures or following a
302 // branch).
303 use crate::prog::Inst::*;
304 loop {
305 // Don't visit states we've already added.
306 if nlist.set.contains(ip) {
307 return;
308 }
309 nlist.set.insert(ip);
310 match self.prog[ip] {
311 EmptyLook(ref inst) => {
312 if self.input.is_empty_match(at, inst) {
313 ip = inst.goto;
314 }
315 }
316 Save(ref inst) => {
317 if inst.slot < thread_caps.len() {
318 self.stack.push(FollowEpsilon::Capture {
319 slot: inst.slot,
320 pos: thread_caps[inst.slot],
321 });
322 thread_caps[inst.slot] = Some(at.pos());
323 }
324 ip = inst.goto;
325 }
326 Split(ref inst) => {
327 self.stack.push(FollowEpsilon::IP(inst.goto2));
328 ip = inst.goto1;
329 }
330 Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
331 let t = &mut nlist.caps(ip);
332 for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
333 *slot = *val;
334 }
335 return;
336 }
337 }
338 }
339 }
340}
341
342impl Threads {
343 fn new() -> Self {
344 Threads { set: SparseSet::new(size:0), caps: vec![], slots_per_thread: 0 }
345 }
346
347 fn resize(&mut self, num_insts: usize, ncaps: usize) {
348 if num_insts == self.set.capacity() {
349 return;
350 }
351 self.slots_per_thread = ncaps * 2;
352 self.set = SparseSet::new(size:num_insts);
353 self.caps = vec![None; self.slots_per_thread * num_insts];
354 }
355
356 fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
357 let i: usize = pc * self.slots_per_thread;
358 &mut self.caps[i..i + self.slots_per_thread]
359 }
360}
361