1 | // This module implements the Pike VM. That is, it guarantees linear time |
2 | // search of a regex on any text with memory use proportional to the size of |
3 | // the regex. |
4 | // |
5 | // It is equal in power to the backtracking engine in this crate, except the |
6 | // backtracking engine is typically faster on small regexes/texts at the |
7 | // expense of a bigger memory footprint. |
8 | // |
9 | // It can do more than the DFA can (specifically, record capture locations |
10 | // and execute Unicode word boundary assertions), but at a slower speed. |
11 | // Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding |
12 | // epsilon transitions. That is, the Pike VM engine can be in multiple states |
13 | // at once where as the DFA is only ever in one state at a time. |
14 | // |
15 | // Therefore, the Pike VM is generally treated as the fallback when the other |
16 | // matching engines either aren't feasible to run or are insufficient. |
17 | |
18 | use std::mem; |
19 | |
20 | use crate::exec::ProgramCache; |
21 | use crate::input::{Input, InputAt}; |
22 | use crate::prog::{InstPtr, Program}; |
23 | use crate::re_trait::Slot; |
24 | use crate::sparse::SparseSet; |
25 | |
26 | /// An NFA simulation matching engine. |
27 | #[derive (Debug)] |
28 | pub struct Fsm<'r, I> { |
29 | /// The sequence of opcodes (among other things) that is actually executed. |
30 | /// |
31 | /// The program may be byte oriented or Unicode codepoint oriented. |
32 | prog: &'r Program, |
33 | /// An explicit stack used for following epsilon transitions. (This is |
34 | /// borrowed from the cache.) |
35 | stack: &'r mut Vec<FollowEpsilon>, |
36 | /// The input to search. |
37 | input: I, |
38 | } |
39 | |
40 | /// A cached allocation that can be reused on each execution. |
41 | #[derive (Clone, Debug)] |
42 | pub struct Cache { |
43 | /// A pair of ordered sets for tracking NFA states. |
44 | clist: Threads, |
45 | nlist: Threads, |
46 | /// An explicit stack used for following epsilon transitions. |
47 | stack: Vec<FollowEpsilon>, |
48 | } |
49 | |
50 | /// An ordered set of NFA states and their captures. |
51 | #[derive (Clone, Debug)] |
52 | struct Threads { |
53 | /// An ordered set of opcodes (each opcode is an NFA state). |
54 | set: SparseSet, |
55 | /// Captures for every NFA state. |
56 | /// |
57 | /// It is stored in row-major order, where the columns are the capture |
58 | /// slots and the rows are the states. |
59 | caps: Vec<Slot>, |
60 | /// The number of capture slots stored per thread. (Every capture has |
61 | /// two slots.) |
62 | slots_per_thread: usize, |
63 | } |
64 | |
65 | /// A representation of an explicit stack frame when following epsilon |
66 | /// transitions. This is used to avoid recursion. |
67 | #[derive (Clone, Debug)] |
68 | enum FollowEpsilon { |
69 | /// Follow transitions at the given instruction pointer. |
70 | IP(InstPtr), |
71 | /// Restore the capture slot with the given position in the input. |
72 | Capture { slot: usize, pos: Slot }, |
73 | } |
74 | |
75 | impl Cache { |
76 | /// Create a new allocation used by the NFA machine to record execution |
77 | /// and captures. |
78 | pub fn new(_prog: &Program) -> Self { |
79 | Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } |
80 | } |
81 | } |
82 | |
83 | impl<'r, I: Input> Fsm<'r, I> { |
84 | /// Execute the NFA matching engine. |
85 | /// |
86 | /// If there's a match, `exec` returns `true` and populates the given |
87 | /// captures accordingly. |
88 | pub fn exec( |
89 | prog: &'r Program, |
90 | cache: &ProgramCache, |
91 | matches: &mut [bool], |
92 | slots: &mut [Slot], |
93 | quit_after_match: bool, |
94 | input: I, |
95 | start: usize, |
96 | end: usize, |
97 | ) -> bool { |
98 | let mut cache = cache.borrow_mut(); |
99 | let cache = &mut cache.pikevm; |
100 | cache.clist.resize(prog.len(), prog.captures.len()); |
101 | cache.nlist.resize(prog.len(), prog.captures.len()); |
102 | let at = input.at(start); |
103 | Fsm { prog, stack: &mut cache.stack, input }.exec_( |
104 | &mut cache.clist, |
105 | &mut cache.nlist, |
106 | matches, |
107 | slots, |
108 | quit_after_match, |
109 | at, |
110 | end, |
111 | ) |
112 | } |
113 | |
114 | fn exec_( |
115 | &mut self, |
116 | mut clist: &mut Threads, |
117 | mut nlist: &mut Threads, |
118 | matches: &mut [bool], |
119 | slots: &mut [Slot], |
120 | quit_after_match: bool, |
121 | mut at: InputAt, |
122 | end: usize, |
123 | ) -> bool { |
124 | let mut matched = false; |
125 | let mut all_matched = false; |
126 | clist.set.clear(); |
127 | nlist.set.clear(); |
128 | 'LOOP: loop { |
129 | if clist.set.is_empty() { |
130 | // Three ways to bail out when our current set of threads is |
131 | // empty. |
132 | // |
133 | // 1. We have a match---so we're done exploring any possible |
134 | // alternatives. Time to quit. (We can't do this if we're |
135 | // looking for matches for multiple regexes, unless we know |
136 | // they all matched.) |
137 | // |
138 | // 2. If the expression starts with a '^' we can terminate as |
139 | // soon as the last thread dies. |
140 | if (matched && matches.len() <= 1) |
141 | || all_matched |
142 | || (!at.is_start() && self.prog.is_anchored_start) |
143 | { |
144 | break; |
145 | } |
146 | |
147 | // 3. If there's a literal prefix for the program, try to |
148 | // jump ahead quickly. If it can't be found, then we can |
149 | // bail out early. |
150 | if !self.prog.prefixes.is_empty() { |
151 | at = match self.input.prefix_at(&self.prog.prefixes, at) { |
152 | None => break, |
153 | Some(at) => at, |
154 | }; |
155 | } |
156 | } |
157 | |
158 | // This simulates a preceding '.*?' for every regex by adding |
159 | // a state starting at the current position in the input for the |
160 | // beginning of the program only if we don't already have a match. |
161 | if clist.set.is_empty() |
162 | || (!self.prog.is_anchored_start && !all_matched) |
163 | { |
164 | self.add(&mut clist, slots, 0, at); |
165 | } |
166 | // The previous call to "add" actually inspects the position just |
167 | // before the current character. For stepping through the machine, |
168 | // we can to look at the current character, so we advance the |
169 | // input. |
170 | let at_next = self.input.at(at.next_pos()); |
171 | for i in 0..clist.set.len() { |
172 | let ip = clist.set[i]; |
173 | if self.step( |
174 | &mut nlist, |
175 | matches, |
176 | slots, |
177 | clist.caps(ip), |
178 | ip, |
179 | at, |
180 | at_next, |
181 | ) { |
182 | matched = true; |
183 | all_matched = all_matched || matches.iter().all(|&b| b); |
184 | if quit_after_match { |
185 | // If we only care if a match occurs (not its |
186 | // position), then we can quit right now. |
187 | break 'LOOP; |
188 | } |
189 | if self.prog.matches.len() == 1 { |
190 | // We don't need to check the rest of the threads |
191 | // in this set because we've matched something |
192 | // ("leftmost-first"). However, we still need to check |
193 | // threads in the next set to support things like |
194 | // greedy matching. |
195 | // |
196 | // This is only true on normal regexes. For regex sets, |
197 | // we need to mush on to observe other matches. |
198 | break; |
199 | } |
200 | } |
201 | } |
202 | if at.pos() >= end { |
203 | break; |
204 | } |
205 | at = at_next; |
206 | mem::swap(clist, nlist); |
207 | nlist.set.clear(); |
208 | } |
209 | matched |
210 | } |
211 | |
212 | /// Step through the input, one token (byte or codepoint) at a time. |
213 | /// |
214 | /// nlist is the set of states that will be processed on the next token |
215 | /// in the input. |
216 | /// |
217 | /// caps is the set of captures passed by the caller of the NFA. They are |
218 | /// written to only when a match state is visited. |
219 | /// |
220 | /// thread_caps is the set of captures set for the current NFA state, ip. |
221 | /// |
222 | /// at and at_next are the current and next positions in the input. at or |
223 | /// at_next may be EOF. |
224 | fn step( |
225 | &mut self, |
226 | nlist: &mut Threads, |
227 | matches: &mut [bool], |
228 | slots: &mut [Slot], |
229 | thread_caps: &mut [Option<usize>], |
230 | ip: usize, |
231 | at: InputAt, |
232 | at_next: InputAt, |
233 | ) -> bool { |
234 | use crate::prog::Inst::*; |
235 | match self.prog[ip] { |
236 | Match(match_slot) => { |
237 | if match_slot < matches.len() { |
238 | matches[match_slot] = true; |
239 | } |
240 | for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { |
241 | *slot = *val; |
242 | } |
243 | true |
244 | } |
245 | Char(ref inst) => { |
246 | if inst.c == at.char() { |
247 | self.add(nlist, thread_caps, inst.goto, at_next); |
248 | } |
249 | false |
250 | } |
251 | Ranges(ref inst) => { |
252 | if inst.matches(at.char()) { |
253 | self.add(nlist, thread_caps, inst.goto, at_next); |
254 | } |
255 | false |
256 | } |
257 | Bytes(ref inst) => { |
258 | if let Some(b) = at.byte() { |
259 | if inst.matches(b) { |
260 | self.add(nlist, thread_caps, inst.goto, at_next); |
261 | } |
262 | } |
263 | false |
264 | } |
265 | EmptyLook(_) | Save(_) | Split(_) => false, |
266 | } |
267 | } |
268 | |
269 | /// Follows epsilon transitions and adds them for processing to nlist, |
270 | /// starting at and including ip. |
271 | fn add( |
272 | &mut self, |
273 | nlist: &mut Threads, |
274 | thread_caps: &mut [Option<usize>], |
275 | ip: usize, |
276 | at: InputAt, |
277 | ) { |
278 | self.stack.push(FollowEpsilon::IP(ip)); |
279 | while let Some(frame) = self.stack.pop() { |
280 | match frame { |
281 | FollowEpsilon::IP(ip) => { |
282 | self.add_step(nlist, thread_caps, ip, at); |
283 | } |
284 | FollowEpsilon::Capture { slot, pos } => { |
285 | thread_caps[slot] = pos; |
286 | } |
287 | } |
288 | } |
289 | } |
290 | |
291 | /// A helper function for add that avoids excessive pushing to the stack. |
292 | fn add_step( |
293 | &mut self, |
294 | nlist: &mut Threads, |
295 | thread_caps: &mut [Option<usize>], |
296 | mut ip: usize, |
297 | at: InputAt, |
298 | ) { |
299 | // Instead of pushing and popping to the stack, we mutate ip as we |
300 | // traverse the set of states. We only push to the stack when we |
301 | // absolutely need recursion (restoring captures or following a |
302 | // branch). |
303 | use crate::prog::Inst::*; |
304 | loop { |
305 | // Don't visit states we've already added. |
306 | if nlist.set.contains(ip) { |
307 | return; |
308 | } |
309 | nlist.set.insert(ip); |
310 | match self.prog[ip] { |
311 | EmptyLook(ref inst) => { |
312 | if self.input.is_empty_match(at, inst) { |
313 | ip = inst.goto; |
314 | } |
315 | } |
316 | Save(ref inst) => { |
317 | if inst.slot < thread_caps.len() { |
318 | self.stack.push(FollowEpsilon::Capture { |
319 | slot: inst.slot, |
320 | pos: thread_caps[inst.slot], |
321 | }); |
322 | thread_caps[inst.slot] = Some(at.pos()); |
323 | } |
324 | ip = inst.goto; |
325 | } |
326 | Split(ref inst) => { |
327 | self.stack.push(FollowEpsilon::IP(inst.goto2)); |
328 | ip = inst.goto1; |
329 | } |
330 | Match(_) | Char(_) | Ranges(_) | Bytes(_) => { |
331 | let t = &mut nlist.caps(ip); |
332 | for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { |
333 | *slot = *val; |
334 | } |
335 | return; |
336 | } |
337 | } |
338 | } |
339 | } |
340 | } |
341 | |
342 | impl Threads { |
343 | fn new() -> Self { |
344 | Threads { set: SparseSet::new(size:0), caps: vec![], slots_per_thread: 0 } |
345 | } |
346 | |
347 | fn resize(&mut self, num_insts: usize, ncaps: usize) { |
348 | if num_insts == self.set.capacity() { |
349 | return; |
350 | } |
351 | self.slots_per_thread = ncaps * 2; |
352 | self.set = SparseSet::new(size:num_insts); |
353 | self.caps = vec![None; self.slots_per_thread * num_insts]; |
354 | } |
355 | |
356 | fn caps(&mut self, pc: usize) -> &mut [Option<usize>] { |
357 | let i: usize = pc * self.slots_per_thread; |
358 | &mut self.caps[i..i + self.slots_per_thread] |
359 | } |
360 | } |
361 | |