pikevm.rs source code [crates/regex-1.8.4/src/pikevm.rs]

1	// This module implements the Pike VM. That is, it guarantees linear time
2	// search of a regex on any text with memory use proportional to the size of
3	// the regex.
4	//
5	// It is equal in power to the backtracking engine in this crate, except the
6	// backtracking engine is typically faster on small regexes/texts at the
7	// expense of a bigger memory footprint.
8	//
9	// It can do more than the DFA can (specifically, record capture locations
10	// and execute Unicode word boundary assertions), but at a slower speed.
11	// Specifically, the Pike VM executes a DFA implicitly by repeatedly expanding
12	// epsilon transitions. That is, the Pike VM engine can be in multiple states
13	// at once where as the DFA is only ever in one state at a time.
14	//
15	// Therefore, the Pike VM is generally treated as the fallback when the other
16	// matching engines either aren't feasible to run or are insufficient.
17
18	use std::mem;
19
20	use crate::exec::ProgramCache;
21	use crate::input::{Input, InputAt};
22	use crate::prog::{InstPtr, Program};
23	use crate::re_trait::Slot;
24	use crate::sparse::SparseSet;
25
26	/// An NFA simulation matching engine.
27	#[derive(Debug)]
28	pub struct Fsm<'r, I> {
29	/// The sequence of opcodes (among other things) that is actually executed.
30	///
31	/// The program may be byte oriented or Unicode codepoint oriented.
32	prog: &'r Program,
33	/// An explicit stack used for following epsilon transitions. (This is
34	/// borrowed from the cache.)
35	stack: &'r mut Vec<FollowEpsilon>,
36	/// The input to search.
37	input: I,
38	}
39
40	/// A cached allocation that can be reused on each execution.
41	#[derive(Clone, Debug)]
42	pub struct Cache {
43	/// A pair of ordered sets for tracking NFA states.
44	clist: Threads,
45	nlist: Threads,
46	/// An explicit stack used for following epsilon transitions.
47	stack: Vec<FollowEpsilon>,
48	}
49
50	/// An ordered set of NFA states and their captures.
51	#[derive(Clone, Debug)]
52	struct Threads {
53	/// An ordered set of opcodes (each opcode is an NFA state).
54	set: SparseSet,
55	/// Captures for every NFA state.
56	///
57	/// It is stored in row-major order, where the columns are the capture
58	/// slots and the rows are the states.
59	caps: Vec<Slot>,
60	/// The number of capture slots stored per thread. (Every capture has
61	/// two slots.)
62	slots_per_thread: usize,
63	}
64
65	/// A representation of an explicit stack frame when following epsilon
66	/// transitions. This is used to avoid recursion.
67	#[derive(Clone, Debug)]
68	enum FollowEpsilon {
69	/// Follow transitions at the given instruction pointer.
70	IP(InstPtr),
71	/// Restore the capture slot with the given position in the input.
72	Capture { slot: usize, pos: Slot },
73	}
74
75	impl Cache {
76	/// Create a new allocation used by the NFA machine to record execution
77	/// and captures.
78	pub fn new(_prog: &Program) -> Self {
79	Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
80	}
81	}
82
83	impl<'r, I: Input> Fsm<'r, I> {
84	/// Execute the NFA matching engine.
85	///
86	/// If there's a match, `exec` returns `true` and populates the given
87	/// captures accordingly.
88	pub fn exec(
89	prog: &'r Program,
90	cache: &ProgramCache,
91	matches: &mut [bool],
92	slots: &mut [Slot],
93	quit_after_match: bool,
94	input: I,
95	start: usize,
96	end: usize,
97	) -> bool {
98	let mut cache = cache.borrow_mut();
99	let cache = &mut cache.pikevm;
100	cache.clist.resize(prog.len(), prog.captures.len());
101	cache.nlist.resize(prog.len(), prog.captures.len());
102	let at = input.at(start);
103	Fsm { prog, stack: &mut cache.stack, input }.exec_(
104	&mut cache.clist,
105	&mut cache.nlist,
106	matches,
107	slots,
108	quit_after_match,
109	at,
110	end,
111	)
112	}
113
114	fn exec_(
115	&mut self,
116	mut clist: &mut Threads,
117	mut nlist: &mut Threads,
118	matches: &mut [bool],
119	slots: &mut [Slot],
120	quit_after_match: bool,
121	mut at: InputAt,
122	end: usize,
123	) -> bool {
124	let mut matched = `false`;
125	let mut all_matched = `false`;
126	clist.set.clear();
127	nlist.set.clear();
128	'LOOP: loop {
129	if clist.set.is_empty() {
130	// Three ways to bail out when our current set of threads is
131	// empty.
132	//
133	// 1. We have a match---so we're done exploring any possible
134	// alternatives. Time to quit. (We can't do this if we're
135	// looking for matches for multiple regexes, unless we know
136	// they all matched.)
137	//
138	// 2. If the expression starts with a '^' we can terminate as
139	// soon as the last thread dies.
140	if (matched && matches.len() <= `1`)
141	\|\| all_matched
142	\|\| (!at.is_start() && self.prog.is_anchored_start)
143	{
144	break;
145	}
146
147	// 3. If there's a literal prefix for the program, try to
148	// jump ahead quickly. If it can't be found, then we can
149	// bail out early.
150	if !self.prog.prefixes.is_empty() {
151	at = match self.input.prefix_at(&self.prog.prefixes, at) {
152	None => break,
153	Some(at) => at,
154	};
155	}
156	}
157
158	// This simulates a preceding '.?' for every regex by adding*
159	// a state starting at the current position in the input for the
160	// beginning of the program only if we don't already have a match.
161	if clist.set.is_empty()
162	\|\| (!self.prog.is_anchored_start && !all_matched)
163	{
164	self.add(&mut clist, slots, `0`, at);
165	}
166	// The previous call to "add" actually inspects the position just
167	// before the current character. For stepping through the machine,
168	// we can to look at the current character, so we advance the
169	// input.
170	let at_next = self.input.at(at.next_pos());
171	for i in `0`..clist.set.len() {
172	let ip = clist.set[i];
173	if self.step(
174	&mut nlist,
175	matches,
176	slots,
177	clist.caps(ip),
178	ip,
179	at,
180	at_next,
181	) {
182	matched = `true`;
183	all_matched = all_matched \|\| matches.iter().all(\|&b\| b);
184	if quit_after_match {
185	// If we only care if a match occurs (not its
186	// position), then we can quit right now.
187	break 'LOOP;
188	}
189	if self.prog.matches.len() == `1` {
190	// We don't need to check the rest of the threads
191	// in this set because we've matched something
192	// ("leftmost-first"). However, we still need to check
193	// threads in the next set to support things like
194	// greedy matching.
195	//
196	// This is only true on normal regexes. For regex sets,
197	// we need to mush on to observe other matches.
198	break;
199	}
200	}
201	}
202	if at.pos() >= end {
203	break;
204	}
205	at = at_next;
206	mem::swap(clist, nlist);
207	nlist.set.clear();
208	}
209	matched
210	}
211
212	/// Step through the input, one token (byte or codepoint) at a time.
213	///
214	/// nlist is the set of states that will be processed on the next token
215	/// in the input.
216	///
217	/// caps is the set of captures passed by the caller of the NFA. They are
218	/// written to only when a match state is visited.
219	///
220	/// thread_caps is the set of captures set for the current NFA state, ip.
221	///
222	/// at and at_next are the current and next positions in the input. at or
223	/// at_next may be EOF.
224	fn step(
225	&mut self,
226	nlist: &mut Threads,
227	matches: &mut [bool],
228	slots: &mut [Slot],
229	thread_caps: &mut [Option<usize>],
230	ip: usize,
231	at: InputAt,
232	at_next: InputAt,
233	) -> bool {
234	use crate::prog::Inst::*;
235	match self.prog[ip] {
236	Match(match_slot) => {
237	if match_slot < matches.len() {
238	matches[match_slot] = `true`;
239	}
240	for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
241	slot = val;
242	}
243	`true`
244	}
245	Char(ref inst) => {
246	if inst.c == at.char() {
247	self.add(nlist, thread_caps, inst.goto, at_next);
248	}
249	`false`
250	}
251	Ranges(ref inst) => {
252	if inst.matches(at.char()) {
253	self.add(nlist, thread_caps, inst.goto, at_next);
254	}
255	`false`
256	}
257	Bytes(ref inst) => {
258	if let Some(b) = at.byte() {
259	if inst.matches(b) {
260	self.add(nlist, thread_caps, inst.goto, at_next);
261	}
262	}
263	`false`
264	}
265	EmptyLook(_) \| Save(_) \| Split(_) => `false`,
266	}
267	}
268
269	/// Follows epsilon transitions and adds them for processing to nlist,
270	/// starting at and including ip.
271	fn add(
272	&mut self,
273	nlist: &mut Threads,
274	thread_caps: &mut [Option<usize>],
275	ip: usize,
276	at: InputAt,
277	) {
278	self.stack.push(FollowEpsilon::IP(ip));
279	while let Some(frame) = self.stack.pop() {
280	match frame {
281	FollowEpsilon::IP(ip) => {
282	self.add_step(nlist, thread_caps, ip, at);
283	}
284	FollowEpsilon::Capture { slot, pos } => {
285	thread_caps[slot] = pos;
286	}
287	}
288	}
289	}
290
291	/// A helper function for add that avoids excessive pushing to the stack.
292	fn add_step(
293	&mut self,
294	nlist: &mut Threads,
295	thread_caps: &mut [Option<usize>],
296	mut ip: usize,
297	at: InputAt,
298	) {
299	// Instead of pushing and popping to the stack, we mutate ip as we
300	// traverse the set of states. We only push to the stack when we
301	// absolutely need recursion (restoring captures or following a
302	// branch).
303	use crate::prog::Inst::*;
304	loop {
305	// Don't visit states we've already added.
306	if nlist.set.contains(ip) {
307	return;
308	}
309	nlist.set.insert(ip);
310	match self.prog[ip] {
311	EmptyLook(ref inst) => {
312	if self.input.is_empty_match(at, inst) {
313	ip = inst.goto;
314	}
315	}
316	Save(ref inst) => {
317	if inst.slot < thread_caps.len() {
318	self.stack.push(FollowEpsilon::Capture {
319	slot: inst.slot,
320	pos: thread_caps[inst.slot],
321	});
322	thread_caps[inst.slot] = Some(at.pos());
323	}
324	ip = inst.goto;
325	}
326	Split(ref inst) => {
327	self.stack.push(FollowEpsilon::IP(inst.goto2));
328	ip = inst.goto1;
329	}
330	Match(_) \| Char(_) \| Ranges(_) \| Bytes(_) => {
331	let t = &mut nlist.caps(ip);
332	for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
333	slot = val;
334	}
335	return;
336	}
337	}
338	}
339	}
340	}
341
342	impl Threads {
343	fn new() -> Self {
344	Threads { set: SparseSet::new(size:`0`), caps: vec![], slots_per_thread: `0` }
345	}
346
347	fn resize(&mut self, num_insts: usize, ncaps: usize) {
348	if num_insts == self.set.capacity() {
349	return;
350	}
351	self.slots_per_thread = ncaps * `2`;
352	self.set = SparseSet::new(size:num_insts);
353	self.caps = vec![None; self.slots_per_thread * num_insts];
354	}
355
356	fn caps(&mut self, pc: usize) -> &mut [Option<usize>] {
357	let i: usize = pc * self.slots_per_thread;
358	&mut self.caps[i..i + self.slots_per_thread]
359	}
360	}
361