backtrack.rs source code [crates/regex-1.8.4/src/backtrack.rs]

1	// This is the backtracking matching engine. It has the same exact capability
2	// as the full NFA simulation, except it is artificially restricted to small
3	// regexes on small inputs because of its memory requirements.
4	//
5	// In particular, this is a bounded* backtracking engine. It retains worst*
6	// case linear time by keeping track of the states that it has visited (using a
7	// bitmap). Namely, once a state is visited, it is never visited again. Since a
8	// state is keyed by `(instruction index, input index)`, we have that its time
9	// complexity is `O(mn)` (i.e., linear in the size of the search text).
10	//
11	// The backtracking engine can beat out the NFA simulation on small
12	// regexes/inputs because it doesn't have to keep track of multiple copies of
13	// the capture groups. In benchmarks, the backtracking engine is roughly twice
14	// as fast as the full NFA simulation. Note though that its performance doesn't
15	// scale, even if you're willing to live with the memory requirements. Namely,
16	// the bitset has to be zeroed on each execution, which becomes quite expensive
17	// on large bitsets.
18
19	use crate::exec::ProgramCache;
20	use crate::input::{Input, InputAt};
21	use crate::prog::{InstPtr, Program};
22	use crate::re_trait::Slot;
23
24	type Bits = u32;
25
26	const BIT_SIZE: usize = `32`;
27	const MAX_SIZE_BYTES: usize = `256` * (`1` << `10`); // 256 KB
28
29	/// Returns true iff the given regex and input should be executed by this
30	/// engine with reasonable memory usage.
31	pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
32	// Total memory usage in bytes is determined by:
33	//
34	// ((len(insts) (len(input) + 1) + bits - 1) / bits) * (size_of(u32))*
35	//
36	// The actual limit picked is pretty much a heuristic.
37	// See: https://github.com/rust-lang/regex/issues/215
38	let size: usize = ((num_insts * (text_len + `1`) + BIT_SIZE - `1`) / BIT_SIZE) * `4`;
39	size <= MAX_SIZE_BYTES
40	}
41
42	/// A backtracking matching engine.
43	#[derive(Debug)]
44	pub struct Bounded<'a, 'm, 'r, 's, I> {
45	prog: &'r Program,
46	input: I,
47	matches: &'m mut [bool],
48	slots: &'s mut [Slot],
49	m: &'a mut Cache,
50	}
51
52	/// Shared cached state between multiple invocations of a backtracking engine
53	/// in the same thread.
54	#[derive(Clone, Debug)]
55	pub struct Cache {
56	jobs: Vec<Job>,
57	visited: Vec<Bits>,
58	}
59
60	impl Cache {
61	/// Create new empty cache for the backtracking engine.
62	pub fn new(_prog: &Program) -> Self {
63	Cache { jobs: vec![], visited: vec![] }
64	}
65	}
66
67	/// A job is an explicit unit of stack space in the backtracking engine.
68	///
69	/// The "normal" representation is a single state transition, which corresponds
70	/// to an NFA state and a character in the input. However, the backtracking
71	/// engine must keep track of old capture group values. We use the explicit
72	/// stack to do it.
73	#[derive(Clone, Copy, Debug)]
74	enum Job {
75	Inst { ip: InstPtr, at: InputAt },
76	SaveRestore { slot: usize, old_pos: Option<usize> },
77	}
78
79	impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
80	/// Execute the backtracking matching engine.
81	///
82	/// If there's a match, `exec` returns `true` and populates the given
83	/// captures accordingly.
84	pub fn exec(
85	prog: &'r Program,
86	cache: &ProgramCache,
87	matches: &'m mut [bool],
88	slots: &'s mut [Slot],
89	input: I,
90	start: usize,
91	end: usize,
92	) -> bool {
93	let mut cache = cache.borrow_mut();
94	let cache = &mut cache.backtrack;
95	let start = input.at(start);
96	let mut b = Bounded { prog, input, matches, slots, m: cache };
97	b.exec_(start, end)
98	}
99
100	/// Clears the cache such that the backtracking engine can be executed
101	/// on some input of fixed length.
102	fn clear(&mut self) {
103	// Reset the job memory so that we start fresh.
104	self.m.jobs.clear();
105
106	// Now we need to clear the bit state set.
107	// We do this by figuring out how much space we need to keep track
108	// of the states we've visited.
109	// Then we reset all existing allocated space to 0.
110	// Finally, we request more space if we need it.
111	//
112	// This is all a little circuitous, but doing this using unchecked
113	// operations doesn't seem to have a measurable impact on performance.
114	// (Probably because backtracking is limited to such small
115	// inputs/regexes in the first place.)
116	let visited_len =
117	(self.prog.len() * (self.input.len() + `1`) + BIT_SIZE - `1`)
118	/ BIT_SIZE;
119	self.m.visited.truncate(visited_len);
120	for v in &mut self.m.visited {
121	*v = `0`;
122	}
123	if visited_len > self.m.visited.len() {
124	let len = self.m.visited.len();
125	self.m.visited.reserve_exact(visited_len - len);
126	for _ in `0`..(visited_len - len) {
127	self.m.visited.push(`0`);
128	}
129	}
130	}
131
132	/// Start backtracking at the given position in the input, but also look
133	/// for literal prefixes.
134	fn exec_(&mut self, mut at: InputAt, end: usize) -> bool {
135	self.clear();
136	// If this is an anchored regex at the beginning of the input, then
137	// we're either already done or we only need to try backtracking once.
138	if self.prog.is_anchored_start {
139	return if !at.is_start() { `false` } else { self.backtrack(at) };
140	}
141	let mut matched = `false`;
142	loop {
143	if !self.prog.prefixes.is_empty() {
144	at = match self.input.prefix_at(&self.prog.prefixes, at) {
145	None => break,
146	Some(at) => at,
147	};
148	}
149	matched = self.backtrack(at) \|\| matched;
150	if matched && self.prog.matches.len() == `1` {
151	return `true`;
152	}
153	if at.pos() >= end {
154	break;
155	}
156	at = self.input.at(at.next_pos());
157	}
158	matched
159	}
160
161	/// The main backtracking loop starting at the given input position.
162	fn backtrack(&mut self, start: InputAt) -> bool {
163	// N.B. We use an explicit stack to avoid recursion.
164	// To avoid excessive pushing and popping, most transitions are handled
165	// in the `step` helper function, which only pushes to the stack when
166	// there's a capture or a branch.
167	let mut matched = `false`;
168	self.m.jobs.push(Job::Inst { ip: `0`, at: start });
169	while let Some(job) = self.m.jobs.pop() {
170	match job {
171	Job::Inst { ip, at } => {
172	if self.step(ip, at) {
173	// Only quit if we're matching one regex.
174	// If we're matching a regex set, then mush on and
175	// try to find other matches (if we want them).
176	if self.prog.matches.len() == `1` {
177	return `true`;
178	}
179	matched = `true`;
180	}
181	}
182	Job::SaveRestore { slot, old_pos } => {
183	if slot < self.slots.len() {
184	self.slots[slot] = old_pos;
185	}
186	}
187	}
188	}
189	matched
190	}
191
192	fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
193	use crate::prog::Inst::*;
194	loop {
195	// This loop is an optimization to avoid constantly pushing/popping
196	// from the stack. Namely, if we're pushing a job only to run it
197	// next, avoid the push and just mutate `ip` (and possibly `at`)
198	// in place.
199	if self.has_visited(ip, at) {
200	return `false`;
201	}
202	match self.prog[ip] {
203	Match(slot) => {
204	if slot < self.matches.len() {
205	self.matches[slot] = `true`;
206	}
207	return `true`;
208	}
209	Save(ref inst) => {
210	if let Some(&old_pos) = self.slots.get(inst.slot) {
211	// If this path doesn't work out, then we save the old
212	// capture index (if one exists) in an alternate
213	// job. If the next path fails, then the alternate
214	// job is popped and the old capture index is restored.
215	self.m.jobs.push(Job::SaveRestore {
216	slot: inst.slot,
217	old_pos,
218	});
219	self.slots[inst.slot] = Some(at.pos());
220	}
221	ip = inst.goto;
222	}
223	Split(ref inst) => {
224	self.m.jobs.push(Job::Inst { ip: inst.goto2, at });
225	ip = inst.goto1;
226	}
227	EmptyLook(ref inst) => {
228	if self.input.is_empty_match(at, inst) {
229	ip = inst.goto;
230	} else {
231	return `false`;
232	}
233	}
234	Char(ref inst) => {
235	if inst.c == at.char() {
236	ip = inst.goto;
237	at = self.input.at(at.next_pos());
238	} else {
239	return `false`;
240	}
241	}
242	Ranges(ref inst) => {
243	if inst.matches(at.char()) {
244	ip = inst.goto;
245	at = self.input.at(at.next_pos());
246	} else {
247	return `false`;
248	}
249	}
250	Bytes(ref inst) => {
251	if let Some(b) = at.byte() {
252	if inst.matches(b) {
253	ip = inst.goto;
254	at = self.input.at(at.next_pos());
255	continue;
256	}
257	}
258	return `false`;
259	}
260	}
261	}
262	}
263
264	fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool {
265	let k = ip * (self.input.len() + `1`) + at.pos();
266	let k1 = k / BIT_SIZE;
267	let k2 = usize_to_u32(`1` << (k & (BIT_SIZE - `1`)));
268	if self.m.visited[k1] & k2 == `0` {
269	self.m.visited[k1] \|= k2;
270	`false`
271	} else {
272	`true`
273	}
274	}
275	}
276
277	fn usize_to_u32(n: usize) -> u32 {
278	if (n as u64) > (::std::u32::MAX as u64) {
279	panic!("BUG: {} is too big to fit into u32", n)
280	}
281	n as u32
282	}
283