1 | // This is the backtracking matching engine. It has the same exact capability |
2 | // as the full NFA simulation, except it is artificially restricted to small |
3 | // regexes on small inputs because of its memory requirements. |
4 | // |
5 | // In particular, this is a *bounded* backtracking engine. It retains worst |
6 | // case linear time by keeping track of the states that it has visited (using a |
7 | // bitmap). Namely, once a state is visited, it is never visited again. Since a |
8 | // state is keyed by `(instruction index, input index)`, we have that its time |
9 | // complexity is `O(mn)` (i.e., linear in the size of the search text). |
10 | // |
11 | // The backtracking engine can beat out the NFA simulation on small |
12 | // regexes/inputs because it doesn't have to keep track of multiple copies of |
13 | // the capture groups. In benchmarks, the backtracking engine is roughly twice |
14 | // as fast as the full NFA simulation. Note though that its performance doesn't |
15 | // scale, even if you're willing to live with the memory requirements. Namely, |
16 | // the bitset has to be zeroed on each execution, which becomes quite expensive |
17 | // on large bitsets. |
18 | |
19 | use crate::exec::ProgramCache; |
20 | use crate::input::{Input, InputAt}; |
21 | use crate::prog::{InstPtr, Program}; |
22 | use crate::re_trait::Slot; |
23 | |
24 | type Bits = u32; |
25 | |
26 | const BIT_SIZE: usize = 32; |
27 | const MAX_SIZE_BYTES: usize = 256 * (1 << 10); // 256 KB |
28 | |
29 | /// Returns true iff the given regex and input should be executed by this |
30 | /// engine with reasonable memory usage. |
31 | pub fn should_exec(num_insts: usize, text_len: usize) -> bool { |
32 | // Total memory usage in bytes is determined by: |
33 | // |
34 | // ((len(insts) * (len(input) + 1) + bits - 1) / bits) * (size_of(u32)) |
35 | // |
36 | // The actual limit picked is pretty much a heuristic. |
37 | // See: https://github.com/rust-lang/regex/issues/215 |
38 | let size: usize = ((num_insts * (text_len + 1) + BIT_SIZE - 1) / BIT_SIZE) * 4; |
39 | size <= MAX_SIZE_BYTES |
40 | } |
41 | |
42 | /// A backtracking matching engine. |
43 | #[derive (Debug)] |
44 | pub struct Bounded<'a, 'm, 'r, 's, I> { |
45 | prog: &'r Program, |
46 | input: I, |
47 | matches: &'m mut [bool], |
48 | slots: &'s mut [Slot], |
49 | m: &'a mut Cache, |
50 | } |
51 | |
52 | /// Shared cached state between multiple invocations of a backtracking engine |
53 | /// in the same thread. |
54 | #[derive (Clone, Debug)] |
55 | pub struct Cache { |
56 | jobs: Vec<Job>, |
57 | visited: Vec<Bits>, |
58 | } |
59 | |
60 | impl Cache { |
61 | /// Create new empty cache for the backtracking engine. |
62 | pub fn new(_prog: &Program) -> Self { |
63 | Cache { jobs: vec![], visited: vec![] } |
64 | } |
65 | } |
66 | |
67 | /// A job is an explicit unit of stack space in the backtracking engine. |
68 | /// |
69 | /// The "normal" representation is a single state transition, which corresponds |
70 | /// to an NFA state and a character in the input. However, the backtracking |
71 | /// engine must keep track of old capture group values. We use the explicit |
72 | /// stack to do it. |
73 | #[derive (Clone, Copy, Debug)] |
74 | enum Job { |
75 | Inst { ip: InstPtr, at: InputAt }, |
76 | SaveRestore { slot: usize, old_pos: Option<usize> }, |
77 | } |
78 | |
79 | impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { |
80 | /// Execute the backtracking matching engine. |
81 | /// |
82 | /// If there's a match, `exec` returns `true` and populates the given |
83 | /// captures accordingly. |
84 | pub fn exec( |
85 | prog: &'r Program, |
86 | cache: &ProgramCache, |
87 | matches: &'m mut [bool], |
88 | slots: &'s mut [Slot], |
89 | input: I, |
90 | start: usize, |
91 | end: usize, |
92 | ) -> bool { |
93 | let mut cache = cache.borrow_mut(); |
94 | let cache = &mut cache.backtrack; |
95 | let start = input.at(start); |
96 | let mut b = Bounded { prog, input, matches, slots, m: cache }; |
97 | b.exec_(start, end) |
98 | } |
99 | |
100 | /// Clears the cache such that the backtracking engine can be executed |
101 | /// on some input of fixed length. |
102 | fn clear(&mut self) { |
103 | // Reset the job memory so that we start fresh. |
104 | self.m.jobs.clear(); |
105 | |
106 | // Now we need to clear the bit state set. |
107 | // We do this by figuring out how much space we need to keep track |
108 | // of the states we've visited. |
109 | // Then we reset all existing allocated space to 0. |
110 | // Finally, we request more space if we need it. |
111 | // |
112 | // This is all a little circuitous, but doing this using unchecked |
113 | // operations doesn't seem to have a measurable impact on performance. |
114 | // (Probably because backtracking is limited to such small |
115 | // inputs/regexes in the first place.) |
116 | let visited_len = |
117 | (self.prog.len() * (self.input.len() + 1) + BIT_SIZE - 1) |
118 | / BIT_SIZE; |
119 | self.m.visited.truncate(visited_len); |
120 | for v in &mut self.m.visited { |
121 | *v = 0; |
122 | } |
123 | if visited_len > self.m.visited.len() { |
124 | let len = self.m.visited.len(); |
125 | self.m.visited.reserve_exact(visited_len - len); |
126 | for _ in 0..(visited_len - len) { |
127 | self.m.visited.push(0); |
128 | } |
129 | } |
130 | } |
131 | |
132 | /// Start backtracking at the given position in the input, but also look |
133 | /// for literal prefixes. |
134 | fn exec_(&mut self, mut at: InputAt, end: usize) -> bool { |
135 | self.clear(); |
136 | // If this is an anchored regex at the beginning of the input, then |
137 | // we're either already done or we only need to try backtracking once. |
138 | if self.prog.is_anchored_start { |
139 | return if !at.is_start() { false } else { self.backtrack(at) }; |
140 | } |
141 | let mut matched = false; |
142 | loop { |
143 | if !self.prog.prefixes.is_empty() { |
144 | at = match self.input.prefix_at(&self.prog.prefixes, at) { |
145 | None => break, |
146 | Some(at) => at, |
147 | }; |
148 | } |
149 | matched = self.backtrack(at) || matched; |
150 | if matched && self.prog.matches.len() == 1 { |
151 | return true; |
152 | } |
153 | if at.pos() >= end { |
154 | break; |
155 | } |
156 | at = self.input.at(at.next_pos()); |
157 | } |
158 | matched |
159 | } |
160 | |
161 | /// The main backtracking loop starting at the given input position. |
162 | fn backtrack(&mut self, start: InputAt) -> bool { |
163 | // N.B. We use an explicit stack to avoid recursion. |
164 | // To avoid excessive pushing and popping, most transitions are handled |
165 | // in the `step` helper function, which only pushes to the stack when |
166 | // there's a capture or a branch. |
167 | let mut matched = false; |
168 | self.m.jobs.push(Job::Inst { ip: 0, at: start }); |
169 | while let Some(job) = self.m.jobs.pop() { |
170 | match job { |
171 | Job::Inst { ip, at } => { |
172 | if self.step(ip, at) { |
173 | // Only quit if we're matching one regex. |
174 | // If we're matching a regex set, then mush on and |
175 | // try to find other matches (if we want them). |
176 | if self.prog.matches.len() == 1 { |
177 | return true; |
178 | } |
179 | matched = true; |
180 | } |
181 | } |
182 | Job::SaveRestore { slot, old_pos } => { |
183 | if slot < self.slots.len() { |
184 | self.slots[slot] = old_pos; |
185 | } |
186 | } |
187 | } |
188 | } |
189 | matched |
190 | } |
191 | |
192 | fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { |
193 | use crate::prog::Inst::*; |
194 | loop { |
195 | // This loop is an optimization to avoid constantly pushing/popping |
196 | // from the stack. Namely, if we're pushing a job only to run it |
197 | // next, avoid the push and just mutate `ip` (and possibly `at`) |
198 | // in place. |
199 | if self.has_visited(ip, at) { |
200 | return false; |
201 | } |
202 | match self.prog[ip] { |
203 | Match(slot) => { |
204 | if slot < self.matches.len() { |
205 | self.matches[slot] = true; |
206 | } |
207 | return true; |
208 | } |
209 | Save(ref inst) => { |
210 | if let Some(&old_pos) = self.slots.get(inst.slot) { |
211 | // If this path doesn't work out, then we save the old |
212 | // capture index (if one exists) in an alternate |
213 | // job. If the next path fails, then the alternate |
214 | // job is popped and the old capture index is restored. |
215 | self.m.jobs.push(Job::SaveRestore { |
216 | slot: inst.slot, |
217 | old_pos, |
218 | }); |
219 | self.slots[inst.slot] = Some(at.pos()); |
220 | } |
221 | ip = inst.goto; |
222 | } |
223 | Split(ref inst) => { |
224 | self.m.jobs.push(Job::Inst { ip: inst.goto2, at }); |
225 | ip = inst.goto1; |
226 | } |
227 | EmptyLook(ref inst) => { |
228 | if self.input.is_empty_match(at, inst) { |
229 | ip = inst.goto; |
230 | } else { |
231 | return false; |
232 | } |
233 | } |
234 | Char(ref inst) => { |
235 | if inst.c == at.char() { |
236 | ip = inst.goto; |
237 | at = self.input.at(at.next_pos()); |
238 | } else { |
239 | return false; |
240 | } |
241 | } |
242 | Ranges(ref inst) => { |
243 | if inst.matches(at.char()) { |
244 | ip = inst.goto; |
245 | at = self.input.at(at.next_pos()); |
246 | } else { |
247 | return false; |
248 | } |
249 | } |
250 | Bytes(ref inst) => { |
251 | if let Some(b) = at.byte() { |
252 | if inst.matches(b) { |
253 | ip = inst.goto; |
254 | at = self.input.at(at.next_pos()); |
255 | continue; |
256 | } |
257 | } |
258 | return false; |
259 | } |
260 | } |
261 | } |
262 | } |
263 | |
264 | fn has_visited(&mut self, ip: InstPtr, at: InputAt) -> bool { |
265 | let k = ip * (self.input.len() + 1) + at.pos(); |
266 | let k1 = k / BIT_SIZE; |
267 | let k2 = usize_to_u32(1 << (k & (BIT_SIZE - 1))); |
268 | if self.m.visited[k1] & k2 == 0 { |
269 | self.m.visited[k1] |= k2; |
270 | false |
271 | } else { |
272 | true |
273 | } |
274 | } |
275 | } |
276 | |
277 | fn usize_to_u32(n: usize) -> u32 { |
278 | if (n as u64) > (::std::u32::MAX as u64) { |
279 | panic!("BUG: {} is too big to fit into u32" , n) |
280 | } |
281 | n as u32 |
282 | } |
283 | |