1 | use std::cmp::Ordering; |
2 | use std::collections::HashMap; |
3 | use std::fmt; |
4 | use std::mem; |
5 | use std::ops::Deref; |
6 | use std::slice; |
7 | use std::sync::Arc; |
8 | |
9 | use crate::input::Char; |
10 | use crate::literal::LiteralSearcher; |
11 | |
12 | /// `InstPtr` represents the index of an instruction in a regex program. |
13 | pub type InstPtr = usize; |
14 | |
15 | /// Program is a sequence of instructions and various facts about thos |
16 | /// instructions. |
17 | #[derive (Clone)] |
18 | pub struct Program { |
19 | /// A sequence of instructions that represents an NFA. |
20 | pub insts: Vec<Inst>, |
21 | /// Pointers to each Match instruction in the sequence. |
22 | /// |
23 | /// This is always length 1 unless this program represents a regex set. |
24 | pub matches: Vec<InstPtr>, |
25 | /// The ordered sequence of all capture groups extracted from the AST. |
26 | /// Unnamed groups are `None`. |
27 | pub captures: Vec<Option<String>>, |
28 | /// Pointers to all named capture groups into `captures`. |
29 | pub capture_name_idx: Arc<HashMap<String, usize>>, |
30 | /// If the number of capture groups is the same for all possible matches, |
31 | /// then this is that number. |
32 | pub static_captures_len: Option<usize>, |
33 | /// A pointer to the start instruction. This can vary depending on how |
34 | /// the program was compiled. For example, programs for use with the DFA |
35 | /// engine have a `.*?` inserted at the beginning of unanchored regular |
36 | /// expressions. The actual starting point of the program is after the |
37 | /// `.*?`. |
38 | pub start: InstPtr, |
39 | /// A set of equivalence classes for discriminating bytes in the compiled |
40 | /// program. |
41 | pub byte_classes: Vec<u8>, |
42 | /// When true, this program can only match valid UTF-8. |
43 | pub only_utf8: bool, |
44 | /// When true, this program uses byte range instructions instead of Unicode |
45 | /// range instructions. |
46 | pub is_bytes: bool, |
47 | /// When true, the program is compiled for DFA matching. For example, this |
48 | /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored |
49 | /// regexes. |
50 | pub is_dfa: bool, |
51 | /// When true, the program matches text in reverse (for use only in the |
52 | /// DFA). |
53 | pub is_reverse: bool, |
54 | /// Whether the regex must match from the start of the input. |
55 | pub is_anchored_start: bool, |
56 | /// Whether the regex must match at the end of the input. |
57 | pub is_anchored_end: bool, |
58 | /// Whether this program contains a Unicode word boundary instruction. |
59 | pub has_unicode_word_boundary: bool, |
60 | /// A possibly empty machine for very quickly matching prefix literals. |
61 | pub prefixes: LiteralSearcher, |
62 | /// A limit on the size of the cache that the DFA is allowed to use while |
63 | /// matching. |
64 | /// |
65 | /// The cache limit specifies approximately how much space we're willing to |
66 | /// give to the state cache. Once the state cache exceeds the size, it is |
67 | /// wiped and all states must be re-computed. |
68 | /// |
69 | /// Note that this value does not impact correctness. It can be set to 0 |
70 | /// and the DFA will run just fine. (It will only ever store exactly one |
71 | /// state in the cache, and will likely run very slowly, but it will work.) |
72 | /// |
73 | /// Also note that this limit is *per thread of execution*. That is, |
74 | /// if the same regex is used to search text across multiple threads |
75 | /// simultaneously, then the DFA cache is not shared. Instead, copies are |
76 | /// made. |
77 | pub dfa_size_limit: usize, |
78 | } |
79 | |
80 | impl Program { |
81 | /// Creates an empty instruction sequence. Fields are given default |
82 | /// values. |
83 | pub fn new() -> Self { |
84 | Program { |
85 | insts: vec![], |
86 | matches: vec![], |
87 | captures: vec![], |
88 | capture_name_idx: Arc::new(HashMap::new()), |
89 | static_captures_len: None, |
90 | start: 0, |
91 | byte_classes: vec![0; 256], |
92 | only_utf8: true, |
93 | is_bytes: false, |
94 | is_dfa: false, |
95 | is_reverse: false, |
96 | is_anchored_start: false, |
97 | is_anchored_end: false, |
98 | has_unicode_word_boundary: false, |
99 | prefixes: LiteralSearcher::empty(), |
100 | dfa_size_limit: 2 * (1 << 20), |
101 | } |
102 | } |
103 | |
104 | /// If pc is an index to a no-op instruction (like Save), then return the |
105 | /// next pc that is not a no-op instruction. |
106 | pub fn skip(&self, mut pc: usize) -> usize { |
107 | loop { |
108 | match self[pc] { |
109 | Inst::Save(ref i) => pc = i.goto, |
110 | _ => return pc, |
111 | } |
112 | } |
113 | } |
114 | |
115 | /// Return true if and only if an execution engine at instruction `pc` will |
116 | /// always lead to a match. |
117 | pub fn leads_to_match(&self, pc: usize) -> bool { |
118 | if self.matches.len() > 1 { |
119 | // If we have a regex set, then we have more than one ending |
120 | // state, so leading to one of those states is generally |
121 | // meaningless. |
122 | return false; |
123 | } |
124 | match self[self.skip(pc)] { |
125 | Inst::Match(_) => true, |
126 | _ => false, |
127 | } |
128 | } |
129 | |
130 | /// Returns true if the current configuration demands that an implicit |
131 | /// `.*?` be prepended to the instruction sequence. |
132 | pub fn needs_dotstar(&self) -> bool { |
133 | self.is_dfa && !self.is_reverse && !self.is_anchored_start |
134 | } |
135 | |
136 | /// Returns true if this program uses Byte instructions instead of |
137 | /// Char/Range instructions. |
138 | pub fn uses_bytes(&self) -> bool { |
139 | self.is_bytes || self.is_dfa |
140 | } |
141 | |
142 | /// Returns true if this program exclusively matches valid UTF-8 bytes. |
143 | /// |
144 | /// That is, if an invalid UTF-8 byte is seen, then no match is possible. |
145 | pub fn only_utf8(&self) -> bool { |
146 | self.only_utf8 |
147 | } |
148 | |
149 | /// Return the approximate heap usage of this instruction sequence in |
150 | /// bytes. |
151 | pub fn approximate_size(&self) -> usize { |
152 | // The only instruction that uses heap space is Ranges (for |
153 | // Unicode codepoint programs) to store non-overlapping codepoint |
154 | // ranges. To keep this operation constant time, we ignore them. |
155 | (self.len() * mem::size_of::<Inst>()) |
156 | + (self.matches.len() * mem::size_of::<InstPtr>()) |
157 | + (self.captures.len() * mem::size_of::<Option<String>>()) |
158 | + (self.capture_name_idx.len() |
159 | * (mem::size_of::<String>() + mem::size_of::<usize>())) |
160 | + (self.byte_classes.len() * mem::size_of::<u8>()) |
161 | + self.prefixes.approximate_size() |
162 | } |
163 | } |
164 | |
165 | impl Deref for Program { |
166 | type Target = [Inst]; |
167 | |
168 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
169 | fn deref(&self) -> &Self::Target { |
170 | &*self.insts |
171 | } |
172 | } |
173 | |
174 | impl fmt::Debug for Program { |
175 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
176 | use self::Inst::*; |
177 | |
178 | fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { |
179 | if goto == cur + 1 { |
180 | fmtd |
181 | } else { |
182 | format!(" {} (goto: {})" , fmtd, goto) |
183 | } |
184 | } |
185 | |
186 | fn visible_byte(b: u8) -> String { |
187 | use std::ascii::escape_default; |
188 | let escaped = escape_default(b).collect::<Vec<u8>>(); |
189 | String::from_utf8_lossy(&escaped).into_owned() |
190 | } |
191 | |
192 | for (pc, inst) in self.iter().enumerate() { |
193 | match *inst { |
194 | Match(slot) => write!(f, " {:04} Match( {:?})" , pc, slot)?, |
195 | Save(ref inst) => { |
196 | let s = format!(" {:04} Save( {})" , pc, inst.slot); |
197 | write!(f, " {}" , with_goto(pc, inst.goto, s))?; |
198 | } |
199 | Split(ref inst) => { |
200 | write!( |
201 | f, |
202 | " {:04} Split( {}, {})" , |
203 | pc, inst.goto1, inst.goto2 |
204 | )?; |
205 | } |
206 | EmptyLook(ref inst) => { |
207 | let s = format!(" {:?}" , inst.look); |
208 | write!(f, " {:04} {}" , pc, with_goto(pc, inst.goto, s))?; |
209 | } |
210 | Char(ref inst) => { |
211 | let s = format!(" {:?}" , inst.c); |
212 | write!(f, " {:04} {}" , pc, with_goto(pc, inst.goto, s))?; |
213 | } |
214 | Ranges(ref inst) => { |
215 | let ranges = inst |
216 | .ranges |
217 | .iter() |
218 | .map(|r| format!(" {:?}- {:?}" , r.0, r.1)) |
219 | .collect::<Vec<String>>() |
220 | .join(", " ); |
221 | write!( |
222 | f, |
223 | " {:04} {}" , |
224 | pc, |
225 | with_goto(pc, inst.goto, ranges) |
226 | )?; |
227 | } |
228 | Bytes(ref inst) => { |
229 | let s = format!( |
230 | "Bytes( {}, {})" , |
231 | visible_byte(inst.start), |
232 | visible_byte(inst.end) |
233 | ); |
234 | write!(f, " {:04} {}" , pc, with_goto(pc, inst.goto, s))?; |
235 | } |
236 | } |
237 | if pc == self.start { |
238 | write!(f, " (start)" )?; |
239 | } |
240 | writeln!(f)?; |
241 | } |
242 | Ok(()) |
243 | } |
244 | } |
245 | |
246 | impl<'a> IntoIterator for &'a Program { |
247 | type Item = &'a Inst; |
248 | type IntoIter = slice::Iter<'a, Inst>; |
249 | fn into_iter(self) -> Self::IntoIter { |
250 | self.iter() |
251 | } |
252 | } |
253 | |
254 | /// Inst is an instruction code in a Regex program. |
255 | /// |
256 | /// Regrettably, a regex program either contains Unicode codepoint |
257 | /// instructions (Char and Ranges) or it contains byte instructions (Bytes). |
258 | /// A regex program can never contain both. |
259 | /// |
260 | /// It would be worth investigating splitting this into two distinct types and |
261 | /// then figuring out how to make the matching engines polymorphic over those |
262 | /// types without sacrificing performance. |
263 | /// |
264 | /// Other than the benefit of moving invariants into the type system, another |
265 | /// benefit is the decreased size. If we remove the `Char` and `Ranges` |
266 | /// instructions from the `Inst` enum, then its size shrinks from 32 bytes to |
267 | /// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges` |
268 | /// variant.) Given that byte based machines are typically much bigger than |
269 | /// their Unicode analogues (because they can decode UTF-8 directly), this ends |
270 | /// up being a pretty significant savings. |
271 | #[derive (Clone, Debug)] |
272 | pub enum Inst { |
273 | /// Match indicates that the program has reached a match state. |
274 | /// |
275 | /// The number in the match corresponds to the Nth logical regular |
276 | /// expression in this program. This index is always 0 for normal regex |
277 | /// programs. Values greater than 0 appear when compiling regex sets, and |
278 | /// each match instruction gets its own unique value. The value corresponds |
279 | /// to the Nth regex in the set. |
280 | Match(usize), |
281 | /// Save causes the program to save the current location of the input in |
282 | /// the slot indicated by InstSave. |
283 | Save(InstSave), |
284 | /// Split causes the program to diverge to one of two paths in the |
285 | /// program, preferring goto1 in InstSplit. |
286 | Split(InstSplit), |
287 | /// EmptyLook represents a zero-width assertion in a regex program. A |
288 | /// zero-width assertion does not consume any of the input text. |
289 | EmptyLook(InstEmptyLook), |
290 | /// Char requires the regex program to match the character in InstChar at |
291 | /// the current position in the input. |
292 | Char(InstChar), |
293 | /// Ranges requires the regex program to match the character at the current |
294 | /// position in the input with one of the ranges specified in InstRanges. |
295 | Ranges(InstRanges), |
296 | /// Bytes is like Ranges, except it expresses a single byte range. It is |
297 | /// used in conjunction with Split instructions to implement multi-byte |
298 | /// character classes. |
299 | Bytes(InstBytes), |
300 | } |
301 | |
302 | impl Inst { |
303 | /// Returns true if and only if this is a match instruction. |
304 | pub fn is_match(&self) -> bool { |
305 | match *self { |
306 | Inst::Match(_) => true, |
307 | _ => false, |
308 | } |
309 | } |
310 | } |
311 | |
312 | /// Representation of the Save instruction. |
313 | #[derive (Clone, Debug)] |
314 | pub struct InstSave { |
315 | /// The next location to execute in the program. |
316 | pub goto: InstPtr, |
317 | /// The capture slot (there are two slots for every capture in a regex, |
318 | /// including the zeroth capture for the entire match). |
319 | pub slot: usize, |
320 | } |
321 | |
322 | /// Representation of the Split instruction. |
323 | #[derive (Clone, Debug)] |
324 | pub struct InstSplit { |
325 | /// The first instruction to try. A match resulting from following goto1 |
326 | /// has precedence over a match resulting from following goto2. |
327 | pub goto1: InstPtr, |
328 | /// The second instruction to try. A match resulting from following goto1 |
329 | /// has precedence over a match resulting from following goto2. |
330 | pub goto2: InstPtr, |
331 | } |
332 | |
333 | /// Representation of the `EmptyLook` instruction. |
334 | #[derive (Clone, Debug)] |
335 | pub struct InstEmptyLook { |
336 | /// The next location to execute in the program if this instruction |
337 | /// succeeds. |
338 | pub goto: InstPtr, |
339 | /// The type of zero-width assertion to check. |
340 | pub look: EmptyLook, |
341 | } |
342 | |
343 | /// The set of zero-width match instructions. |
344 | #[derive (Clone, Copy, Debug, PartialEq, Eq)] |
345 | pub enum EmptyLook { |
346 | /// Start of line or input. |
347 | StartLine, |
348 | /// End of line or input. |
349 | EndLine, |
350 | /// Start of input. |
351 | StartText, |
352 | /// End of input. |
353 | EndText, |
354 | /// Word character on one side and non-word character on other. |
355 | WordBoundary, |
356 | /// Word character on both sides or non-word character on both sides. |
357 | NotWordBoundary, |
358 | /// ASCII word boundary. |
359 | WordBoundaryAscii, |
360 | /// Not ASCII word boundary. |
361 | NotWordBoundaryAscii, |
362 | } |
363 | |
364 | /// Representation of the Char instruction. |
365 | #[derive (Clone, Debug)] |
366 | pub struct InstChar { |
367 | /// The next location to execute in the program if this instruction |
368 | /// succeeds. |
369 | pub goto: InstPtr, |
370 | /// The character to test. |
371 | pub c: char, |
372 | } |
373 | |
374 | /// Representation of the Ranges instruction. |
375 | #[derive (Clone, Debug)] |
376 | pub struct InstRanges { |
377 | /// The next location to execute in the program if this instruction |
378 | /// succeeds. |
379 | pub goto: InstPtr, |
380 | /// The set of Unicode scalar value ranges to test. |
381 | pub ranges: Box<[(char, char)]>, |
382 | } |
383 | |
384 | impl InstRanges { |
385 | /// Tests whether the given input character matches this instruction. |
386 | pub fn matches(&self, c: Char) -> bool { |
387 | // This speeds up the `match_class_unicode` benchmark by checking |
388 | // some common cases quickly without binary search. e.g., Matching |
389 | // a Unicode class on predominantly ASCII text. |
390 | for r in self.ranges.iter().take(4) { |
391 | if c < r.0 { |
392 | return false; |
393 | } |
394 | if c <= r.1 { |
395 | return true; |
396 | } |
397 | } |
398 | self.ranges |
399 | .binary_search_by(|r| { |
400 | if r.1 < c { |
401 | Ordering::Less |
402 | } else if r.0 > c { |
403 | Ordering::Greater |
404 | } else { |
405 | Ordering::Equal |
406 | } |
407 | }) |
408 | .is_ok() |
409 | } |
410 | |
411 | /// Return the number of distinct characters represented by all of the |
412 | /// ranges. |
413 | pub fn num_chars(&self) -> usize { |
414 | self.ranges |
415 | .iter() |
416 | .map(|&(s, e)| 1 + (e as u32) - (s as u32)) |
417 | .sum::<u32>() as usize |
418 | } |
419 | } |
420 | |
421 | /// Representation of the Bytes instruction. |
422 | #[derive (Clone, Debug)] |
423 | pub struct InstBytes { |
424 | /// The next location to execute in the program if this instruction |
425 | /// succeeds. |
426 | pub goto: InstPtr, |
427 | /// The start (inclusive) of this byte range. |
428 | pub start: u8, |
429 | /// The end (inclusive) of this byte range. |
430 | pub end: u8, |
431 | } |
432 | |
433 | impl InstBytes { |
434 | /// Returns true if and only if the given byte is in this range. |
435 | pub fn matches(&self, byte: u8) -> bool { |
436 | self.start <= byte && byte <= self.end |
437 | } |
438 | } |
439 | |
440 | #[cfg (test)] |
441 | mod test { |
442 | #[test ] |
443 | #[cfg (target_pointer_width = "64" )] |
444 | fn test_size_of_inst() { |
445 | use std::mem::size_of; |
446 | |
447 | use super::Inst; |
448 | |
449 | assert_eq!(32, size_of::<Inst>()); |
450 | } |
451 | } |
452 | |