1use std::cmp::Ordering;
2use std::collections::HashMap;
3use std::fmt;
4use std::mem;
5use std::ops::Deref;
6use std::slice;
7use std::sync::Arc;
8
9use crate::input::Char;
10use crate::literal::LiteralSearcher;
11
12/// `InstPtr` represents the index of an instruction in a regex program.
13pub type InstPtr = usize;
14
15/// Program is a sequence of instructions and various facts about thos
16/// instructions.
17#[derive(Clone)]
18pub struct Program {
19 /// A sequence of instructions that represents an NFA.
20 pub insts: Vec<Inst>,
21 /// Pointers to each Match instruction in the sequence.
22 ///
23 /// This is always length 1 unless this program represents a regex set.
24 pub matches: Vec<InstPtr>,
25 /// The ordered sequence of all capture groups extracted from the AST.
26 /// Unnamed groups are `None`.
27 pub captures: Vec<Option<String>>,
28 /// Pointers to all named capture groups into `captures`.
29 pub capture_name_idx: Arc<HashMap<String, usize>>,
30 /// If the number of capture groups is the same for all possible matches,
31 /// then this is that number.
32 pub static_captures_len: Option<usize>,
33 /// A pointer to the start instruction. This can vary depending on how
34 /// the program was compiled. For example, programs for use with the DFA
35 /// engine have a `.*?` inserted at the beginning of unanchored regular
36 /// expressions. The actual starting point of the program is after the
37 /// `.*?`.
38 pub start: InstPtr,
39 /// A set of equivalence classes for discriminating bytes in the compiled
40 /// program.
41 pub byte_classes: Vec<u8>,
42 /// When true, this program can only match valid UTF-8.
43 pub only_utf8: bool,
44 /// When true, this program uses byte range instructions instead of Unicode
45 /// range instructions.
46 pub is_bytes: bool,
47 /// When true, the program is compiled for DFA matching. For example, this
48 /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
49 /// regexes.
50 pub is_dfa: bool,
51 /// When true, the program matches text in reverse (for use only in the
52 /// DFA).
53 pub is_reverse: bool,
54 /// Whether the regex must match from the start of the input.
55 pub is_anchored_start: bool,
56 /// Whether the regex must match at the end of the input.
57 pub is_anchored_end: bool,
58 /// Whether this program contains a Unicode word boundary instruction.
59 pub has_unicode_word_boundary: bool,
60 /// A possibly empty machine for very quickly matching prefix literals.
61 pub prefixes: LiteralSearcher,
62 /// A limit on the size of the cache that the DFA is allowed to use while
63 /// matching.
64 ///
65 /// The cache limit specifies approximately how much space we're willing to
66 /// give to the state cache. Once the state cache exceeds the size, it is
67 /// wiped and all states must be re-computed.
68 ///
69 /// Note that this value does not impact correctness. It can be set to 0
70 /// and the DFA will run just fine. (It will only ever store exactly one
71 /// state in the cache, and will likely run very slowly, but it will work.)
72 ///
73 /// Also note that this limit is *per thread of execution*. That is,
74 /// if the same regex is used to search text across multiple threads
75 /// simultaneously, then the DFA cache is not shared. Instead, copies are
76 /// made.
77 pub dfa_size_limit: usize,
78}
79
80impl Program {
81 /// Creates an empty instruction sequence. Fields are given default
82 /// values.
83 pub fn new() -> Self {
84 Program {
85 insts: vec![],
86 matches: vec![],
87 captures: vec![],
88 capture_name_idx: Arc::new(HashMap::new()),
89 static_captures_len: None,
90 start: 0,
91 byte_classes: vec![0; 256],
92 only_utf8: true,
93 is_bytes: false,
94 is_dfa: false,
95 is_reverse: false,
96 is_anchored_start: false,
97 is_anchored_end: false,
98 has_unicode_word_boundary: false,
99 prefixes: LiteralSearcher::empty(),
100 dfa_size_limit: 2 * (1 << 20),
101 }
102 }
103
104 /// If pc is an index to a no-op instruction (like Save), then return the
105 /// next pc that is not a no-op instruction.
106 pub fn skip(&self, mut pc: usize) -> usize {
107 loop {
108 match self[pc] {
109 Inst::Save(ref i) => pc = i.goto,
110 _ => return pc,
111 }
112 }
113 }
114
115 /// Return true if and only if an execution engine at instruction `pc` will
116 /// always lead to a match.
117 pub fn leads_to_match(&self, pc: usize) -> bool {
118 if self.matches.len() > 1 {
119 // If we have a regex set, then we have more than one ending
120 // state, so leading to one of those states is generally
121 // meaningless.
122 return false;
123 }
124 match self[self.skip(pc)] {
125 Inst::Match(_) => true,
126 _ => false,
127 }
128 }
129
130 /// Returns true if the current configuration demands that an implicit
131 /// `.*?` be prepended to the instruction sequence.
132 pub fn needs_dotstar(&self) -> bool {
133 self.is_dfa && !self.is_reverse && !self.is_anchored_start
134 }
135
136 /// Returns true if this program uses Byte instructions instead of
137 /// Char/Range instructions.
138 pub fn uses_bytes(&self) -> bool {
139 self.is_bytes || self.is_dfa
140 }
141
142 /// Returns true if this program exclusively matches valid UTF-8 bytes.
143 ///
144 /// That is, if an invalid UTF-8 byte is seen, then no match is possible.
145 pub fn only_utf8(&self) -> bool {
146 self.only_utf8
147 }
148
149 /// Return the approximate heap usage of this instruction sequence in
150 /// bytes.
151 pub fn approximate_size(&self) -> usize {
152 // The only instruction that uses heap space is Ranges (for
153 // Unicode codepoint programs) to store non-overlapping codepoint
154 // ranges. To keep this operation constant time, we ignore them.
155 (self.len() * mem::size_of::<Inst>())
156 + (self.matches.len() * mem::size_of::<InstPtr>())
157 + (self.captures.len() * mem::size_of::<Option<String>>())
158 + (self.capture_name_idx.len()
159 * (mem::size_of::<String>() + mem::size_of::<usize>()))
160 + (self.byte_classes.len() * mem::size_of::<u8>())
161 + self.prefixes.approximate_size()
162 }
163}
164
165impl Deref for Program {
166 type Target = [Inst];
167
168 #[cfg_attr(feature = "perf-inline", inline(always))]
169 fn deref(&self) -> &Self::Target {
170 &*self.insts
171 }
172}
173
174impl fmt::Debug for Program {
175 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
176 use self::Inst::*;
177
178 fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
179 if goto == cur + 1 {
180 fmtd
181 } else {
182 format!("{} (goto: {})", fmtd, goto)
183 }
184 }
185
186 fn visible_byte(b: u8) -> String {
187 use std::ascii::escape_default;
188 let escaped = escape_default(b).collect::<Vec<u8>>();
189 String::from_utf8_lossy(&escaped).into_owned()
190 }
191
192 for (pc, inst) in self.iter().enumerate() {
193 match *inst {
194 Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
195 Save(ref inst) => {
196 let s = format!("{:04} Save({})", pc, inst.slot);
197 write!(f, "{}", with_goto(pc, inst.goto, s))?;
198 }
199 Split(ref inst) => {
200 write!(
201 f,
202 "{:04} Split({}, {})",
203 pc, inst.goto1, inst.goto2
204 )?;
205 }
206 EmptyLook(ref inst) => {
207 let s = format!("{:?}", inst.look);
208 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
209 }
210 Char(ref inst) => {
211 let s = format!("{:?}", inst.c);
212 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
213 }
214 Ranges(ref inst) => {
215 let ranges = inst
216 .ranges
217 .iter()
218 .map(|r| format!("{:?}-{:?}", r.0, r.1))
219 .collect::<Vec<String>>()
220 .join(", ");
221 write!(
222 f,
223 "{:04} {}",
224 pc,
225 with_goto(pc, inst.goto, ranges)
226 )?;
227 }
228 Bytes(ref inst) => {
229 let s = format!(
230 "Bytes({}, {})",
231 visible_byte(inst.start),
232 visible_byte(inst.end)
233 );
234 write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
235 }
236 }
237 if pc == self.start {
238 write!(f, " (start)")?;
239 }
240 writeln!(f)?;
241 }
242 Ok(())
243 }
244}
245
246impl<'a> IntoIterator for &'a Program {
247 type Item = &'a Inst;
248 type IntoIter = slice::Iter<'a, Inst>;
249 fn into_iter(self) -> Self::IntoIter {
250 self.iter()
251 }
252}
253
254/// Inst is an instruction code in a Regex program.
255///
256/// Regrettably, a regex program either contains Unicode codepoint
257/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
258/// A regex program can never contain both.
259///
260/// It would be worth investigating splitting this into two distinct types and
261/// then figuring out how to make the matching engines polymorphic over those
262/// types without sacrificing performance.
263///
264/// Other than the benefit of moving invariants into the type system, another
265/// benefit is the decreased size. If we remove the `Char` and `Ranges`
266/// instructions from the `Inst` enum, then its size shrinks from 32 bytes to
267/// 24 bytes. (This is because of the removal of a `Box<[]>` in the `Ranges`
268/// variant.) Given that byte based machines are typically much bigger than
269/// their Unicode analogues (because they can decode UTF-8 directly), this ends
270/// up being a pretty significant savings.
271#[derive(Clone, Debug)]
272pub enum Inst {
273 /// Match indicates that the program has reached a match state.
274 ///
275 /// The number in the match corresponds to the Nth logical regular
276 /// expression in this program. This index is always 0 for normal regex
277 /// programs. Values greater than 0 appear when compiling regex sets, and
278 /// each match instruction gets its own unique value. The value corresponds
279 /// to the Nth regex in the set.
280 Match(usize),
281 /// Save causes the program to save the current location of the input in
282 /// the slot indicated by InstSave.
283 Save(InstSave),
284 /// Split causes the program to diverge to one of two paths in the
285 /// program, preferring goto1 in InstSplit.
286 Split(InstSplit),
287 /// EmptyLook represents a zero-width assertion in a regex program. A
288 /// zero-width assertion does not consume any of the input text.
289 EmptyLook(InstEmptyLook),
290 /// Char requires the regex program to match the character in InstChar at
291 /// the current position in the input.
292 Char(InstChar),
293 /// Ranges requires the regex program to match the character at the current
294 /// position in the input with one of the ranges specified in InstRanges.
295 Ranges(InstRanges),
296 /// Bytes is like Ranges, except it expresses a single byte range. It is
297 /// used in conjunction with Split instructions to implement multi-byte
298 /// character classes.
299 Bytes(InstBytes),
300}
301
302impl Inst {
303 /// Returns true if and only if this is a match instruction.
304 pub fn is_match(&self) -> bool {
305 match *self {
306 Inst::Match(_) => true,
307 _ => false,
308 }
309 }
310}
311
312/// Representation of the Save instruction.
313#[derive(Clone, Debug)]
314pub struct InstSave {
315 /// The next location to execute in the program.
316 pub goto: InstPtr,
317 /// The capture slot (there are two slots for every capture in a regex,
318 /// including the zeroth capture for the entire match).
319 pub slot: usize,
320}
321
322/// Representation of the Split instruction.
323#[derive(Clone, Debug)]
324pub struct InstSplit {
325 /// The first instruction to try. A match resulting from following goto1
326 /// has precedence over a match resulting from following goto2.
327 pub goto1: InstPtr,
328 /// The second instruction to try. A match resulting from following goto1
329 /// has precedence over a match resulting from following goto2.
330 pub goto2: InstPtr,
331}
332
333/// Representation of the `EmptyLook` instruction.
334#[derive(Clone, Debug)]
335pub struct InstEmptyLook {
336 /// The next location to execute in the program if this instruction
337 /// succeeds.
338 pub goto: InstPtr,
339 /// The type of zero-width assertion to check.
340 pub look: EmptyLook,
341}
342
343/// The set of zero-width match instructions.
344#[derive(Clone, Copy, Debug, PartialEq, Eq)]
345pub enum EmptyLook {
346 /// Start of line or input.
347 StartLine,
348 /// End of line or input.
349 EndLine,
350 /// Start of input.
351 StartText,
352 /// End of input.
353 EndText,
354 /// Word character on one side and non-word character on other.
355 WordBoundary,
356 /// Word character on both sides or non-word character on both sides.
357 NotWordBoundary,
358 /// ASCII word boundary.
359 WordBoundaryAscii,
360 /// Not ASCII word boundary.
361 NotWordBoundaryAscii,
362}
363
364/// Representation of the Char instruction.
365#[derive(Clone, Debug)]
366pub struct InstChar {
367 /// The next location to execute in the program if this instruction
368 /// succeeds.
369 pub goto: InstPtr,
370 /// The character to test.
371 pub c: char,
372}
373
374/// Representation of the Ranges instruction.
375#[derive(Clone, Debug)]
376pub struct InstRanges {
377 /// The next location to execute in the program if this instruction
378 /// succeeds.
379 pub goto: InstPtr,
380 /// The set of Unicode scalar value ranges to test.
381 pub ranges: Box<[(char, char)]>,
382}
383
384impl InstRanges {
385 /// Tests whether the given input character matches this instruction.
386 pub fn matches(&self, c: Char) -> bool {
387 // This speeds up the `match_class_unicode` benchmark by checking
388 // some common cases quickly without binary search. e.g., Matching
389 // a Unicode class on predominantly ASCII text.
390 for r in self.ranges.iter().take(4) {
391 if c < r.0 {
392 return false;
393 }
394 if c <= r.1 {
395 return true;
396 }
397 }
398 self.ranges
399 .binary_search_by(|r| {
400 if r.1 < c {
401 Ordering::Less
402 } else if r.0 > c {
403 Ordering::Greater
404 } else {
405 Ordering::Equal
406 }
407 })
408 .is_ok()
409 }
410
411 /// Return the number of distinct characters represented by all of the
412 /// ranges.
413 pub fn num_chars(&self) -> usize {
414 self.ranges
415 .iter()
416 .map(|&(s, e)| 1 + (e as u32) - (s as u32))
417 .sum::<u32>() as usize
418 }
419}
420
421/// Representation of the Bytes instruction.
422#[derive(Clone, Debug)]
423pub struct InstBytes {
424 /// The next location to execute in the program if this instruction
425 /// succeeds.
426 pub goto: InstPtr,
427 /// The start (inclusive) of this byte range.
428 pub start: u8,
429 /// The end (inclusive) of this byte range.
430 pub end: u8,
431}
432
433impl InstBytes {
434 /// Returns true if and only if the given byte is in this range.
435 pub fn matches(&self, byte: u8) -> bool {
436 self.start <= byte && byte <= self.end
437 }
438}
439
440#[cfg(test)]
441mod test {
442 #[test]
443 #[cfg(target_pointer_width = "64")]
444 fn test_size_of_inst() {
445 use std::mem::size_of;
446
447 use super::Inst;
448
449 assert_eq!(32, size_of::<Inst>());
450 }
451}
452