1use std::collections::HashMap;
2use std::fmt;
3use std::iter;
4use std::result;
5use std::sync::Arc;
6
7use regex_syntax::hir::{self, Hir, Look};
8use regex_syntax::is_word_byte;
9use regex_syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
10
11use crate::prog::{
12 EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
13 InstSave, InstSplit, Program,
14};
15
16use crate::Error;
17
18type Result = result::Result<Patch, Error>;
19type ResultOrEmpty = result::Result<Option<Patch>, Error>;
20
21#[derive(Debug)]
22struct Patch {
23 hole: Hole,
24 entry: InstPtr,
25}
26
27/// A compiler translates a regular expression AST to a sequence of
28/// instructions. The sequence of instructions represents an NFA.
29// `Compiler` is only public via the `internal` module, so avoid deriving
30// `Debug`.
31#[allow(missing_debug_implementations)]
32pub struct Compiler {
33 insts: Vec<MaybeInst>,
34 compiled: Program,
35 capture_name_idx: HashMap<String, usize>,
36 num_exprs: usize,
37 size_limit: usize,
38 suffix_cache: SuffixCache,
39 utf8_seqs: Option<Utf8Sequences>,
40 byte_classes: ByteClassSet,
41 // This keeps track of extra bytes allocated while compiling the regex
42 // program. Currently, this corresponds to two things. First is the heap
43 // memory allocated by Unicode character classes ('InstRanges'). Second is
44 // a "fake" amount of memory used by empty sub-expressions, so that enough
45 // empty sub-expressions will ultimately trigger the compiler to bail
46 // because of a size limit restriction. (That empty sub-expressions don't
47 // add to heap memory usage is more-or-less an implementation detail.) In
48 // the second case, if we don't bail, then an excessively large repetition
49 // on an empty sub-expression can result in the compiler using a very large
50 // amount of CPU time.
51 extra_inst_bytes: usize,
52}
53
54impl Compiler {
55 /// Create a new regular expression compiler.
56 ///
57 /// Various options can be set before calling `compile` on an expression.
58 pub fn new() -> Self {
59 Compiler {
60 insts: vec![],
61 compiled: Program::new(),
62 capture_name_idx: HashMap::new(),
63 num_exprs: 0,
64 size_limit: 10 * (1 << 20),
65 suffix_cache: SuffixCache::new(1000),
66 utf8_seqs: Some(Utf8Sequences::new('\x00', '\x00')),
67 byte_classes: ByteClassSet::new(),
68 extra_inst_bytes: 0,
69 }
70 }
71
72 /// The size of the resulting program is limited by size_limit. If
73 /// the program approximately exceeds the given size (in bytes), then
74 /// compilation will stop and return an error.
75 pub fn size_limit(mut self, size_limit: usize) -> Self {
76 self.size_limit = size_limit;
77 self
78 }
79
80 /// If bytes is true, then the program is compiled as a byte based
81 /// automaton, which incorporates UTF-8 decoding into the machine. If it's
82 /// false, then the automaton is Unicode scalar value based, e.g., an
83 /// engine utilizing such an automaton is responsible for UTF-8 decoding.
84 ///
85 /// The specific invariant is that when returning a byte based machine,
86 /// the neither the `Char` nor `Ranges` instructions are produced.
87 /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
88 /// instruction is never produced.
89 ///
90 /// Note that `dfa(true)` implies `bytes(true)`.
91 pub fn bytes(mut self, yes: bool) -> Self {
92 self.compiled.is_bytes = yes;
93 self
94 }
95
96 /// When disabled, the program compiled may match arbitrary bytes.
97 ///
98 /// When enabled (the default), all compiled programs exclusively match
99 /// valid UTF-8 bytes.
100 pub fn only_utf8(mut self, yes: bool) -> Self {
101 self.compiled.only_utf8 = yes;
102 self
103 }
104
105 /// When set, the machine returned is suitable for use in the DFA matching
106 /// engine.
107 ///
108 /// In particular, this ensures that if the regex is not anchored in the
109 /// beginning, then a preceding `.*?` is included in the program. (The NFA
110 /// based engines handle the preceding `.*?` explicitly, which is difficult
111 /// or impossible in the DFA engine.)
112 pub fn dfa(mut self, yes: bool) -> Self {
113 self.compiled.is_dfa = yes;
114 self
115 }
116
117 /// When set, the machine returned is suitable for matching text in
118 /// reverse. In particular, all concatenations are flipped.
119 pub fn reverse(mut self, yes: bool) -> Self {
120 self.compiled.is_reverse = yes;
121 self
122 }
123
124 /// Compile a regular expression given its AST.
125 ///
126 /// The compiler is guaranteed to succeed unless the program exceeds the
127 /// specified size limit. If the size limit is exceeded, then compilation
128 /// stops and returns an error.
129 pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
130 debug_assert!(!exprs.is_empty());
131 self.num_exprs = exprs.len();
132 if exprs.len() == 1 {
133 self.compile_one(&exprs[0])
134 } else {
135 self.compile_many(exprs)
136 }
137 }
138
139 fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
140 if self.compiled.only_utf8
141 && expr.properties().look_set().contains(Look::WordAsciiNegate)
142 {
143 return Err(Error::Syntax(
144 "ASCII-only \\B is not allowed in Unicode regexes \
145 because it may result in invalid UTF-8 matches"
146 .to_string(),
147 ));
148 }
149 // If we're compiling a forward DFA and we aren't anchored, then
150 // add a `.*?` before the first capture group.
151 // Other matching engines handle this by baking the logic into the
152 // matching engine itself.
153 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
154 self.compiled.is_anchored_start =
155 expr.properties().look_set_prefix().contains(Look::Start);
156 self.compiled.is_anchored_end =
157 expr.properties().look_set_suffix().contains(Look::End);
158 if self.compiled.needs_dotstar() {
159 dotstar_patch = self.c_dotstar()?;
160 self.compiled.start = dotstar_patch.entry;
161 }
162 self.compiled.captures = vec![None];
163 let patch =
164 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
165 if self.compiled.needs_dotstar() {
166 self.fill(dotstar_patch.hole, patch.entry);
167 } else {
168 self.compiled.start = patch.entry;
169 }
170 self.fill_to_next(patch.hole);
171 self.compiled.matches = vec![self.insts.len()];
172 self.push_compiled(Inst::Match(0));
173 self.compiled.static_captures_len =
174 expr.properties().static_explicit_captures_len();
175 self.compile_finish()
176 }
177
178 fn compile_many(
179 mut self,
180 exprs: &[Hir],
181 ) -> result::Result<Program, Error> {
182 debug_assert!(exprs.len() > 1);
183
184 self.compiled.is_anchored_start = exprs
185 .iter()
186 .all(|e| e.properties().look_set_prefix().contains(Look::Start));
187 self.compiled.is_anchored_end = exprs
188 .iter()
189 .all(|e| e.properties().look_set_suffix().contains(Look::End));
190 let mut dotstar_patch = Patch { hole: Hole::None, entry: 0 };
191 if self.compiled.needs_dotstar() {
192 dotstar_patch = self.c_dotstar()?;
193 self.compiled.start = dotstar_patch.entry;
194 } else {
195 self.compiled.start = 0; // first instruction is always split
196 }
197 self.fill_to_next(dotstar_patch.hole);
198
199 let mut prev_hole = Hole::None;
200 for (i, expr) in exprs[0..exprs.len() - 1].iter().enumerate() {
201 self.fill_to_next(prev_hole);
202 let split = self.push_split_hole();
203 let Patch { hole, entry } =
204 self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst());
205 self.fill_to_next(hole);
206 self.compiled.matches.push(self.insts.len());
207 self.push_compiled(Inst::Match(i));
208 prev_hole = self.fill_split(split, Some(entry), None);
209 }
210 let i = exprs.len() - 1;
211 let Patch { hole, entry } =
212 self.c_capture(0, &exprs[i])?.unwrap_or_else(|| self.next_inst());
213 self.fill(prev_hole, entry);
214 self.fill_to_next(hole);
215 self.compiled.matches.push(self.insts.len());
216 self.push_compiled(Inst::Match(i));
217 self.compile_finish()
218 }
219
220 fn compile_finish(mut self) -> result::Result<Program, Error> {
221 self.compiled.insts =
222 self.insts.into_iter().map(|inst| inst.unwrap()).collect();
223 self.compiled.byte_classes = self.byte_classes.byte_classes();
224 self.compiled.capture_name_idx = Arc::new(self.capture_name_idx);
225 Ok(self.compiled)
226 }
227
228 /// Compile expr into self.insts, returning a patch on success,
229 /// or an error if we run out of memory.
230 ///
231 /// All of the c_* methods of the compiler share the contract outlined
232 /// here.
233 ///
234 /// The main thing that a c_* method does is mutate `self.insts`
235 /// to add a list of mostly compiled instructions required to execute
236 /// the given expression. `self.insts` contains MaybeInsts rather than
237 /// Insts because there is some backpatching required.
238 ///
239 /// The `Patch` value returned by each c_* method provides metadata
240 /// about the compiled instructions emitted to `self.insts`. The
241 /// `entry` member of the patch refers to the first instruction
242 /// (the entry point), while the `hole` member contains zero or
243 /// more offsets to partial instructions that need to be backpatched.
244 /// The c_* routine can't know where its list of instructions are going to
245 /// jump to after execution, so it is up to the caller to patch
246 /// these jumps to point to the right place. So compiling some
247 /// expression, e, we would end up with a situation that looked like:
248 ///
249 /// ```text
250 /// self.insts = [ ..., i1, i2, ..., iexit1, ..., iexitn, ...]
251 /// ^ ^ ^
252 /// | \ /
253 /// entry \ /
254 /// hole
255 /// ```
256 ///
257 /// To compile two expressions, e1 and e2, concatenated together we
258 /// would do:
259 ///
260 /// ```ignore
261 /// let patch1 = self.c(e1);
262 /// let patch2 = self.c(e2);
263 /// ```
264 ///
265 /// while leaves us with a situation that looks like
266 ///
267 /// ```text
268 /// self.insts = [ ..., i1, ..., iexit1, ..., i2, ..., iexit2 ]
269 /// ^ ^ ^ ^
270 /// | | | |
271 /// entry1 hole1 entry2 hole2
272 /// ```
273 ///
274 /// Then to merge the two patches together into one we would backpatch
275 /// hole1 with entry2 and return a new patch that enters at entry1
276 /// and has hole2 for a hole. In fact, if you look at the c_concat
277 /// method you will see that it does exactly this, though it handles
278 /// a list of expressions rather than just the two that we use for
279 /// an example.
280 ///
281 /// Ok(None) is returned when an expression is compiled to no
282 /// instruction, and so no patch.entry value makes sense.
283 fn c(&mut self, expr: &Hir) -> ResultOrEmpty {
284 use crate::prog;
285 use regex_syntax::hir::HirKind::*;
286
287 self.check_size()?;
288 match *expr.kind() {
289 Empty => self.c_empty(),
290 Literal(hir::Literal(ref bytes)) => {
291 if self.compiled.is_reverse {
292 let mut bytes = bytes.to_vec();
293 bytes.reverse();
294 self.c_literal(&bytes)
295 } else {
296 self.c_literal(bytes)
297 }
298 }
299 Class(hir::Class::Unicode(ref cls)) => self.c_class(cls.ranges()),
300 Class(hir::Class::Bytes(ref cls)) => {
301 if self.compiled.uses_bytes() {
302 self.c_class_bytes(cls.ranges())
303 } else {
304 assert!(cls.is_ascii());
305 let mut char_ranges = vec![];
306 for r in cls.iter() {
307 let (s, e) = (r.start() as char, r.end() as char);
308 char_ranges.push(hir::ClassUnicodeRange::new(s, e));
309 }
310 self.c_class(&char_ranges)
311 }
312 }
313 Look(ref look) => match *look {
314 hir::Look::Start if self.compiled.is_reverse => {
315 self.c_empty_look(prog::EmptyLook::EndText)
316 }
317 hir::Look::Start => {
318 self.c_empty_look(prog::EmptyLook::StartText)
319 }
320 hir::Look::End if self.compiled.is_reverse => {
321 self.c_empty_look(prog::EmptyLook::StartText)
322 }
323 hir::Look::End => self.c_empty_look(prog::EmptyLook::EndText),
324 hir::Look::StartLF if self.compiled.is_reverse => {
325 self.byte_classes.set_range(b'\n', b'\n');
326 self.c_empty_look(prog::EmptyLook::EndLine)
327 }
328 hir::Look::StartLF => {
329 self.byte_classes.set_range(b'\n', b'\n');
330 self.c_empty_look(prog::EmptyLook::StartLine)
331 }
332 hir::Look::EndLF if self.compiled.is_reverse => {
333 self.byte_classes.set_range(b'\n', b'\n');
334 self.c_empty_look(prog::EmptyLook::StartLine)
335 }
336 hir::Look::EndLF => {
337 self.byte_classes.set_range(b'\n', b'\n');
338 self.c_empty_look(prog::EmptyLook::EndLine)
339 }
340 hir::Look::StartCRLF | hir::Look::EndCRLF => {
341 return Err(Error::Syntax(
342 "CRLF-aware line anchors are not supported yet"
343 .to_string(),
344 ));
345 }
346 hir::Look::WordAscii => {
347 self.byte_classes.set_word_boundary();
348 self.c_empty_look(prog::EmptyLook::WordBoundaryAscii)
349 }
350 hir::Look::WordAsciiNegate => {
351 self.byte_classes.set_word_boundary();
352 self.c_empty_look(prog::EmptyLook::NotWordBoundaryAscii)
353 }
354 hir::Look::WordUnicode => {
355 if !cfg!(feature = "unicode-perl") {
356 return Err(Error::Syntax(
357 "Unicode word boundaries are unavailable when \
358 the unicode-perl feature is disabled"
359 .to_string(),
360 ));
361 }
362 self.compiled.has_unicode_word_boundary = true;
363 self.byte_classes.set_word_boundary();
364 // We also make sure that all ASCII bytes are in a different
365 // class from non-ASCII bytes. Otherwise, it's possible for
366 // ASCII bytes to get lumped into the same class as non-ASCII
367 // bytes. This in turn may cause the lazy DFA to falsely start
368 // when it sees an ASCII byte that maps to a byte class with
369 // non-ASCII bytes. This ensures that never happens.
370 self.byte_classes.set_range(0, 0x7F);
371 self.c_empty_look(prog::EmptyLook::WordBoundary)
372 }
373 hir::Look::WordUnicodeNegate => {
374 if !cfg!(feature = "unicode-perl") {
375 return Err(Error::Syntax(
376 "Unicode word boundaries are unavailable when \
377 the unicode-perl feature is disabled"
378 .to_string(),
379 ));
380 }
381 self.compiled.has_unicode_word_boundary = true;
382 self.byte_classes.set_word_boundary();
383 // See comments above for why we set the ASCII range here.
384 self.byte_classes.set_range(0, 0x7F);
385 self.c_empty_look(prog::EmptyLook::NotWordBoundary)
386 }
387 },
388 Capture(hir::Capture { index, ref name, ref sub }) => {
389 if index as usize >= self.compiled.captures.len() {
390 let name = match *name {
391 None => None,
392 Some(ref boxed_str) => Some(boxed_str.to_string()),
393 };
394 self.compiled.captures.push(name.clone());
395 if let Some(name) = name {
396 self.capture_name_idx.insert(name, index as usize);
397 }
398 }
399 self.c_capture(2 * index as usize, sub)
400 }
401 Concat(ref es) => {
402 if self.compiled.is_reverse {
403 self.c_concat(es.iter().rev())
404 } else {
405 self.c_concat(es)
406 }
407 }
408 Alternation(ref es) => self.c_alternate(&**es),
409 Repetition(ref rep) => self.c_repeat(rep),
410 }
411 }
412
413 fn c_empty(&mut self) -> ResultOrEmpty {
414 // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
415 // See: CVE-2022-24713
416 //
417 // Since 'empty' sub-expressions don't increase the size of
418 // the actual compiled object, we "fake" an increase in its
419 // size so that our 'check_size_limit' routine will eventually
420 // stop compilation if there are too many empty sub-expressions
421 // (e.g., via a large repetition).
422 self.extra_inst_bytes += std::mem::size_of::<Inst>();
423 Ok(None)
424 }
425
426 fn c_capture(&mut self, first_slot: usize, expr: &Hir) -> ResultOrEmpty {
427 if self.num_exprs > 1 || self.compiled.is_dfa {
428 // Don't ever compile Save instructions for regex sets because
429 // they are never used. They are also never used in DFA programs
430 // because DFAs can't handle captures.
431 self.c(expr)
432 } else {
433 let entry = self.insts.len();
434 let hole = self.push_hole(InstHole::Save { slot: first_slot });
435 let patch = self.c(expr)?.unwrap_or_else(|| self.next_inst());
436 self.fill(hole, patch.entry);
437 self.fill_to_next(patch.hole);
438 let hole = self.push_hole(InstHole::Save { slot: first_slot + 1 });
439 Ok(Some(Patch { hole, entry }))
440 }
441 }
442
443 fn c_dotstar(&mut self) -> Result {
444 let hir = if self.compiled.only_utf8() {
445 Hir::dot(hir::Dot::AnyChar)
446 } else {
447 Hir::dot(hir::Dot::AnyByte)
448 };
449 Ok(self
450 .c(&Hir::repetition(hir::Repetition {
451 min: 0,
452 max: None,
453 greedy: false,
454 sub: Box::new(hir),
455 }))?
456 .unwrap())
457 }
458
459 fn c_char(&mut self, c: char) -> ResultOrEmpty {
460 if self.compiled.uses_bytes() {
461 if c.is_ascii() {
462 let b = c as u8;
463 let hole =
464 self.push_hole(InstHole::Bytes { start: b, end: b });
465 self.byte_classes.set_range(b, b);
466 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
467 } else {
468 self.c_class(&[hir::ClassUnicodeRange::new(c, c)])
469 }
470 } else {
471 let hole = self.push_hole(InstHole::Char { c });
472 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
473 }
474 }
475
476 fn c_class(&mut self, ranges: &[hir::ClassUnicodeRange]) -> ResultOrEmpty {
477 use std::mem::size_of;
478
479 if ranges.is_empty() {
480 return Err(Error::Syntax(
481 "empty character classes are not allowed".to_string(),
482 ));
483 }
484 if self.compiled.uses_bytes() {
485 Ok(Some(CompileClass { c: self, ranges }.compile()?))
486 } else {
487 let ranges: Vec<(char, char)> =
488 ranges.iter().map(|r| (r.start(), r.end())).collect();
489 let hole = if ranges.len() == 1 && ranges[0].0 == ranges[0].1 {
490 self.push_hole(InstHole::Char { c: ranges[0].0 })
491 } else {
492 self.extra_inst_bytes +=
493 ranges.len() * (size_of::<char>() * 2);
494 self.push_hole(InstHole::Ranges { ranges })
495 };
496 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
497 }
498 }
499
500 fn c_byte(&mut self, b: u8) -> ResultOrEmpty {
501 self.c_class_bytes(&[hir::ClassBytesRange::new(b, b)])
502 }
503
504 fn c_class_bytes(
505 &mut self,
506 ranges: &[hir::ClassBytesRange],
507 ) -> ResultOrEmpty {
508 if ranges.is_empty() {
509 return Err(Error::Syntax(
510 "empty character classes are not allowed".to_string(),
511 ));
512 }
513
514 let first_split_entry = self.insts.len();
515 let mut holes = vec![];
516 let mut prev_hole = Hole::None;
517 for r in &ranges[0..ranges.len() - 1] {
518 self.fill_to_next(prev_hole);
519 let split = self.push_split_hole();
520 let next = self.insts.len();
521 self.byte_classes.set_range(r.start(), r.end());
522 holes.push(self.push_hole(InstHole::Bytes {
523 start: r.start(),
524 end: r.end(),
525 }));
526 prev_hole = self.fill_split(split, Some(next), None);
527 }
528 let next = self.insts.len();
529 let r = &ranges[ranges.len() - 1];
530 self.byte_classes.set_range(r.start(), r.end());
531 holes.push(
532 self.push_hole(InstHole::Bytes { start: r.start(), end: r.end() }),
533 );
534 self.fill(prev_hole, next);
535 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
536 }
537
538 fn c_empty_look(&mut self, look: EmptyLook) -> ResultOrEmpty {
539 let hole = self.push_hole(InstHole::EmptyLook { look });
540 Ok(Some(Patch { hole, entry: self.insts.len() - 1 }))
541 }
542
543 fn c_literal(&mut self, bytes: &[u8]) -> ResultOrEmpty {
544 match core::str::from_utf8(bytes) {
545 Ok(string) => {
546 let mut it = string.chars();
547 let Patch { mut hole, entry } = loop {
548 match it.next() {
549 None => return self.c_empty(),
550 Some(ch) => {
551 if let Some(p) = self.c_char(ch)? {
552 break p;
553 }
554 }
555 }
556 };
557 for ch in it {
558 if let Some(p) = self.c_char(ch)? {
559 self.fill(hole, p.entry);
560 hole = p.hole;
561 }
562 }
563 Ok(Some(Patch { hole, entry }))
564 }
565 Err(_) => {
566 assert!(self.compiled.uses_bytes());
567 let mut it = bytes.iter().copied();
568 let Patch { mut hole, entry } = loop {
569 match it.next() {
570 None => return self.c_empty(),
571 Some(byte) => {
572 if let Some(p) = self.c_byte(byte)? {
573 break p;
574 }
575 }
576 }
577 };
578 for byte in it {
579 if let Some(p) = self.c_byte(byte)? {
580 self.fill(hole, p.entry);
581 hole = p.hole;
582 }
583 }
584 Ok(Some(Patch { hole, entry }))
585 }
586 }
587 }
588
589 fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
590 where
591 I: IntoIterator<Item = &'a Hir>,
592 {
593 let mut exprs = exprs.into_iter();
594 let Patch { mut hole, entry } = loop {
595 match exprs.next() {
596 None => return self.c_empty(),
597 Some(e) => {
598 if let Some(p) = self.c(e)? {
599 break p;
600 }
601 }
602 }
603 };
604 for e in exprs {
605 if let Some(p) = self.c(e)? {
606 self.fill(hole, p.entry);
607 hole = p.hole;
608 }
609 }
610 Ok(Some(Patch { hole, entry }))
611 }
612
613 fn c_alternate(&mut self, exprs: &[Hir]) -> ResultOrEmpty {
614 debug_assert!(
615 exprs.len() >= 2,
616 "alternates must have at least 2 exprs"
617 );
618
619 // Initial entry point is always the first split.
620 let first_split_entry = self.insts.len();
621
622 // Save up all of the holes from each alternate. They will all get
623 // patched to point to the same location.
624 let mut holes = vec![];
625
626 // true indicates that the hole is a split where we want to fill
627 // the second branch.
628 let mut prev_hole = (Hole::None, false);
629 for e in &exprs[0..exprs.len() - 1] {
630 if prev_hole.1 {
631 let next = self.insts.len();
632 self.fill_split(prev_hole.0, None, Some(next));
633 } else {
634 self.fill_to_next(prev_hole.0);
635 }
636 let split = self.push_split_hole();
637 if let Some(Patch { hole, entry }) = self.c(e)? {
638 holes.push(hole);
639 prev_hole = (self.fill_split(split, Some(entry), None), false);
640 } else {
641 let (split1, split2) = split.dup_one();
642 holes.push(split1);
643 prev_hole = (split2, true);
644 }
645 }
646 if let Some(Patch { hole, entry }) = self.c(&exprs[exprs.len() - 1])? {
647 holes.push(hole);
648 if prev_hole.1 {
649 self.fill_split(prev_hole.0, None, Some(entry));
650 } else {
651 self.fill(prev_hole.0, entry);
652 }
653 } else {
654 // We ignore prev_hole.1. When it's true, it means we have two
655 // empty branches both pushing prev_hole.0 into holes, so both
656 // branches will go to the same place anyway.
657 holes.push(prev_hole.0);
658 }
659 Ok(Some(Patch { hole: Hole::Many(holes), entry: first_split_entry }))
660 }
661
662 fn c_repeat(&mut self, rep: &hir::Repetition) -> ResultOrEmpty {
663 match (rep.min, rep.max) {
664 (0, Some(1)) => self.c_repeat_zero_or_one(&rep.sub, rep.greedy),
665 (0, None) => self.c_repeat_zero_or_more(&rep.sub, rep.greedy),
666 (1, None) => self.c_repeat_one_or_more(&rep.sub, rep.greedy),
667 (min, None) => {
668 self.c_repeat_range_min_or_more(&rep.sub, rep.greedy, min)
669 }
670 (min, Some(max)) => {
671 self.c_repeat_range(&rep.sub, rep.greedy, min, max)
672 }
673 }
674 }
675
676 fn c_repeat_zero_or_one(
677 &mut self,
678 expr: &Hir,
679 greedy: bool,
680 ) -> ResultOrEmpty {
681 let split_entry = self.insts.len();
682 let split = self.push_split_hole();
683 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
684 Some(p) => p,
685 None => return self.pop_split_hole(),
686 };
687 let split_hole = if greedy {
688 self.fill_split(split, Some(entry_rep), None)
689 } else {
690 self.fill_split(split, None, Some(entry_rep))
691 };
692 let holes = vec![hole_rep, split_hole];
693 Ok(Some(Patch { hole: Hole::Many(holes), entry: split_entry }))
694 }
695
696 fn c_repeat_zero_or_more(
697 &mut self,
698 expr: &Hir,
699 greedy: bool,
700 ) -> ResultOrEmpty {
701 let split_entry = self.insts.len();
702 let split = self.push_split_hole();
703 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
704 Some(p) => p,
705 None => return self.pop_split_hole(),
706 };
707
708 self.fill(hole_rep, split_entry);
709 let split_hole = if greedy {
710 self.fill_split(split, Some(entry_rep), None)
711 } else {
712 self.fill_split(split, None, Some(entry_rep))
713 };
714 Ok(Some(Patch { hole: split_hole, entry: split_entry }))
715 }
716
717 fn c_repeat_one_or_more(
718 &mut self,
719 expr: &Hir,
720 greedy: bool,
721 ) -> ResultOrEmpty {
722 let Patch { hole: hole_rep, entry: entry_rep } = match self.c(expr)? {
723 Some(p) => p,
724 None => return Ok(None),
725 };
726 self.fill_to_next(hole_rep);
727 let split = self.push_split_hole();
728
729 let split_hole = if greedy {
730 self.fill_split(split, Some(entry_rep), None)
731 } else {
732 self.fill_split(split, None, Some(entry_rep))
733 };
734 Ok(Some(Patch { hole: split_hole, entry: entry_rep }))
735 }
736
737 fn c_repeat_range_min_or_more(
738 &mut self,
739 expr: &Hir,
740 greedy: bool,
741 min: u32,
742 ) -> ResultOrEmpty {
743 let min = u32_to_usize(min);
744 // Using next_inst() is ok, because we can't return it (concat would
745 // have to return Some(_) while c_repeat_range_min_or_more returns
746 // None).
747 let patch_concat = self
748 .c_concat(iter::repeat(expr).take(min))?
749 .unwrap_or_else(|| self.next_inst());
750 if let Some(patch_rep) = self.c_repeat_zero_or_more(expr, greedy)? {
751 self.fill(patch_concat.hole, patch_rep.entry);
752 Ok(Some(Patch { hole: patch_rep.hole, entry: patch_concat.entry }))
753 } else {
754 Ok(None)
755 }
756 }
757
758 fn c_repeat_range(
759 &mut self,
760 expr: &Hir,
761 greedy: bool,
762 min: u32,
763 max: u32,
764 ) -> ResultOrEmpty {
765 let (min, max) = (u32_to_usize(min), u32_to_usize(max));
766 debug_assert!(min <= max);
767 let patch_concat = self.c_concat(iter::repeat(expr).take(min))?;
768 if min == max {
769 return Ok(patch_concat);
770 }
771 // Same reasoning as in c_repeat_range_min_or_more (we know that min <
772 // max at this point).
773 let patch_concat = patch_concat.unwrap_or_else(|| self.next_inst());
774 let initial_entry = patch_concat.entry;
775 // It is much simpler to compile, e.g., `a{2,5}` as:
776 //
777 // aaa?a?a?
778 //
779 // But you end up with a sequence of instructions like this:
780 //
781 // 0: 'a'
782 // 1: 'a',
783 // 2: split(3, 4)
784 // 3: 'a'
785 // 4: split(5, 6)
786 // 5: 'a'
787 // 6: split(7, 8)
788 // 7: 'a'
789 // 8: MATCH
790 //
791 // This is *incredibly* inefficient because the splits end
792 // up forming a chain, which has to be resolved everything a
793 // transition is followed.
794 let mut holes = vec![];
795 let mut prev_hole = patch_concat.hole;
796 for _ in min..max {
797 self.fill_to_next(prev_hole);
798 let split = self.push_split_hole();
799 let Patch { hole, entry } = match self.c(expr)? {
800 Some(p) => p,
801 None => return self.pop_split_hole(),
802 };
803 prev_hole = hole;
804 if greedy {
805 holes.push(self.fill_split(split, Some(entry), None));
806 } else {
807 holes.push(self.fill_split(split, None, Some(entry)));
808 }
809 }
810 holes.push(prev_hole);
811 Ok(Some(Patch { hole: Hole::Many(holes), entry: initial_entry }))
812 }
813
814 /// Can be used as a default value for the c_* functions when the call to
815 /// c_function is followed by inserting at least one instruction that is
816 /// always executed after the ones written by the c* function.
817 fn next_inst(&self) -> Patch {
818 Patch { hole: Hole::None, entry: self.insts.len() }
819 }
820
821 fn fill(&mut self, hole: Hole, goto: InstPtr) {
822 match hole {
823 Hole::None => {}
824 Hole::One(pc) => {
825 self.insts[pc].fill(goto);
826 }
827 Hole::Many(holes) => {
828 for hole in holes {
829 self.fill(hole, goto);
830 }
831 }
832 }
833 }
834
835 fn fill_to_next(&mut self, hole: Hole) {
836 let next = self.insts.len();
837 self.fill(hole, next);
838 }
839
840 fn fill_split(
841 &mut self,
842 hole: Hole,
843 goto1: Option<InstPtr>,
844 goto2: Option<InstPtr>,
845 ) -> Hole {
846 match hole {
847 Hole::None => Hole::None,
848 Hole::One(pc) => match (goto1, goto2) {
849 (Some(goto1), Some(goto2)) => {
850 self.insts[pc].fill_split(goto1, goto2);
851 Hole::None
852 }
853 (Some(goto1), None) => {
854 self.insts[pc].half_fill_split_goto1(goto1);
855 Hole::One(pc)
856 }
857 (None, Some(goto2)) => {
858 self.insts[pc].half_fill_split_goto2(goto2);
859 Hole::One(pc)
860 }
861 (None, None) => unreachable!(
862 "at least one of the split \
863 holes must be filled"
864 ),
865 },
866 Hole::Many(holes) => {
867 let mut new_holes = vec![];
868 for hole in holes {
869 new_holes.push(self.fill_split(hole, goto1, goto2));
870 }
871 if new_holes.is_empty() {
872 Hole::None
873 } else if new_holes.len() == 1 {
874 new_holes.pop().unwrap()
875 } else {
876 Hole::Many(new_holes)
877 }
878 }
879 }
880 }
881
882 fn push_compiled(&mut self, inst: Inst) {
883 self.insts.push(MaybeInst::Compiled(inst));
884 }
885
886 fn push_hole(&mut self, inst: InstHole) -> Hole {
887 let hole = self.insts.len();
888 self.insts.push(MaybeInst::Uncompiled(inst));
889 Hole::One(hole)
890 }
891
892 fn push_split_hole(&mut self) -> Hole {
893 let hole = self.insts.len();
894 self.insts.push(MaybeInst::Split);
895 Hole::One(hole)
896 }
897
898 fn pop_split_hole(&mut self) -> ResultOrEmpty {
899 self.insts.pop();
900 Ok(None)
901 }
902
903 fn check_size(&self) -> result::Result<(), Error> {
904 use std::mem::size_of;
905
906 let size =
907 self.extra_inst_bytes + (self.insts.len() * size_of::<Inst>());
908 if size > self.size_limit {
909 Err(Error::CompiledTooBig(self.size_limit))
910 } else {
911 Ok(())
912 }
913 }
914}
915
916#[derive(Debug)]
917enum Hole {
918 None,
919 One(InstPtr),
920 Many(Vec<Hole>),
921}
922
923impl Hole {
924 fn dup_one(self) -> (Self, Self) {
925 match self {
926 Hole::One(pc: usize) => (Hole::One(pc), Hole::One(pc)),
927 Hole::None | Hole::Many(_) => {
928 unreachable!("must be called on single hole")
929 }
930 }
931 }
932}
933
934#[derive(Clone, Debug)]
935enum MaybeInst {
936 Compiled(Inst),
937 Uncompiled(InstHole),
938 Split,
939 Split1(InstPtr),
940 Split2(InstPtr),
941}
942
943impl MaybeInst {
944 fn fill(&mut self, goto: InstPtr) {
945 let maybeinst = match *self {
946 MaybeInst::Split => MaybeInst::Split1(goto),
947 MaybeInst::Uncompiled(ref inst) => {
948 MaybeInst::Compiled(inst.fill(goto))
949 }
950 MaybeInst::Split1(goto1) => {
951 MaybeInst::Compiled(Inst::Split(InstSplit {
952 goto1,
953 goto2: goto,
954 }))
955 }
956 MaybeInst::Split2(goto2) => {
957 MaybeInst::Compiled(Inst::Split(InstSplit {
958 goto1: goto,
959 goto2,
960 }))
961 }
962 _ => unreachable!(
963 "not all instructions were compiled! \
964 found uncompiled instruction: {:?}",
965 self
966 ),
967 };
968 *self = maybeinst;
969 }
970
971 fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
972 let filled = match *self {
973 MaybeInst::Split => Inst::Split(InstSplit { goto1, goto2 }),
974 _ => unreachable!(
975 "must be called on Split instruction, \
976 instead it was called on: {:?}",
977 self
978 ),
979 };
980 *self = MaybeInst::Compiled(filled);
981 }
982
983 fn half_fill_split_goto1(&mut self, goto1: InstPtr) {
984 let half_filled = match *self {
985 MaybeInst::Split => goto1,
986 _ => unreachable!(
987 "must be called on Split instruction, \
988 instead it was called on: {:?}",
989 self
990 ),
991 };
992 *self = MaybeInst::Split1(half_filled);
993 }
994
995 fn half_fill_split_goto2(&mut self, goto2: InstPtr) {
996 let half_filled = match *self {
997 MaybeInst::Split => goto2,
998 _ => unreachable!(
999 "must be called on Split instruction, \
1000 instead it was called on: {:?}",
1001 self
1002 ),
1003 };
1004 *self = MaybeInst::Split2(half_filled);
1005 }
1006
1007 fn unwrap(self) -> Inst {
1008 match self {
1009 MaybeInst::Compiled(inst) => inst,
1010 _ => unreachable!(
1011 "must be called on a compiled instruction, \
1012 instead it was called on: {:?}",
1013 self
1014 ),
1015 }
1016 }
1017}
1018
1019#[derive(Clone, Debug)]
1020enum InstHole {
1021 Save { slot: usize },
1022 EmptyLook { look: EmptyLook },
1023 Char { c: char },
1024 Ranges { ranges: Vec<(char, char)> },
1025 Bytes { start: u8, end: u8 },
1026}
1027
1028impl InstHole {
1029 fn fill(&self, goto: InstPtr) -> Inst {
1030 match *self {
1031 InstHole::Save { slot: usize } => Inst::Save(InstSave { goto, slot }),
1032 InstHole::EmptyLook { look: EmptyLook } => {
1033 Inst::EmptyLook(InstEmptyLook { goto, look })
1034 }
1035 InstHole::Char { c: char } => Inst::Char(InstChar { goto, c }),
1036 InstHole::Ranges { ref ranges: &Vec<(char, char)> } => Inst::Ranges(InstRanges {
1037 goto,
1038 ranges: ranges.clone().into_boxed_slice(),
1039 }),
1040 InstHole::Bytes { start: u8, end: u8 } => {
1041 Inst::Bytes(InstBytes { goto, start, end })
1042 }
1043 }
1044 }
1045}
1046
1047struct CompileClass<'a, 'b> {
1048 c: &'a mut Compiler,
1049 ranges: &'b [hir::ClassUnicodeRange],
1050}
1051
1052impl<'a, 'b> CompileClass<'a, 'b> {
1053 fn compile(mut self) -> Result {
1054 let mut holes = vec![];
1055 let mut initial_entry = None;
1056 let mut last_split = Hole::None;
1057 let mut utf8_seqs = self.c.utf8_seqs.take().unwrap();
1058 self.c.suffix_cache.clear();
1059
1060 for (i, range) in self.ranges.iter().enumerate() {
1061 let is_last_range = i + 1 == self.ranges.len();
1062 utf8_seqs.reset(range.start(), range.end());
1063 let mut it = (&mut utf8_seqs).peekable();
1064 loop {
1065 let utf8_seq = match it.next() {
1066 None => break,
1067 Some(utf8_seq) => utf8_seq,
1068 };
1069 if is_last_range && it.peek().is_none() {
1070 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1071 holes.push(hole);
1072 self.c.fill(last_split, entry);
1073 last_split = Hole::None;
1074 if initial_entry.is_none() {
1075 initial_entry = Some(entry);
1076 }
1077 } else {
1078 if initial_entry.is_none() {
1079 initial_entry = Some(self.c.insts.len());
1080 }
1081 self.c.fill_to_next(last_split);
1082 last_split = self.c.push_split_hole();
1083 let Patch { hole, entry } = self.c_utf8_seq(&utf8_seq)?;
1084 holes.push(hole);
1085 last_split =
1086 self.c.fill_split(last_split, Some(entry), None);
1087 }
1088 }
1089 }
1090 self.c.utf8_seqs = Some(utf8_seqs);
1091 Ok(Patch { hole: Hole::Many(holes), entry: initial_entry.unwrap() })
1092 }
1093
1094 fn c_utf8_seq(&mut self, seq: &Utf8Sequence) -> Result {
1095 if self.c.compiled.is_reverse {
1096 self.c_utf8_seq_(seq)
1097 } else {
1098 self.c_utf8_seq_(seq.into_iter().rev())
1099 }
1100 }
1101
1102 fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
1103 where
1104 I: IntoIterator<Item = &'r Utf8Range>,
1105 {
1106 // The initial instruction for each UTF-8 sequence should be the same.
1107 let mut from_inst = ::std::usize::MAX;
1108 let mut last_hole = Hole::None;
1109 for byte_range in seq {
1110 let key = SuffixCacheKey {
1111 from_inst,
1112 start: byte_range.start,
1113 end: byte_range.end,
1114 };
1115 {
1116 let pc = self.c.insts.len();
1117 if let Some(cached_pc) = self.c.suffix_cache.get(key, pc) {
1118 from_inst = cached_pc;
1119 continue;
1120 }
1121 }
1122 self.c.byte_classes.set_range(byte_range.start, byte_range.end);
1123 if from_inst == ::std::usize::MAX {
1124 last_hole = self.c.push_hole(InstHole::Bytes {
1125 start: byte_range.start,
1126 end: byte_range.end,
1127 });
1128 } else {
1129 self.c.push_compiled(Inst::Bytes(InstBytes {
1130 goto: from_inst,
1131 start: byte_range.start,
1132 end: byte_range.end,
1133 }));
1134 }
1135 from_inst = self.c.insts.len().checked_sub(1).unwrap();
1136 debug_assert!(from_inst < ::std::usize::MAX);
1137 }
1138 debug_assert!(from_inst < ::std::usize::MAX);
1139 Ok(Patch { hole: last_hole, entry: from_inst })
1140 }
1141}
1142
1143/// `SuffixCache` is a simple bounded hash map for caching suffix entries in
1144/// UTF-8 automata. For example, consider the Unicode range \u{0}-\u{FFFF}.
1145/// The set of byte ranges looks like this:
1146///
1147/// [0-7F]
1148/// [C2-DF][80-BF]
1149/// [E0][A0-BF][80-BF]
1150/// [E1-EC][80-BF][80-BF]
1151/// [ED][80-9F][80-BF]
1152/// [EE-EF][80-BF][80-BF]
1153///
1154/// Each line above translates to one alternate in the compiled regex program.
1155/// However, all but one of the alternates end in the same suffix, which is
1156/// a waste of an instruction. The suffix cache facilitates reusing them across
1157/// alternates.
1158///
1159/// Note that a HashMap could be trivially used for this, but we don't need its
1160/// overhead. Some small bounded space (LRU style) is more than enough.
1161///
1162/// This uses similar idea to [`SparseSet`](../sparse/struct.SparseSet.html),
1163/// except it uses hashes as original indices and then compares full keys for
1164/// validation against `dense` array.
1165#[derive(Debug)]
1166struct SuffixCache {
1167 sparse: Box<[usize]>,
1168 dense: Vec<SuffixCacheEntry>,
1169}
1170
1171#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1172struct SuffixCacheEntry {
1173 key: SuffixCacheKey,
1174 pc: InstPtr,
1175}
1176
1177#[derive(Clone, Copy, Debug, Default, Eq, Hash, PartialEq)]
1178struct SuffixCacheKey {
1179 from_inst: InstPtr,
1180 start: u8,
1181 end: u8,
1182}
1183
1184impl SuffixCache {
1185 fn new(size: usize) -> Self {
1186 SuffixCache {
1187 sparse: vec![0usize; size].into(),
1188 dense: Vec::with_capacity(size),
1189 }
1190 }
1191
1192 fn get(&mut self, key: SuffixCacheKey, pc: InstPtr) -> Option<InstPtr> {
1193 let hash = self.hash(&key);
1194 let pos = &mut self.sparse[hash];
1195 if let Some(entry) = self.dense.get(*pos) {
1196 if entry.key == key {
1197 return Some(entry.pc);
1198 }
1199 }
1200 *pos = self.dense.len();
1201 self.dense.push(SuffixCacheEntry { key, pc });
1202 None
1203 }
1204
1205 fn clear(&mut self) {
1206 self.dense.clear();
1207 }
1208
1209 fn hash(&self, suffix: &SuffixCacheKey) -> usize {
1210 // Basic FNV-1a hash as described:
1211 // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
1212 const FNV_PRIME: u64 = 1_099_511_628_211;
1213 let mut h = 14_695_981_039_346_656_037;
1214 h = (h ^ (suffix.from_inst as u64)).wrapping_mul(FNV_PRIME);
1215 h = (h ^ (suffix.start as u64)).wrapping_mul(FNV_PRIME);
1216 h = (h ^ (suffix.end as u64)).wrapping_mul(FNV_PRIME);
1217 (h as usize) % self.sparse.len()
1218 }
1219}
1220
1221struct ByteClassSet([bool; 256]);
1222
1223impl ByteClassSet {
1224 fn new() -> Self {
1225 ByteClassSet([false; 256])
1226 }
1227
1228 fn set_range(&mut self, start: u8, end: u8) {
1229 debug_assert!(start <= end);
1230 if start > 0 {
1231 self.0[start as usize - 1] = true;
1232 }
1233 self.0[end as usize] = true;
1234 }
1235
1236 fn set_word_boundary(&mut self) {
1237 // We need to mark all ranges of bytes whose pairs result in
1238 // evaluating \b differently.
1239 let iswb = is_word_byte;
1240 let mut b1: u16 = 0;
1241 let mut b2: u16;
1242 while b1 <= 255 {
1243 b2 = b1 + 1;
1244 while b2 <= 255 && iswb(b1 as u8) == iswb(b2 as u8) {
1245 b2 += 1;
1246 }
1247 self.set_range(b1 as u8, (b2 - 1) as u8);
1248 b1 = b2;
1249 }
1250 }
1251
1252 fn byte_classes(&self) -> Vec<u8> {
1253 // N.B. If you're debugging the DFA, it's useful to simply return
1254 // `(0..256).collect()`, which effectively removes the byte classes
1255 // and makes the transitions easier to read.
1256 // (0usize..256).map(|x| x as u8).collect()
1257 let mut byte_classes = vec![0; 256];
1258 let mut class = 0u8;
1259 let mut i = 0;
1260 loop {
1261 byte_classes[i] = class as u8;
1262 if i >= 255 {
1263 break;
1264 }
1265 if self.0[i] {
1266 class = class.checked_add(1).unwrap();
1267 }
1268 i += 1;
1269 }
1270 byte_classes
1271 }
1272}
1273
1274impl fmt::Debug for ByteClassSet {
1275 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1276 f.debug_tuple(name:"ByteClassSet").field(&&self.0[..]).finish()
1277 }
1278}
1279
1280fn u32_to_usize(n: u32) -> usize {
1281 // In case usize is less than 32 bits, we need to guard against overflow.
1282 // On most platforms this compiles to nothing.
1283 // TODO Use `std::convert::TryFrom` once it's stable.
1284 if (n as u64) > (::std::usize::MAX as u64) {
1285 panic!("BUG: {} is too big to be pointer sized", n)
1286 }
1287 n as usize
1288}
1289
1290#[cfg(test)]
1291mod tests {
1292 use super::ByteClassSet;
1293
1294 #[test]
1295 fn byte_classes() {
1296 let mut set = ByteClassSet::new();
1297 set.set_range(b'a', b'z');
1298 let classes = set.byte_classes();
1299 assert_eq!(classes[0], 0);
1300 assert_eq!(classes[1], 0);
1301 assert_eq!(classes[2], 0);
1302 assert_eq!(classes[b'a' as usize - 1], 0);
1303 assert_eq!(classes[b'a' as usize], 1);
1304 assert_eq!(classes[b'm' as usize], 1);
1305 assert_eq!(classes[b'z' as usize], 1);
1306 assert_eq!(classes[b'z' as usize + 1], 2);
1307 assert_eq!(classes[254], 2);
1308 assert_eq!(classes[255], 2);
1309
1310 let mut set = ByteClassSet::new();
1311 set.set_range(0, 2);
1312 set.set_range(4, 6);
1313 let classes = set.byte_classes();
1314 assert_eq!(classes[0], 0);
1315 assert_eq!(classes[1], 0);
1316 assert_eq!(classes[2], 0);
1317 assert_eq!(classes[3], 1);
1318 assert_eq!(classes[4], 2);
1319 assert_eq!(classes[5], 2);
1320 assert_eq!(classes[6], 2);
1321 assert_eq!(classes[7], 3);
1322 assert_eq!(classes[255], 3);
1323 }
1324
1325 #[test]
1326 fn full_byte_classes() {
1327 let mut set = ByteClassSet::new();
1328 for i in 0..256u16 {
1329 set.set_range(i as u8, i as u8);
1330 }
1331 assert_eq!(set.byte_classes().len(), 256);
1332 }
1333}
1334