1// This module provides an NFA compiler using Thompson's construction
2// algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA
3// graph as output. The NFA graph is structured in a way that permits it to be
4// executed by a virtual machine and also used to efficiently build a DFA.
5//
6// The compiler deals with a slightly expanded set of NFA states that notably
7// includes an empty node that has exactly one epsilon transition to the next
8// state. In other words, it's a "goto" instruction if one views Thompson's NFA
9// as a set of bytecode instructions. These goto instructions are removed in
10// a subsequent phase before returning the NFA to the caller. The purpose of
11// these empty nodes is that they make the construction algorithm substantially
12// simpler to implement. We remove them before returning to the caller because
13// they can represent substantial overhead when traversing the NFA graph
14// (either while searching using the NFA directly or while building a DFA).
15//
16// In the future, it would be nice to provide a Glushkov compiler as well,
17// as it would work well as a bit-parallel NFA for smaller regexes. But
18// the Thompson construction is one I'm more familiar with and seems more
19// straight-forward to deal with when it comes to large Unicode character
20// classes.
21//
22// Internally, the compiler uses interior mutability to improve composition
23// in the face of the borrow checker. In particular, we'd really like to be
24// able to write things like this:
25//
26// self.c_concat(exprs.iter().map(|e| self.c(e)))
27//
28// Which elegantly uses iterators to build up a sequence of compiled regex
29// sub-expressions and then hands it off to the concatenating compiler
30// routine. Without interior mutability, the borrow checker won't let us
31// borrow `self` mutably both inside and outside the closure at the same
32// time.
33
34use std::cell::RefCell;
35use std::mem;
36
37use regex_syntax::hir::{self, Hir, HirKind};
38use regex_syntax::utf8::{Utf8Range, Utf8Sequences};
39
40use classes::ByteClassSet;
41use error::{Error, Result};
42use nfa::map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap};
43use nfa::range_trie::RangeTrie;
44use nfa::{State, StateID, Transition, NFA};
45
46/// Config knobs for the NFA compiler. See the builder's methods for more
47/// docs on each one.
48#[derive(Clone, Copy, Debug)]
49struct Config {
50 anchored: bool,
51 allow_invalid_utf8: bool,
52 reverse: bool,
53 shrink: bool,
54}
55
56impl Default for Config {
57 fn default() -> Config {
58 Config {
59 anchored: false,
60 allow_invalid_utf8: false,
61 reverse: false,
62 shrink: true,
63 }
64 }
65}
66
67/// A builder for compiling an NFA.
68#[derive(Clone, Debug)]
69pub struct Builder {
70 config: Config,
71}
72
73impl Builder {
74 /// Create a new NFA builder with its default configuration.
75 pub fn new() -> Builder {
76 Builder { config: Config::default() }
77 }
78
79 /// Compile the given high level intermediate representation of a regular
80 /// expression into an NFA.
81 ///
82 /// If there was a problem building the NFA, then an error is returned.
83 /// For example, if the regex uses unsupported features (such as zero-width
84 /// assertions), then an error is returned.
85 pub fn build(&self, expr: &Hir) -> Result<NFA> {
86 let mut nfa = NFA::always_match();
87 self.build_with(&mut Compiler::new(), &mut nfa, expr)?;
88 Ok(nfa)
89 }
90
91 /// Compile the given high level intermediate representation of a regular
92 /// expression into the NFA given using the given compiler. Callers may
93 /// prefer this over `build` if they would like to reuse allocations while
94 /// compiling many regular expressions.
95 ///
96 /// On success, the given NFA is completely overwritten with the NFA
97 /// produced by the compiler.
98 ///
99 /// If there was a problem building the NFA, then an error is returned. For
100 /// example, if the regex uses unsupported features (such as zero-width
101 /// assertions), then an error is returned. When an error is returned,
102 /// the contents of `nfa` are unspecified and should not be relied upon.
103 /// However, it can still be reused in subsequent calls to this method.
104 pub fn build_with(
105 &self,
106 compiler: &mut Compiler,
107 nfa: &mut NFA,
108 expr: &Hir,
109 ) -> Result<()> {
110 compiler.clear();
111 compiler.configure(self.config);
112 compiler.compile(nfa, expr)
113 }
114
115 /// Set whether matching must be anchored at the beginning of the input.
116 ///
117 /// When enabled, a match must begin at the start of the input. When
118 /// disabled, the NFA will act as if the pattern started with a `.*?`,
119 /// which enables a match to appear anywhere.
120 ///
121 /// By default this is disabled.
122 pub fn anchored(&mut self, yes: bool) -> &mut Builder {
123 self.config.anchored = yes;
124 self
125 }
126
127 /// When enabled, the builder will permit the construction of an NFA that
128 /// may match invalid UTF-8.
129 ///
130 /// When disabled (the default), the builder is guaranteed to produce a
131 /// regex that will only ever match valid UTF-8 (otherwise, the builder
132 /// will return an error).
133 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder {
134 self.config.allow_invalid_utf8 = yes;
135 self
136 }
137
138 /// Reverse the NFA.
139 ///
140 /// A NFA reversal is performed by reversing all of the concatenated
141 /// sub-expressions in the original pattern, recursively. The resulting
142 /// NFA can be used to match the pattern starting from the end of a string
143 /// instead of the beginning of a string.
144 ///
145 /// Reversing the NFA is useful for building a reverse DFA, which is most
146 /// useful for finding the start of a match.
147 pub fn reverse(&mut self, yes: bool) -> &mut Builder {
148 self.config.reverse = yes;
149 self
150 }
151
152 /// Apply best effort heuristics to shrink the NFA at the expense of more
153 /// time/memory.
154 ///
155 /// This is enabled by default. Generally speaking, if one is using an NFA
156 /// to compile DFA, then the extra time used to shrink the NFA will be
157 /// more than made up for during DFA construction (potentially by a lot).
158 /// In other words, enabling this can substantially decrease the overall
159 /// amount of time it takes to build a DFA.
160 ///
161 /// The only reason to disable this if you want to compile an NFA and start
162 /// using it as quickly as possible without needing to build a DFA.
163 pub fn shrink(&mut self, yes: bool) -> &mut Builder {
164 self.config.shrink = yes;
165 self
166 }
167}
168
169/// A compiler that converts a regex abstract syntax to an NFA via Thompson's
170/// construction. Namely, this compiler permits epsilon transitions between
171/// states.
172///
173/// Users of this crate cannot use a compiler directly. Instead, all one can
174/// do is create one and use it via the
175/// [`Builder::build_with`](struct.Builder.html#method.build_with)
176/// method. This permits callers to reuse compilers in order to amortize
177/// allocations.
178#[derive(Clone, Debug)]
179pub struct Compiler {
180 /// The set of compiled NFA states. Once a state is compiled, it is
181 /// assigned a state ID equivalent to its index in this list. Subsequent
182 /// compilation can modify previous states by adding new transitions.
183 states: RefCell<Vec<CState>>,
184 /// The configuration from the builder.
185 config: Config,
186 /// State used for compiling character classes to UTF-8 byte automata.
187 /// State is not retained between character class compilations. This just
188 /// serves to amortize allocation to the extent possible.
189 utf8_state: RefCell<Utf8State>,
190 /// State used for arranging character classes in reverse into a trie.
191 trie_state: RefCell<RangeTrie>,
192 /// State used for caching common suffixes when compiling reverse UTF-8
193 /// automata (for Unicode character classes).
194 utf8_suffix: RefCell<Utf8SuffixMap>,
195 /// A map used to re-map state IDs when translating the compiler's internal
196 /// NFA state representation to the external NFA representation.
197 remap: RefCell<Vec<StateID>>,
198 /// A set of compiler internal state IDs that correspond to states that are
199 /// exclusively epsilon transitions, i.e., goto instructions, combined with
200 /// the state that they point to. This is used to record said states while
201 /// transforming the compiler's internal NFA representation to the external
202 /// form.
203 empties: RefCell<Vec<(StateID, StateID)>>,
204}
205
206/// A compiler intermediate state representation for an NFA that is only used
207/// during compilation. Once compilation is done, `CState`s are converted to
208/// `State`s, which have a much simpler representation.
209#[derive(Clone, Debug, Eq, PartialEq)]
210enum CState {
211 /// An empty state whose only purpose is to forward the automaton to
212 /// another state via en epsilon transition. These are useful during
213 /// compilation but are otherwise removed at the end.
214 Empty { next: StateID },
215 /// A state that only transitions to `next` if the current input byte is
216 /// in the range `[start, end]` (inclusive on both ends).
217 Range { range: Transition },
218 /// A state with possibly many transitions, represented in a sparse
219 /// fashion. Transitions are ordered lexicographically by input range.
220 /// As such, this may only be used when every transition has equal
221 /// priority. (In practice, this is only used for encoding large UTF-8
222 /// automata.)
223 Sparse { ranges: Vec<Transition> },
224 /// An alternation such that there exists an epsilon transition to all
225 /// states in `alternates`, where matches found via earlier transitions
226 /// are preferred over later transitions.
227 Union { alternates: Vec<StateID> },
228 /// An alternation such that there exists an epsilon transition to all
229 /// states in `alternates`, where matches found via later transitions
230 /// are preferred over earlier transitions.
231 ///
232 /// This "reverse" state exists for convenience during compilation that
233 /// permits easy construction of non-greedy combinations of NFA states.
234 /// At the end of compilation, Union and UnionReverse states are merged
235 /// into one Union type of state, where the latter has its epsilon
236 /// transitions reversed to reflect the priority inversion.
237 UnionReverse { alternates: Vec<StateID> },
238 /// A match state. There is exactly one such occurrence of this state in
239 /// an NFA.
240 Match,
241}
242
243/// A value that represents the result of compiling a sub-expression of a
244/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
245/// has an initial state at `start` and a final state at `end`.
246#[derive(Clone, Copy, Debug)]
247pub struct ThompsonRef {
248 start: StateID,
249 end: StateID,
250}
251
252impl Compiler {
253 /// Create a new compiler.
254 pub fn new() -> Compiler {
255 Compiler {
256 states: RefCell::new(vec![]),
257 config: Config::default(),
258 utf8_state: RefCell::new(Utf8State::new()),
259 trie_state: RefCell::new(RangeTrie::new()),
260 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
261 remap: RefCell::new(vec![]),
262 empties: RefCell::new(vec![]),
263 }
264 }
265
266 /// Clear any memory used by this compiler such that it is ready to compile
267 /// a new regex.
268 ///
269 /// It is preferrable to reuse a compiler if possible in order to reuse
270 /// allocations.
271 fn clear(&self) {
272 self.states.borrow_mut().clear();
273 // We don't need to clear anything else since they are cleared on
274 // their own and only when they are used.
275 }
276
277 /// Configure this compiler from the builder's knobs.
278 ///
279 /// The compiler is always reconfigured by the builder before using it to
280 /// build an NFA.
281 fn configure(&mut self, config: Config) {
282 self.config = config;
283 }
284
285 /// Convert the current intermediate NFA to its final compiled form.
286 fn compile(&self, nfa: &mut NFA, expr: &Hir) -> Result<()> {
287 nfa.anchored = self.config.anchored;
288
289 let mut start = self.add_empty();
290 if !nfa.anchored {
291 let compiled = if self.config.allow_invalid_utf8 {
292 self.c_unanchored_prefix_invalid_utf8()?
293 } else {
294 self.c_unanchored_prefix_valid_utf8()?
295 };
296 self.patch(start, compiled.start);
297 start = compiled.end;
298 }
299 let compiled = self.c(&expr)?;
300 let match_id = self.add_match();
301 self.patch(start, compiled.start);
302 self.patch(compiled.end, match_id);
303 self.finish(nfa);
304 Ok(())
305 }
306
307 /// Finishes the compilation process and populates the provide NFA with
308 /// the final graph.
309 fn finish(&self, nfa: &mut NFA) {
310 let mut bstates = self.states.borrow_mut();
311 let mut remap = self.remap.borrow_mut();
312 remap.resize(bstates.len(), 0);
313 let mut empties = self.empties.borrow_mut();
314 empties.clear();
315
316 // We don't reuse allocations here becuase this is what we're
317 // returning.
318 nfa.states.clear();
319 let mut byteset = ByteClassSet::new();
320
321 // The idea here is to convert our intermediate states to their final
322 // form. The only real complexity here is the process of converting
323 // transitions, which are expressed in terms of state IDs. The new
324 // set of states will be smaller because of partial epsilon removal,
325 // so the state IDs will not be the same.
326 for (id, bstate) in bstates.iter_mut().enumerate() {
327 match *bstate {
328 CState::Empty { next } => {
329 // Since we're removing empty states, we need to handle
330 // them later since we don't yet know which new state this
331 // empty state will be mapped to.
332 empties.push((id, next));
333 }
334 CState::Range { ref range } => {
335 remap[id] = nfa.states.len();
336 byteset.set_range(range.start, range.end);
337 nfa.states.push(State::Range { range: range.clone() });
338 }
339 CState::Sparse { ref mut ranges } => {
340 remap[id] = nfa.states.len();
341
342 let ranges = mem::replace(ranges, vec![]);
343 for r in &ranges {
344 byteset.set_range(r.start, r.end);
345 }
346 nfa.states.push(State::Sparse {
347 ranges: ranges.into_boxed_slice(),
348 });
349 }
350 CState::Union { ref mut alternates } => {
351 remap[id] = nfa.states.len();
352
353 let alternates = mem::replace(alternates, vec![]);
354 nfa.states.push(State::Union {
355 alternates: alternates.into_boxed_slice(),
356 });
357 }
358 CState::UnionReverse { ref mut alternates } => {
359 remap[id] = nfa.states.len();
360
361 let mut alternates = mem::replace(alternates, vec![]);
362 alternates.reverse();
363 nfa.states.push(State::Union {
364 alternates: alternates.into_boxed_slice(),
365 });
366 }
367 CState::Match => {
368 remap[id] = nfa.states.len();
369 nfa.states.push(State::Match);
370 }
371 }
372 }
373 for &(empty_id, mut empty_next) in empties.iter() {
374 // empty states can point to other empty states, forming a chain.
375 // So we must follow the chain until the end, which must end at
376 // a non-empty state, and therefore, a state that is correctly
377 // remapped. We are guaranteed to terminate because our compiler
378 // never builds a loop among empty states.
379 while let CState::Empty { next } = bstates[empty_next] {
380 empty_next = next;
381 }
382 remap[empty_id] = remap[empty_next];
383 }
384 for state in &mut nfa.states {
385 state.remap(&remap);
386 }
387 // The compiler always begins the NFA at the first state.
388 nfa.start = remap[0];
389 nfa.byte_classes = byteset.byte_classes();
390 }
391
392 fn c(&self, expr: &Hir) -> Result<ThompsonRef> {
393 match *expr.kind() {
394 HirKind::Empty => {
395 let id = self.add_empty();
396 Ok(ThompsonRef { start: id, end: id })
397 }
398 HirKind::Literal(hir::Literal::Unicode(ch)) => {
399 let mut buf = [0; 4];
400 let it = ch
401 .encode_utf8(&mut buf)
402 .as_bytes()
403 .iter()
404 .map(|&b| Ok(self.c_range(b, b)));
405 self.c_concat(it)
406 }
407 HirKind::Literal(hir::Literal::Byte(b)) => Ok(self.c_range(b, b)),
408 HirKind::Class(hir::Class::Bytes(ref cls)) => {
409 self.c_byte_class(cls)
410 }
411 HirKind::Class(hir::Class::Unicode(ref cls)) => {
412 self.c_unicode_class(cls)
413 }
414 HirKind::Repetition(ref rep) => self.c_repetition(rep),
415 HirKind::Group(ref group) => self.c(&*group.hir),
416 HirKind::Concat(ref exprs) => {
417 self.c_concat(exprs.iter().map(|e| self.c(e)))
418 }
419 HirKind::Alternation(ref exprs) => {
420 self.c_alternation(exprs.iter().map(|e| self.c(e)))
421 }
422 HirKind::Anchor(_) => Err(Error::unsupported_anchor()),
423 HirKind::WordBoundary(_) => Err(Error::unsupported_word()),
424 }
425 }
426
427 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef>
428 where
429 I: DoubleEndedIterator<Item = Result<ThompsonRef>>,
430 {
431 let first =
432 if self.config.reverse { it.next_back() } else { it.next() };
433 let ThompsonRef { start, mut end } = match first {
434 Some(result) => result?,
435 None => return Ok(self.c_empty()),
436 };
437 loop {
438 let next =
439 if self.config.reverse { it.next_back() } else { it.next() };
440 let compiled = match next {
441 Some(result) => result?,
442 None => break,
443 };
444 self.patch(end, compiled.start);
445 end = compiled.end;
446 }
447 Ok(ThompsonRef { start, end })
448 }
449
450 fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef>
451 where
452 I: Iterator<Item = Result<ThompsonRef>>,
453 {
454 let first = it.next().expect("alternations must be non-empty")?;
455 let second = match it.next() {
456 None => return Ok(first),
457 Some(result) => result?,
458 };
459
460 let union = self.add_union();
461 let end = self.add_empty();
462 self.patch(union, first.start);
463 self.patch(first.end, end);
464 self.patch(union, second.start);
465 self.patch(second.end, end);
466 for result in it {
467 let compiled = result?;
468 self.patch(union, compiled.start);
469 self.patch(compiled.end, end);
470 }
471 Ok(ThompsonRef { start: union, end })
472 }
473
474 fn c_repetition(&self, rep: &hir::Repetition) -> Result<ThompsonRef> {
475 match rep.kind {
476 hir::RepetitionKind::ZeroOrOne => {
477 self.c_zero_or_one(&rep.hir, rep.greedy)
478 }
479 hir::RepetitionKind::ZeroOrMore => {
480 self.c_at_least(&rep.hir, rep.greedy, 0)
481 }
482 hir::RepetitionKind::OneOrMore => {
483 self.c_at_least(&rep.hir, rep.greedy, 1)
484 }
485 hir::RepetitionKind::Range(ref rng) => match *rng {
486 hir::RepetitionRange::Exactly(count) => {
487 self.c_exactly(&rep.hir, count)
488 }
489 hir::RepetitionRange::AtLeast(m) => {
490 self.c_at_least(&rep.hir, rep.greedy, m)
491 }
492 hir::RepetitionRange::Bounded(min, max) => {
493 self.c_bounded(&rep.hir, rep.greedy, min, max)
494 }
495 },
496 }
497 }
498
499 fn c_bounded(
500 &self,
501 expr: &Hir,
502 greedy: bool,
503 min: u32,
504 max: u32,
505 ) -> Result<ThompsonRef> {
506 let prefix = self.c_exactly(expr, min)?;
507 if min == max {
508 return Ok(prefix);
509 }
510
511 // It is tempting here to compile the rest here as a concatenation
512 // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
513 // were `aaa?a?a?`. The problem here is that it leads to this program:
514 //
515 // >000000: 61 => 01
516 // 000001: 61 => 02
517 // 000002: alt(03, 04)
518 // 000003: 61 => 04
519 // 000004: alt(05, 06)
520 // 000005: 61 => 06
521 // 000006: alt(07, 08)
522 // 000007: 61 => 08
523 // 000008: MATCH
524 //
525 // And effectively, once you hit state 2, the epsilon closure will
526 // include states 3, 5, 5, 6, 7 and 8, which is quite a bit. It is
527 // better to instead compile it like so:
528 //
529 // >000000: 61 => 01
530 // 000001: 61 => 02
531 // 000002: alt(03, 08)
532 // 000003: 61 => 04
533 // 000004: alt(05, 08)
534 // 000005: 61 => 06
535 // 000006: alt(07, 08)
536 // 000007: 61 => 08
537 // 000008: MATCH
538 //
539 // So that the epsilon closure of state 2 is now just 3 and 8.
540 let empty = self.add_empty();
541 let mut prev_end = prefix.end;
542 for _ in min..max {
543 let union = if greedy {
544 self.add_union()
545 } else {
546 self.add_reverse_union()
547 };
548 let compiled = self.c(expr)?;
549 self.patch(prev_end, union);
550 self.patch(union, compiled.start);
551 self.patch(union, empty);
552 prev_end = compiled.end;
553 }
554 self.patch(prev_end, empty);
555 Ok(ThompsonRef { start: prefix.start, end: empty })
556 }
557
558 fn c_at_least(
559 &self,
560 expr: &Hir,
561 greedy: bool,
562 n: u32,
563 ) -> Result<ThompsonRef> {
564 if n == 0 {
565 let union = if greedy {
566 self.add_union()
567 } else {
568 self.add_reverse_union()
569 };
570 let compiled = self.c(expr)?;
571 self.patch(union, compiled.start);
572 self.patch(compiled.end, union);
573 Ok(ThompsonRef { start: union, end: union })
574 } else if n == 1 {
575 let compiled = self.c(expr)?;
576 let union = if greedy {
577 self.add_union()
578 } else {
579 self.add_reverse_union()
580 };
581 self.patch(compiled.end, union);
582 self.patch(union, compiled.start);
583 Ok(ThompsonRef { start: compiled.start, end: union })
584 } else {
585 let prefix = self.c_exactly(expr, n - 1)?;
586 let last = self.c(expr)?;
587 let union = if greedy {
588 self.add_union()
589 } else {
590 self.add_reverse_union()
591 };
592 self.patch(prefix.end, last.start);
593 self.patch(last.end, union);
594 self.patch(union, last.start);
595 Ok(ThompsonRef { start: prefix.start, end: union })
596 }
597 }
598
599 fn c_zero_or_one(&self, expr: &Hir, greedy: bool) -> Result<ThompsonRef> {
600 let union =
601 if greedy { self.add_union() } else { self.add_reverse_union() };
602 let compiled = self.c(expr)?;
603 let empty = self.add_empty();
604 self.patch(union, compiled.start);
605 self.patch(union, empty);
606 self.patch(compiled.end, empty);
607 Ok(ThompsonRef { start: union, end: empty })
608 }
609
610 fn c_exactly(&self, expr: &Hir, n: u32) -> Result<ThompsonRef> {
611 let it = (0..n).map(|_| self.c(expr));
612 self.c_concat(it)
613 }
614
615 fn c_byte_class(&self, cls: &hir::ClassBytes) -> Result<ThompsonRef> {
616 let end = self.add_empty();
617 let mut trans = Vec::with_capacity(cls.ranges().len());
618 for r in cls.iter() {
619 trans.push(Transition {
620 start: r.start(),
621 end: r.end(),
622 next: end,
623 });
624 }
625 Ok(ThompsonRef { start: self.add_sparse(trans), end })
626 }
627
628 fn c_unicode_class(&self, cls: &hir::ClassUnicode) -> Result<ThompsonRef> {
629 // If all we have are ASCII ranges wrapped in a Unicode package, then
630 // there is zero reason to bring out the big guns. We can fit all ASCII
631 // ranges within a single sparse transition.
632 if cls.is_all_ascii() {
633 let end = self.add_empty();
634 let mut trans = Vec::with_capacity(cls.ranges().len());
635 for r in cls.iter() {
636 assert!(r.start() <= '\x7F');
637 assert!(r.end() <= '\x7F');
638 trans.push(Transition {
639 start: r.start() as u8,
640 end: r.end() as u8,
641 next: end,
642 });
643 }
644 Ok(ThompsonRef { start: self.add_sparse(trans), end })
645 } else if self.config.reverse {
646 if !self.config.shrink {
647 // When we don't want to spend the extra time shrinking, we
648 // compile the UTF-8 automaton in reverse using something like
649 // the "naive" approach, but will attempt to re-use common
650 // suffixes.
651 self.c_unicode_class_reverse_with_suffix(cls)
652 } else {
653 // When we want to shrink our NFA for reverse UTF-8 automata,
654 // we cannot feed UTF-8 sequences directly to the UTF-8
655 // compiler, since the UTF-8 compiler requires all sequences
656 // to be lexicographically sorted. Instead, we organize our
657 // sequences into a range trie, which can then output our
658 // sequences in the correct order. Unfortunately, building the
659 // range trie is fairly expensive (but not nearly as expensive
660 // as building a DFA). Hence the reason why the 'shrink' option
661 // exists, so that this path can be toggled off.
662 let mut trie = self.trie_state.borrow_mut();
663 trie.clear();
664
665 for rng in cls.iter() {
666 for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
667 seq.reverse();
668 trie.insert(seq.as_slice());
669 }
670 }
671 let mut utf8_state = self.utf8_state.borrow_mut();
672 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
673 trie.iter(|seq| {
674 utf8c.add(&seq);
675 });
676 Ok(utf8c.finish())
677 }
678 } else {
679 // In the forward direction, we always shrink our UTF-8 automata
680 // because we can stream it right into the UTF-8 compiler. There
681 // is almost no downside (in either memory or time) to using this
682 // approach.
683 let mut utf8_state = self.utf8_state.borrow_mut();
684 let mut utf8c = Utf8Compiler::new(self, &mut *utf8_state);
685 for rng in cls.iter() {
686 for seq in Utf8Sequences::new(rng.start(), rng.end()) {
687 utf8c.add(seq.as_slice());
688 }
689 }
690 Ok(utf8c.finish())
691 }
692
693 // For reference, the code below is the "naive" version of compiling a
694 // UTF-8 automaton. It is deliciously simple (and works for both the
695 // forward and reverse cases), but will unfortunately produce very
696 // large NFAs. When compiling a forward automaton, the size difference
697 // can sometimes be an order of magnitude. For example, the '\w' regex
698 // will generate about ~3000 NFA states using the naive approach below,
699 // but only 283 states when using the approach above. This is because
700 // the approach above actually compiles a *minimal* (or near minimal,
701 // because of the bounded hashmap) UTF-8 automaton.
702 //
703 // The code below is kept as a reference point in order to make it
704 // easier to understand the higher level goal here.
705 /*
706 let it = cls
707 .iter()
708 .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
709 .map(|seq| {
710 let it = seq
711 .as_slice()
712 .iter()
713 .map(|rng| Ok(self.c_range(rng.start, rng.end)));
714 self.c_concat(it)
715 });
716 self.c_alternation(it);
717 */
718 }
719
720 fn c_unicode_class_reverse_with_suffix(
721 &self,
722 cls: &hir::ClassUnicode,
723 ) -> Result<ThompsonRef> {
724 // N.B. It would likely be better to cache common *prefixes* in the
725 // reverse direction, but it's not quite clear how to do that. The
726 // advantage of caching suffixes is that it does give us a win, and
727 // has a very small additional overhead.
728 let mut cache = self.utf8_suffix.borrow_mut();
729 cache.clear();
730
731 let union = self.add_union();
732 let alt_end = self.add_empty();
733 for urng in cls.iter() {
734 for seq in Utf8Sequences::new(urng.start(), urng.end()) {
735 let mut end = alt_end;
736 for brng in seq.as_slice() {
737 let key = Utf8SuffixKey {
738 from: end,
739 start: brng.start,
740 end: brng.end,
741 };
742 let hash = cache.hash(&key);
743 if let Some(id) = cache.get(&key, hash) {
744 end = id;
745 continue;
746 }
747
748 let compiled = self.c_range(brng.start, brng.end);
749 self.patch(compiled.end, end);
750 end = compiled.start;
751 cache.set(key, hash, end);
752 }
753 self.patch(union, end);
754 }
755 }
756 Ok(ThompsonRef { start: union, end: alt_end })
757 }
758
759 fn c_range(&self, start: u8, end: u8) -> ThompsonRef {
760 let id = self.add_range(start, end);
761 ThompsonRef { start: id, end: id }
762 }
763
764 fn c_empty(&self) -> ThompsonRef {
765 let id = self.add_empty();
766 ThompsonRef { start: id, end: id }
767 }
768
769 fn c_unanchored_prefix_valid_utf8(&self) -> Result<ThompsonRef> {
770 self.c(&Hir::repetition(hir::Repetition {
771 kind: hir::RepetitionKind::ZeroOrMore,
772 greedy: false,
773 hir: Box::new(Hir::any(false)),
774 }))
775 }
776
777 fn c_unanchored_prefix_invalid_utf8(&self) -> Result<ThompsonRef> {
778 self.c(&Hir::repetition(hir::Repetition {
779 kind: hir::RepetitionKind::ZeroOrMore,
780 greedy: false,
781 hir: Box::new(Hir::any(true)),
782 }))
783 }
784
785 fn patch(&self, from: StateID, to: StateID) {
786 match self.states.borrow_mut()[from] {
787 CState::Empty { ref mut next } => {
788 *next = to;
789 }
790 CState::Range { ref mut range } => {
791 range.next = to;
792 }
793 CState::Sparse { .. } => {
794 panic!("cannot patch from a sparse NFA state")
795 }
796 CState::Union { ref mut alternates } => {
797 alternates.push(to);
798 }
799 CState::UnionReverse { ref mut alternates } => {
800 alternates.push(to);
801 }
802 CState::Match => {}
803 }
804 }
805
806 fn add_empty(&self) -> StateID {
807 let id = self.states.borrow().len();
808 self.states.borrow_mut().push(CState::Empty { next: 0 });
809 id
810 }
811
812 fn add_range(&self, start: u8, end: u8) -> StateID {
813 let id = self.states.borrow().len();
814 let trans = Transition { start, end, next: 0 };
815 let state = CState::Range { range: trans };
816 self.states.borrow_mut().push(state);
817 id
818 }
819
820 fn add_sparse(&self, ranges: Vec<Transition>) -> StateID {
821 if ranges.len() == 1 {
822 let id = self.states.borrow().len();
823 let state = CState::Range { range: ranges[0] };
824 self.states.borrow_mut().push(state);
825 return id;
826 }
827 let id = self.states.borrow().len();
828 let state = CState::Sparse { ranges };
829 self.states.borrow_mut().push(state);
830 id
831 }
832
833 fn add_union(&self) -> StateID {
834 let id = self.states.borrow().len();
835 let state = CState::Union { alternates: vec![] };
836 self.states.borrow_mut().push(state);
837 id
838 }
839
840 fn add_reverse_union(&self) -> StateID {
841 let id = self.states.borrow().len();
842 let state = CState::UnionReverse { alternates: vec![] };
843 self.states.borrow_mut().push(state);
844 id
845 }
846
847 fn add_match(&self) -> StateID {
848 let id = self.states.borrow().len();
849 self.states.borrow_mut().push(CState::Match);
850 id
851 }
852}
853
854#[derive(Debug)]
855struct Utf8Compiler<'a> {
856 nfac: &'a Compiler,
857 state: &'a mut Utf8State,
858 target: StateID,
859}
860
861#[derive(Clone, Debug)]
862struct Utf8State {
863 compiled: Utf8BoundedMap,
864 uncompiled: Vec<Utf8Node>,
865}
866
867#[derive(Clone, Debug)]
868struct Utf8Node {
869 trans: Vec<Transition>,
870 last: Option<Utf8LastTransition>,
871}
872
873#[derive(Clone, Debug)]
874struct Utf8LastTransition {
875 start: u8,
876 end: u8,
877}
878
879impl Utf8State {
880 fn new() -> Utf8State {
881 Utf8State { compiled: Utf8BoundedMap::new(capacity:5000), uncompiled: vec![] }
882 }
883
884 fn clear(&mut self) {
885 self.compiled.clear();
886 self.uncompiled.clear();
887 }
888}
889
890impl<'a> Utf8Compiler<'a> {
891 fn new(nfac: &'a Compiler, state: &'a mut Utf8State) -> Utf8Compiler<'a> {
892 let target = nfac.add_empty();
893 state.clear();
894 let mut utf8c = Utf8Compiler { nfac, state, target };
895 utf8c.add_empty();
896 utf8c
897 }
898
899 fn finish(&mut self) -> ThompsonRef {
900 self.compile_from(0);
901 let node = self.pop_root();
902 let start = self.compile(node);
903 ThompsonRef { start, end: self.target }
904 }
905
906 fn add(&mut self, ranges: &[Utf8Range]) {
907 let prefix_len = ranges
908 .iter()
909 .zip(&self.state.uncompiled)
910 .take_while(|&(range, node)| {
911 node.last.as_ref().map_or(false, |t| {
912 (t.start, t.end) == (range.start, range.end)
913 })
914 })
915 .count();
916 assert!(prefix_len < ranges.len());
917 self.compile_from(prefix_len);
918 self.add_suffix(&ranges[prefix_len..]);
919 }
920
921 fn compile_from(&mut self, from: usize) {
922 let mut next = self.target;
923 while from + 1 < self.state.uncompiled.len() {
924 let node = self.pop_freeze(next);
925 next = self.compile(node);
926 }
927 self.top_last_freeze(next);
928 }
929
930 fn compile(&mut self, node: Vec<Transition>) -> StateID {
931 let hash = self.state.compiled.hash(&node);
932 if let Some(id) = self.state.compiled.get(&node, hash) {
933 return id;
934 }
935 let id = self.nfac.add_sparse(node.clone());
936 self.state.compiled.set(node, hash, id);
937 id
938 }
939
940 fn add_suffix(&mut self, ranges: &[Utf8Range]) {
941 assert!(!ranges.is_empty());
942 let last = self
943 .state
944 .uncompiled
945 .len()
946 .checked_sub(1)
947 .expect("non-empty nodes");
948 assert!(self.state.uncompiled[last].last.is_none());
949 self.state.uncompiled[last].last = Some(Utf8LastTransition {
950 start: ranges[0].start,
951 end: ranges[0].end,
952 });
953 for r in &ranges[1..] {
954 self.state.uncompiled.push(Utf8Node {
955 trans: vec![],
956 last: Some(Utf8LastTransition { start: r.start, end: r.end }),
957 });
958 }
959 }
960
961 fn add_empty(&mut self) {
962 self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
963 }
964
965 fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
966 let mut uncompiled = self.state.uncompiled.pop().unwrap();
967 uncompiled.set_last_transition(next);
968 uncompiled.trans
969 }
970
971 fn pop_root(&mut self) -> Vec<Transition> {
972 assert_eq!(self.state.uncompiled.len(), 1);
973 assert!(self.state.uncompiled[0].last.is_none());
974 self.state.uncompiled.pop().expect("non-empty nodes").trans
975 }
976
977 fn top_last_freeze(&mut self, next: StateID) {
978 let last = self
979 .state
980 .uncompiled
981 .len()
982 .checked_sub(1)
983 .expect("non-empty nodes");
984 self.state.uncompiled[last].set_last_transition(next);
985 }
986}
987
988impl Utf8Node {
989 fn set_last_transition(&mut self, next: StateID) {
990 if let Some(last: Utf8LastTransition) = self.last.take() {
991 self.trans.push(Transition {
992 start: last.start,
993 end: last.end,
994 next,
995 });
996 }
997 }
998}
999
1000#[cfg(test)]
1001mod tests {
1002 use regex_syntax::hir::Hir;
1003 use regex_syntax::ParserBuilder;
1004
1005 use super::{Builder, State, StateID, Transition, NFA};
1006
1007 fn parse(pattern: &str) -> Hir {
1008 ParserBuilder::new().build().parse(pattern).unwrap()
1009 }
1010
1011 fn build(pattern: &str) -> NFA {
1012 Builder::new().anchored(true).build(&parse(pattern)).unwrap()
1013 }
1014
1015 fn s_byte(byte: u8, next: StateID) -> State {
1016 let trans = Transition { start: byte, end: byte, next };
1017 State::Range { range: trans }
1018 }
1019
1020 fn s_range(start: u8, end: u8, next: StateID) -> State {
1021 let trans = Transition { start, end, next };
1022 State::Range { range: trans }
1023 }
1024
1025 fn s_sparse(ranges: &[(u8, u8, StateID)]) -> State {
1026 let ranges = ranges
1027 .iter()
1028 .map(|&(start, end, next)| Transition { start, end, next })
1029 .collect();
1030 State::Sparse { ranges }
1031 }
1032
1033 fn s_union(alts: &[StateID]) -> State {
1034 State::Union { alternates: alts.to_vec().into_boxed_slice() }
1035 }
1036
1037 fn s_match() -> State {
1038 State::Match
1039 }
1040
1041 #[test]
1042 fn errors() {
1043 // unsupported anchors
1044 assert!(Builder::new().build(&parse(r"^")).is_err());
1045 assert!(Builder::new().build(&parse(r"$")).is_err());
1046 assert!(Builder::new().build(&parse(r"\A")).is_err());
1047 assert!(Builder::new().build(&parse(r"\z")).is_err());
1048
1049 // unsupported word boundaries
1050 assert!(Builder::new().build(&parse(r"\b")).is_err());
1051 assert!(Builder::new().build(&parse(r"\B")).is_err());
1052 assert!(Builder::new().build(&parse(r"(?-u)\b")).is_err());
1053 }
1054
1055 // Test that building an unanchored NFA has an appropriate `.*?` prefix.
1056 #[test]
1057 fn compile_unanchored_prefix() {
1058 // When the machine can only match valid UTF-8.
1059 let nfa = Builder::new().anchored(false).build(&parse(r"a")).unwrap();
1060 // There should be many states since the `.` in `.*?` matches any
1061 // Unicode scalar value.
1062 assert_eq!(11, nfa.len());
1063 assert_eq!(nfa.states[10], s_match());
1064 assert_eq!(nfa.states[9], s_byte(b'a', 10));
1065
1066 // When the machine can match invalid UTF-8.
1067 let nfa = Builder::new()
1068 .anchored(false)
1069 .allow_invalid_utf8(true)
1070 .build(&parse(r"a"))
1071 .unwrap();
1072 assert_eq!(
1073 nfa.states,
1074 &[
1075 s_union(&[2, 1]),
1076 s_range(0, 255, 0),
1077 s_byte(b'a', 3),
1078 s_match(),
1079 ]
1080 );
1081 }
1082
1083 #[test]
1084 fn compile_empty() {
1085 assert_eq!(build("").states, &[s_match(),]);
1086 }
1087
1088 #[test]
1089 fn compile_literal() {
1090 assert_eq!(build("a").states, &[s_byte(b'a', 1), s_match(),]);
1091 assert_eq!(
1092 build("ab").states,
1093 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1094 );
1095 assert_eq!(
1096 build("ā˜ƒ").states,
1097 &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(),]
1098 );
1099
1100 // Check that non-UTF-8 literals work.
1101 let hir = ParserBuilder::new()
1102 .allow_invalid_utf8(true)
1103 .build()
1104 .parse(r"(?-u)\xFF")
1105 .unwrap();
1106 let nfa = Builder::new()
1107 .anchored(true)
1108 .allow_invalid_utf8(true)
1109 .build(&hir)
1110 .unwrap();
1111 assert_eq!(nfa.states, &[s_byte(b'\xFF', 1), s_match(),]);
1112 }
1113
1114 #[test]
1115 fn compile_class() {
1116 assert_eq!(
1117 build(r"[a-z]").states,
1118 &[s_range(b'a', b'z', 1), s_match(),]
1119 );
1120 assert_eq!(
1121 build(r"[x-za-c]").states,
1122 &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match()]
1123 );
1124 assert_eq!(
1125 build(r"[\u03B1-\u03B4]").states,
1126 &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match()]
1127 );
1128 assert_eq!(
1129 build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states,
1130 &[
1131 s_range(0xB1, 0xB4, 5),
1132 s_range(0x99, 0x9E, 5),
1133 s_byte(0xA4, 1),
1134 s_byte(0x9F, 2),
1135 s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
1136 s_match(),
1137 ]
1138 );
1139 assert_eq!(
1140 build(r"[a-zā˜ƒ]").states,
1141 &[
1142 s_byte(0x83, 3),
1143 s_byte(0x98, 0),
1144 s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
1145 s_match(),
1146 ]
1147 );
1148 }
1149
1150 #[test]
1151 fn compile_repetition() {
1152 assert_eq!(
1153 build(r"a?").states,
1154 &[s_union(&[1, 2]), s_byte(b'a', 2), s_match(),]
1155 );
1156 assert_eq!(
1157 build(r"a??").states,
1158 &[s_union(&[2, 1]), s_byte(b'a', 2), s_match(),]
1159 );
1160 }
1161
1162 #[test]
1163 fn compile_group() {
1164 assert_eq!(
1165 build(r"ab+").states,
1166 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(),]
1167 );
1168 assert_eq!(
1169 build(r"(ab)").states,
1170 &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(),]
1171 );
1172 assert_eq!(
1173 build(r"(ab)+").states,
1174 &[s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(),]
1175 );
1176 }
1177
1178 #[test]
1179 fn compile_alternation() {
1180 assert_eq!(
1181 build(r"a|b").states,
1182 &[s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(),]
1183 );
1184 assert_eq!(
1185 build(r"|b").states,
1186 &[s_byte(b'b', 2), s_union(&[2, 0]), s_match(),]
1187 );
1188 assert_eq!(
1189 build(r"a|").states,
1190 &[s_byte(b'a', 2), s_union(&[0, 2]), s_match(),]
1191 );
1192 }
1193}
1194