1use core::{borrow::Borrow, cell::RefCell};
2
3use alloc::{sync::Arc, vec, vec::Vec};
4
5use regex_syntax::{
6 hir::{self, Hir},
7 utf8::{Utf8Range, Utf8Sequences},
8 ParserBuilder,
9};
10
11use crate::{
12 nfa::thompson::{
13 builder::Builder,
14 error::BuildError,
15 literal_trie::LiteralTrie,
16 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17 nfa::{Transition, NFA},
18 range_trie::RangeTrie,
19 },
20 util::{
21 look::{Look, LookMatcher},
22 primitives::{PatternID, StateID},
23 },
24};
25
26/// The configuration used for a Thompson NFA compiler.
/// The configuration used for a Thompson NFA compiler.
///
/// Every option is `None` until explicitly set; the `get_*` methods fall
/// back to the documented defaults. Keeping "unset" distinct from "set to
/// the default" is what lets `overwrite` merge two configurations.
#[derive(Clone, Debug, Default)]
pub struct Config {
    /// UTF-8 mode for searches. Default: true.
    utf8: Option<bool>,
    /// Whether to compile the NFA in reverse. Default: false.
    reverse: Option<bool>,
    /// Approximate heap limit for the compiled NFA, in bytes. The outer
    /// `Option` tracks whether the option was set at all; the inner one is
    /// the limit itself. Default: no limit.
    nfa_size_limit: Option<Option<usize>>,
    /// Whether to spend extra effort shrinking the NFA. Default: false.
    shrink: Option<bool>,
    /// Which kinds of capture states to compile in. Default: all.
    which_captures: Option<WhichCaptures>,
    /// The matcher used for look-around assertions.
    look_matcher: Option<LookMatcher>,
    /// Test-only switch controlling the unanchored prefix. Default: true.
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}
38
39impl Config {
    /// Return a new default Thompson NFA compiler configuration.
    ///
    /// All options start out unset and therefore resolve to their
    /// documented defaults.
    pub fn new() -> Config {
        Config::default()
    }
44
45 /// Whether to enable UTF-8 mode during search or not.
46 ///
47 /// A regex engine is said to be in UTF-8 mode when it guarantees that
48 /// all matches returned by it have spans consisting of only valid UTF-8.
49 /// That is, it is impossible for a match span to be returned that
50 /// contains any invalid UTF-8.
51 ///
52 /// UTF-8 mode generally consists of two things:
53 ///
54 /// 1. Whether the NFA's states are constructed such that all paths to a
55 /// match state that consume at least one byte always correspond to valid
56 /// UTF-8.
57 /// 2. Whether all paths to a match state that do _not_ consume any bytes
58 /// should always correspond to valid UTF-8 boundaries.
59 ///
60 /// (1) is a guarantee made by whoever constructs the NFA.
61 /// If you're parsing a regex from its concrete syntax, then
62 /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63 /// this guarantee for you. It does it by returning an error if the regex
64 /// pattern could every report a non-empty match span that contains invalid
65 /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66 /// successfully parses, then you're guaranteed that the corresponding NFA
67 /// will only ever report non-empty match spans containing valid UTF-8.
68 ///
69 /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70 /// state graph itself. Consider, for example, the regex `a*`. It matches
71 /// the empty strings in `ā˜ƒ` at positions `0`, `1`, `2` and `3`, where
72 /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73 /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74 /// guarantee must be made at a higher level than the NFA state graph
75 /// itself. This crate deals with this case in each regex engine. Namely,
76 /// when a zero-width match that splits a codepoint is found and UTF-8
77 /// mode enabled, then it is ignored and the engine moves on looking for
78 /// the next match.
79 ///
80 /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81 /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82 /// engines that empty matches that split codepoints should be banned.
83 ///
84 /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85 /// it only makes sense to enable this option when you *know* your haystack
86 /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87 /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88 /// behavior**.
89 ///
90 /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91 /// simultaneously *disabling* this option. That would ensure all non-empty
92 /// match spans are valid UTF-8, but that empty match spans may still split
93 /// a codepoint or match at other places that aren't valid UTF-8.
94 ///
95 /// In general, this mode is only relevant if your regex can match the
96 /// empty string. Most regexes don't.
97 ///
98 /// This is enabled by default.
99 ///
100 /// # Example
101 ///
102 /// This example shows how UTF-8 mode can impact the match spans that may
103 /// be reported in certain cases.
104 ///
105 /// ```
106 /// use regex_automata::{
107 /// nfa::thompson::{self, pikevm::PikeVM},
108 /// Match, Input,
109 /// };
110 ///
111 /// let re = PikeVM::new("")?;
112 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113 ///
114 /// // UTF-8 mode is enabled by default.
115 /// let mut input = Input::new("ā˜ƒ");
116 /// re.search(&mut cache, &input, &mut caps);
117 /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118 ///
119 /// // Even though an empty regex matches at 1..1, our next match is
120 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121 /// // three bytes long).
122 /// input.set_start(1);
123 /// re.search(&mut cache, &input, &mut caps);
124 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125 ///
126 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127 /// let re = PikeVM::builder()
128 /// .thompson(thompson::Config::new().utf8(false))
129 /// .build("")?;
130 /// re.search(&mut cache, &input, &mut caps);
131 /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132 ///
133 /// input.set_start(2);
134 /// re.search(&mut cache, &input, &mut caps);
135 /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136 ///
137 /// input.set_start(3);
138 /// re.search(&mut cache, &input, &mut caps);
139 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140 ///
141 /// input.set_start(4);
142 /// re.search(&mut cache, &input, &mut caps);
143 /// assert_eq!(None, caps.get_match());
144 ///
145 /// # Ok::<(), Box<dyn std::error::Error>>(())
146 /// ```
147 pub fn utf8(mut self, yes: bool) -> Config {
148 self.utf8 = Some(yes);
149 self
150 }
151
152 /// Reverse the NFA.
153 ///
154 /// A NFA reversal is performed by reversing all of the concatenated
155 /// sub-expressions in the original pattern, recursively. (Look around
156 /// operators are also inverted.) The resulting NFA can be used to match
157 /// the pattern starting from the end of a string instead of the beginning
158 /// of a string.
159 ///
160 /// Reversing the NFA is useful for building a reverse DFA, which is most
161 /// useful for finding the start of a match after its ending position has
162 /// been found. NFA execution engines typically do not work on reverse
163 /// NFAs. For example, currently, the Pike VM reports the starting location
164 /// of matches without a reverse NFA.
165 ///
166 /// Currently, enabling this setting requires disabling the
167 /// [`captures`](Config::captures) setting. If both are enabled, then the
168 /// compiler will return an error. It is expected that this limitation will
169 /// be lifted in the future.
170 ///
171 /// This is disabled by default.
172 ///
173 /// # Example
174 ///
175 /// This example shows how to build a DFA from a reverse NFA, and then use
176 /// the DFA to search backwards.
177 ///
178 /// ```
179 /// use regex_automata::{
180 /// dfa::{self, Automaton},
181 /// nfa::thompson::{NFA, WhichCaptures},
182 /// HalfMatch, Input,
183 /// };
184 ///
185 /// let dfa = dfa::dense::Builder::new()
186 /// .thompson(NFA::config()
187 /// .which_captures(WhichCaptures::None)
188 /// .reverse(true)
189 /// )
190 /// .build("baz[0-9]+")?;
191 /// let expected = Some(HalfMatch::must(0, 3));
192 /// assert_eq!(
193 /// expected,
194 /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195 /// );
196 ///
197 /// # Ok::<(), Box<dyn std::error::Error>>(())
198 /// ```
199 pub fn reverse(mut self, yes: bool) -> Config {
200 self.reverse = Some(yes);
201 self
202 }
203
204 /// Sets an approximate size limit on the total heap used by the NFA being
205 /// compiled.
206 ///
207 /// This permits imposing constraints on the size of a compiled NFA. This
208 /// may be useful in contexts where the regex pattern is untrusted and one
209 /// wants to avoid using too much memory.
210 ///
211 /// This size limit does not apply to auxiliary heap used during
212 /// compilation that is not part of the built NFA.
213 ///
214 /// Note that this size limit is applied during compilation in order for
215 /// the limit to prevent too much heap from being used. However, the
216 /// implementation may use an intermediate NFA representation that is
217 /// otherwise slightly bigger than the final public form. Since the size
218 /// limit may be applied to an intermediate representation, there is not
219 /// necessarily a precise correspondence between the configured size limit
220 /// and the heap usage of the final NFA.
221 ///
222 /// There is no size limit by default.
223 ///
224 /// # Example
225 ///
226 /// This example demonstrates how Unicode mode can greatly increase the
227 /// size of the NFA.
228 ///
229 /// ```
230 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231 /// use regex_automata::nfa::thompson::NFA;
232 ///
233 /// // 300KB isn't enough!
234 /// NFA::compiler()
235 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
236 /// .build(r"\w{20}")
237 /// .unwrap_err();
238 ///
239 /// // ... but 400KB probably is.
240 /// let nfa = NFA::compiler()
241 /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
242 /// .build(r"\w{20}")?;
243 ///
244 /// assert_eq!(nfa.pattern_len(), 1);
245 ///
246 /// # Ok::<(), Box<dyn std::error::Error>>(())
247 /// ```
248 pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249 self.nfa_size_limit = Some(bytes);
250 self
251 }
252
253 /// Apply best effort heuristics to shrink the NFA at the expense of more
254 /// time/memory.
255 ///
256 /// Generally speaking, if one is using an NFA to compile a DFA, then the
257 /// extra time used to shrink the NFA will be more than made up for during
258 /// DFA construction (potentially by a lot). In other words, enabling this
259 /// can substantially decrease the overall amount of time it takes to build
260 /// a DFA.
261 ///
262 /// A reason to keep this disabled is if you want to compile an NFA and
263 /// start using it as quickly as possible without needing to build a DFA,
264 /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265 /// an NFA simulation or for a lazy DFA.
266 ///
267 /// NFA shrinking is currently most useful when compiling a reverse
268 /// NFA with large Unicode character classes. In particular, it trades
269 /// additional CPU time during NFA compilation in favor of generating fewer
270 /// NFA states.
271 ///
272 /// This is disabled by default because it can increase compile times
273 /// quite a bit if you aren't building a full DFA.
274 ///
275 /// # Example
276 ///
277 /// This example shows that NFA shrinking can lead to substantial space
278 /// savings in some cases. Notice that, as noted above, we build a reverse
279 /// DFA and use a pattern with a large Unicode character class.
280 ///
281 /// ```
282 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284 ///
285 /// // Currently we have to disable captures when enabling reverse NFA.
286 /// let config = NFA::config()
287 /// .which_captures(WhichCaptures::None)
288 /// .reverse(true);
289 /// let not_shrunk = NFA::compiler()
290 /// .configure(config.clone().shrink(false))
291 /// .build(r"\w")?;
292 /// let shrunk = NFA::compiler()
293 /// .configure(config.clone().shrink(true))
294 /// .build(r"\w")?;
295 ///
296 /// // While a specific shrink factor is not guaranteed, the savings can be
297 /// // considerable in some cases.
298 /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299 ///
300 /// # Ok::<(), Box<dyn std::error::Error>>(())
301 /// ```
302 pub fn shrink(mut self, yes: bool) -> Config {
303 self.shrink = Some(yes);
304 self
305 }
306
307 /// Whether to include 'Capture' states in the NFA.
308 ///
309 /// Currently, enabling this setting requires disabling the
310 /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311 /// compiler will return an error. It is expected that this limitation will
312 /// be lifted in the future.
313 ///
314 /// This is enabled by default.
315 ///
316 /// # Example
317 ///
318 /// This example demonstrates that some regex engines, like the Pike VM,
319 /// require capturing states to be present in the NFA to report match
320 /// offsets.
321 ///
322 /// (Note that since this method is deprecated, the example below uses
323 /// [`Config::which_captures`] to disable capture states.)
324 ///
325 /// ```
326 /// use regex_automata::nfa::thompson::{
327 /// pikevm::PikeVM,
328 /// NFA,
329 /// WhichCaptures,
330 /// };
331 ///
332 /// let re = PikeVM::builder()
333 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
334 /// .build(r"[a-z]+")?;
335 /// let mut cache = re.create_cache();
336 ///
337 /// assert!(re.is_match(&mut cache, "abc"));
338 /// assert_eq!(None, re.find(&mut cache, "abc"));
339 ///
340 /// # Ok::<(), Box<dyn std::error::Error>>(())
341 /// ```
342 #[deprecated(since = "0.3.5", note = "use which_captures instead")]
343 pub fn captures(self, yes: bool) -> Config {
344 self.which_captures(if yes {
345 WhichCaptures::All
346 } else {
347 WhichCaptures::None
348 })
349 }
350
351 /// Configures what kinds of capture groups are compiled into
352 /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353 /// Thompson NFA.
354 ///
355 /// Currently, using any option except for [`WhichCaptures::None`] requires
356 /// disabling the [`reverse`](Config::reverse) setting. If both are
357 /// enabled, then the compiler will return an error. It is expected that
358 /// this limitation will be lifted in the future.
359 ///
360 /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
361 /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
362 /// overhead of capture states for explicit groups. Usually this occurs
363 /// when one wants to use the `PikeVM` only for determining the overall
364 /// match. Otherwise, the `PikeVM` could use much more memory than is
365 /// necessary.
366 ///
367 /// # Example
368 ///
369 /// This example demonstrates that some regex engines, like the Pike VM,
370 /// require capturing states to be present in the NFA to report match
371 /// offsets.
372 ///
373 /// ```
374 /// use regex_automata::nfa::thompson::{
375 /// pikevm::PikeVM,
376 /// NFA,
377 /// WhichCaptures,
378 /// };
379 ///
380 /// let re = PikeVM::builder()
381 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
382 /// .build(r"[a-z]+")?;
383 /// let mut cache = re.create_cache();
384 ///
385 /// assert!(re.is_match(&mut cache, "abc"));
386 /// assert_eq!(None, re.find(&mut cache, "abc"));
387 ///
388 /// # Ok::<(), Box<dyn std::error::Error>>(())
389 /// ```
390 ///
391 /// The same applies to the bounded backtracker:
392 ///
393 /// ```
394 /// use regex_automata::nfa::thompson::{
395 /// backtrack::BoundedBacktracker,
396 /// NFA,
397 /// WhichCaptures,
398 /// };
399 ///
400 /// let re = BoundedBacktracker::builder()
401 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
402 /// .build(r"[a-z]+")?;
403 /// let mut cache = re.create_cache();
404 ///
405 /// assert!(re.try_is_match(&mut cache, "abc")?);
406 /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407 ///
408 /// # Ok::<(), Box<dyn std::error::Error>>(())
409 /// ```
410 pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411 self.which_captures = Some(which_captures);
412 self
413 }
414
415 /// Sets the look-around matcher that should be used with this NFA.
416 ///
417 /// A look-around matcher determines how to match look-around assertions.
418 /// In particular, some assertions are configurable. For example, the
419 /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420 /// from the default of `\n` to any other byte.
421 ///
422 /// # Example
423 ///
424 /// This shows how to change the line terminator for multi-line assertions.
425 ///
426 /// ```
427 /// use regex_automata::{
428 /// nfa::thompson::{self, pikevm::PikeVM},
429 /// util::look::LookMatcher,
430 /// Match, Input,
431 /// };
432 ///
433 /// let mut lookm = LookMatcher::new();
434 /// lookm.set_line_terminator(b'\x00');
435 ///
436 /// let re = PikeVM::builder()
437 /// .thompson(thompson::Config::new().look_matcher(lookm))
438 /// .build(r"(?m)^[a-z]+$")?;
439 /// let mut cache = re.create_cache();
440 ///
441 /// // Multi-line assertions now use NUL as a terminator.
442 /// assert_eq!(
443 /// Some(Match::must(0, 1..4)),
444 /// re.find(&mut cache, b"\x00abc\x00"),
445 /// );
446 /// // ... and \n is no longer recognized as a terminator.
447 /// assert_eq!(
448 /// None,
449 /// re.find(&mut cache, b"\nabc\n"),
450 /// );
451 ///
452 /// # Ok::<(), Box<dyn std::error::Error>>(())
453 /// ```
454 pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455 self.look_matcher = Some(m);
456 self
457 }
458
459 /// Whether to compile an unanchored prefix into this NFA.
460 ///
461 /// This is enabled by default. It is made available for tests only to make
462 /// it easier to unit test the output of the compiler.
463 #[cfg(test)]
464 fn unanchored_prefix(mut self, yes: bool) -> Config {
465 self.unanchored_prefix = Some(yes);
466 self
467 }
468
469 /// Returns whether this configuration has enabled UTF-8 mode.
470 pub fn get_utf8(&self) -> bool {
471 self.utf8.unwrap_or(true)
472 }
473
474 /// Returns whether this configuration has enabled reverse NFA compilation.
475 pub fn get_reverse(&self) -> bool {
476 self.reverse.unwrap_or(false)
477 }
478
479 /// Return the configured NFA size limit, if it exists, in the number of
480 /// bytes of heap used.
481 pub fn get_nfa_size_limit(&self) -> Option<usize> {
482 self.nfa_size_limit.unwrap_or(None)
483 }
484
485 /// Return whether NFA shrinking is enabled.
486 pub fn get_shrink(&self) -> bool {
487 self.shrink.unwrap_or(false)
488 }
489
490 /// Return whether NFA compilation is configured to produce capture states.
491 #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
492 pub fn get_captures(&self) -> bool {
493 self.get_which_captures().is_any()
494 }
495
496 /// Return what kinds of capture states will be compiled into an NFA.
497 pub fn get_which_captures(&self) -> WhichCaptures {
498 self.which_captures.unwrap_or(WhichCaptures::All)
499 }
500
501 /// Return the look-around matcher for this NFA.
502 pub fn get_look_matcher(&self) -> LookMatcher {
503 self.look_matcher.clone().unwrap_or(LookMatcher::default())
504 }
505
    /// Return whether NFA compilation is configured to include an
    /// unanchored prefix.
    ///
    /// Outside of test builds this always returns true, because the
    /// `unanchored_prefix` knob only exists under `#[cfg(test)]`. (In test
    /// builds it defaults to true as well unless explicitly disabled.)
    fn get_unanchored_prefix(&self) -> bool {
        #[cfg(test)]
        {
            self.unanchored_prefix.unwrap_or(true)
        }
        #[cfg(not(test))]
        {
            true
        }
    }
520
521 /// Overwrite the default configuration such that the options in `o` are
522 /// always used. If an option in `o` is not set, then the corresponding
523 /// option in `self` is used. If it's not set in `self` either, then it
524 /// remains not set.
525 pub(crate) fn overwrite(&self, o: Config) -> Config {
526 Config {
527 utf8: o.utf8.or(self.utf8),
528 reverse: o.reverse.or(self.reverse),
529 nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
530 shrink: o.shrink.or(self.shrink),
531 which_captures: o.which_captures.or(self.which_captures),
532 look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
533 #[cfg(test)]
534 unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
535 }
536 }
537}
538
/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
#[derive(Clone, Copy, Debug)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA).
    None,
}
568
impl Default for WhichCaptures {
    /// The default is to compile all capture states, matching the fallback
    /// used by [`Config::get_which_captures`].
    fn default() -> WhichCaptures {
        WhichCaptures::All
    }
}
574
575impl WhichCaptures {
576 /// Returns true if this configuration indicates that no capture states
577 /// should be produced in an NFA.
578 pub fn is_none(&self) -> bool {
579 matches!(*self, WhichCaptures::None)
580 }
581
582 /// Returns true if this configuration indicates that some capture states
583 /// should be added to an NFA. Note that this might only include capture
584 /// states for implicit capture groups.
585 pub fn is_any(&self) -> bool {
586 !self.is_none()
587 }
588}
589
590/*
591This compiler below uses Thompson's construction algorithm. The compiler takes
592a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
593is structured in a way that permits it to be executed by a virtual machine and
594also used to efficiently build a DFA.
595
The compiler deals with a slightly expanded set of NFA states compared to
what is in a final NFA (as exhibited by builder::State and nfa::State).
Notably, a
598compiler state includes an empty node that has exactly one unconditional
599epsilon transition to the next state. In other words, it's a "goto" instruction
600if one views Thompson's NFA as a set of bytecode instructions. These goto
601instructions are removed in a subsequent phase before returning the NFA to the
602caller. The purpose of these empty nodes is that they make the construction
603algorithm substantially simpler to implement. We remove them before returning
604to the caller because they can represent substantial overhead when traversing
605the NFA graph (either while searching using the NFA directly or while building
606a DFA).
607
608In the future, it would be nice to provide a Glushkov compiler as well, as it
609would work well as a bit-parallel NFA for smaller regexes. But the Thompson
610construction is one I'm more familiar with and seems more straight-forward to
611deal with when it comes to large Unicode character classes.
612
613Internally, the compiler uses interior mutability to improve composition in the
614face of the borrow checker. In particular, we'd really like to be able to write
615things like this:
616
617 self.c_concat(exprs.iter().map(|e| self.c(e)))
618
619Which elegantly uses iterators to build up a sequence of compiled regex
620sub-expressions and then hands it off to the concatenating compiler routine.
621Without interior mutability, the borrow checker won't let us borrow `self`
622mutably both inside and outside the closure at the same time.
623*/
624
625/// A builder for compiling an NFA from a regex's high-level intermediate
626/// representation (HIR).
627///
628/// This compiler provides a way to translate a parsed regex pattern into an
629/// NFA state graph. The NFA state graph can either be used directly to execute
630/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
631///
/// This compiler provides APIs for compiling regex patterns either directly
/// from their concrete syntax, or via a [`regex_syntax::hir::Hir`].
634///
635/// This compiler has various options that may be configured via
636/// [`thompson::Config`](Config).
637///
638/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
639/// A `Builder` provides a lower level API that is uncoupled from a regex
640/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
641/// together an NFA by hand. See its docs for examples.
642///
643/// # Example: compilation from concrete syntax
644///
645/// This shows how to compile an NFA from a pattern string while setting a size
646/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
647///
648/// ```
649/// use regex_automata::{
650/// nfa::thompson::{NFA, pikevm::PikeVM},
651/// Match,
652/// };
653///
654/// let config = NFA::config().nfa_size_limit(Some(1_000));
655/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
656///
657/// let re = PikeVM::new_from_nfa(nfa)?;
658/// let mut cache = re.create_cache();
659/// let mut caps = re.create_captures();
660/// let expected = Some(Match::must(0, 3..4));
661/// re.captures(&mut cache, "!@#A#@!", &mut caps);
662/// assert_eq!(expected, caps.get_match());
663///
664/// # Ok::<(), Box<dyn std::error::Error>>(())
665/// ```
666///
667/// # Example: compilation from HIR
668///
669/// This shows how to hand assemble a regular expression via its HIR, and then
670/// compile an NFA directly from it.
671///
672/// ```
673/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
674/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
675///
676/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
677/// ClassBytesRange::new(b'0', b'9'),
678/// ClassBytesRange::new(b'A', b'Z'),
679/// ClassBytesRange::new(b'_', b'_'),
680/// ClassBytesRange::new(b'a', b'z'),
681/// ])));
682///
683/// let config = NFA::config().nfa_size_limit(Some(1_000));
684/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
685///
686/// let re = PikeVM::new_from_nfa(nfa)?;
687/// let mut cache = re.create_cache();
688/// let mut caps = re.create_captures();
689/// let expected = Some(Match::must(0, 3..4));
690/// re.captures(&mut cache, "!@#A#@!", &mut caps);
691/// assert_eq!(expected, caps.get_match());
692///
693/// # Ok::<(), Box<dyn std::error::Error>>(())
694/// ```
#[derive(Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    ///
    /// Wrapped in `RefCell` (interior mutability) so compilation routines
    /// can compose without fighting the borrow checker; see the module
    /// comment above for the rationale.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}
715
716impl Compiler {
717 /// Create a new NFA builder with its default configuration.
718 pub fn new() -> Compiler {
719 Compiler {
720 parser: ParserBuilder::new(),
721 config: Config::default(),
722 builder: RefCell::new(Builder::new()),
723 utf8_state: RefCell::new(Utf8State::new()),
724 trie_state: RefCell::new(RangeTrie::new()),
725 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
726 }
727 }
728
729 /// Compile the given regular expression pattern into an NFA.
730 ///
731 /// If there was a problem parsing the regex, then that error is returned.
732 ///
733 /// Otherwise, if there was a problem building the NFA, then an error is
734 /// returned. The only error that can occur is if the compiled regex would
735 /// exceed the size limits configured on this builder, or if any part of
736 /// the NFA would exceed the integer representations used. (For example,
737 /// too many states might plausibly occur on a 16-bit target.)
738 ///
739 /// # Example
740 ///
741 /// ```
742 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
743 ///
744 /// let config = NFA::config().nfa_size_limit(Some(1_000));
745 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
746 ///
747 /// let re = PikeVM::new_from_nfa(nfa)?;
748 /// let mut cache = re.create_cache();
749 /// let mut caps = re.create_captures();
750 /// let expected = Some(Match::must(0, 3..4));
751 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
752 /// assert_eq!(expected, caps.get_match());
753 ///
754 /// # Ok::<(), Box<dyn std::error::Error>>(())
755 /// ```
756 pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
757 self.build_many(&[pattern])
758 }
759
760 /// Compile the given regular expression patterns into a single NFA.
761 ///
762 /// When matches are returned, the pattern ID corresponds to the index of
763 /// the pattern in the slice given.
764 ///
765 /// # Example
766 ///
767 /// ```
768 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
769 ///
770 /// let config = NFA::config().nfa_size_limit(Some(1_000));
771 /// let nfa = NFA::compiler().configure(config).build_many(&[
772 /// r"(?-u)\s",
773 /// r"(?-u)\w",
774 /// ])?;
775 ///
776 /// let re = PikeVM::new_from_nfa(nfa)?;
777 /// let mut cache = re.create_cache();
778 /// let mut caps = re.create_captures();
779 /// let expected = Some(Match::must(1, 1..2));
780 /// re.captures(&mut cache, "!A! !A!", &mut caps);
781 /// assert_eq!(expected, caps.get_match());
782 ///
783 /// # Ok::<(), Box<dyn std::error::Error>>(())
784 /// ```
785 pub fn build_many<P: AsRef<str>>(
786 &self,
787 patterns: &[P],
788 ) -> Result<NFA, BuildError> {
789 let mut hirs = vec![];
790 for p in patterns {
791 hirs.push(
792 self.parser
793 .build()
794 .parse(p.as_ref())
795 .map_err(BuildError::syntax)?,
796 );
797 debug!("parsed: {:?}", p.as_ref());
798 }
799 self.build_many_from_hir(&hirs)
800 }
801
802 /// Compile the given high level intermediate representation of a regular
803 /// expression into an NFA.
804 ///
805 /// If there was a problem building the NFA, then an error is returned. The
806 /// only error that can occur is if the compiled regex would exceed the
807 /// size limits configured on this builder, or if any part of the NFA would
808 /// exceed the integer representations used. (For example, too many states
809 /// might plausibly occur on a 16-bit target.)
810 ///
811 /// # Example
812 ///
813 /// ```
814 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
815 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
816 ///
817 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
818 /// ClassBytesRange::new(b'0', b'9'),
819 /// ClassBytesRange::new(b'A', b'Z'),
820 /// ClassBytesRange::new(b'_', b'_'),
821 /// ClassBytesRange::new(b'a', b'z'),
822 /// ])));
823 ///
824 /// let config = NFA::config().nfa_size_limit(Some(1_000));
825 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
826 ///
827 /// let re = PikeVM::new_from_nfa(nfa)?;
828 /// let mut cache = re.create_cache();
829 /// let mut caps = re.create_captures();
830 /// let expected = Some(Match::must(0, 3..4));
831 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
832 /// assert_eq!(expected, caps.get_match());
833 ///
834 /// # Ok::<(), Box<dyn std::error::Error>>(())
835 /// ```
836 pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
837 self.build_many_from_hir(&[expr])
838 }
839
    /// Compile the given high level intermediate representations of regular
    /// expressions into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// Note that since the input here is already parsed, the syntax
    /// configuration set via [`Compiler::syntax`] has no effect on this
    /// routine.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hirs = &[
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'\t', b'\r'),
    ///         ClassBytesRange::new(b' ', b' '),
    ///     ]))),
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'0', b'9'),
    ///         ClassBytesRange::new(b'A', b'Z'),
    ///         ClassBytesRange::new(b'_', b'_'),
    ///         ClassBytesRange::new(b'a', b'z'),
    ///     ]))),
    /// ];
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many_from_hir<H: Borrow<Hir>>(
        &self,
        exprs: &[H],
    ) -> Result<NFA, BuildError> {
        // Pure delegation to the internal compiler entry point.
        self.compile(exprs)
    }
883
884 /// Apply the given NFA configuration options to this builder.
885 ///
886 /// # Example
887 ///
888 /// ```
889 /// use regex_automata::nfa::thompson::NFA;
890 ///
891 /// let config = NFA::config().nfa_size_limit(Some(1_000));
892 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
893 /// assert_eq!(nfa.pattern_len(), 1);
894 ///
895 /// # Ok::<(), Box<dyn std::error::Error>>(())
896 /// ```
897 pub fn configure(&mut self, config: Config) -> &mut Compiler {
898 self.config = self.config.overwrite(config);
899 self
900 }
901
    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    ///
    /// This syntax configuration only applies when an NFA is built directly
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
    /// settings are ignored.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
    ///
    /// let syntax_config = syntax::Config::new().unicode(false);
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
    /// // If Unicode were enabled, the number of states would be much bigger.
    /// assert!(nfa.states().len() < 15);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Compiler {
        // Mutates the parser builder in place; subsequent 'build()' calls on
        // the parser (see 'build_many') pick up the new settings.
        config.apply(&mut self.parser);
        self
    }
931}
932
933impl Compiler {
    /// Compile the sequence of HIR expressions given. Pattern IDs are
    /// allocated starting from 0, in correspondence with the slice given.
    ///
    /// It is legal to provide an empty slice. In that case, the NFA returned
    /// has no patterns and will never match anything.
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        // Capture states are rejected when compiling in reverse.
        if self.config.get_reverse()
            && self.config.get_which_captures().is_any()
        {
            return Err(BuildError::unsupported_captures());
        }

        // The builder is reused across compilations, so reset it and push
        // down the relevant bits of this compiler's configuration.
        self.builder.borrow_mut().clear();
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
        self.builder
            .borrow_mut()
            .set_look_matcher(self.config.get_look_matcher());
        self.builder
            .borrow_mut()
            .set_size_limit(self.config.get_nfa_size_limit())?;

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            e.borrow()
                .properties()
                .look_set_prefix()
                .contains(hir::Look::Start)
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // i.e., a non-greedy "match any byte, zero or more" prefix.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Compile each pattern: each gets its own implicit capture group
        // (index 0) and its own match state.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }
993
994 /// Compile an arbitrary HIR expression.
995 fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
996 use regex_syntax::hir::{Class, HirKind::*};
997
998 match *expr.kind() {
999 Empty => self.c_empty(),
1000 Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1001 Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1002 Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1003 Look(ref look) => self.c_look(look),
1004 Repetition(ref rep) => self.c_repetition(rep),
1005 Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1006 Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1007 Alternation(ref es) => self.c_alt_slice(es),
1008 }
1009 }
1010
1011 /// Compile a concatenation of the sub-expressions yielded by the given
1012 /// iterator. If the iterator yields no elements, then this compiles down
1013 /// to an "empty" state that always matches.
1014 ///
1015 /// If the compiler is in reverse mode, then the expressions given are
1016 /// automatically compiled in reverse.
1017 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1018 where
1019 I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1020 {
1021 let first = if self.is_reverse() { it.next_back() } else { it.next() };
1022 let ThompsonRef { start, mut end } = match first {
1023 Some(result) => result?,
1024 None => return self.c_empty(),
1025 };
1026 loop {
1027 let next =
1028 if self.is_reverse() { it.next_back() } else { it.next() };
1029 let compiled = match next {
1030 Some(result) => result?,
1031 None => break,
1032 };
1033 self.patch(end, compiled.start)?;
1034 end = compiled.end;
1035 }
1036 Ok(ThompsonRef { start, end })
1037 }
1038
1039 /// Compile an alternation of the given HIR values.
1040 ///
1041 /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1042 /// of an iterator of compiled NFA subgraphs. The point of accepting a
1043 /// slice here is that it opens up some optimization opportunities. For
1044 /// example, if all of the HIR values are literals, then this routine might
1045 /// re-shuffle them to make NFA epsilon closures substantially faster.
1046 fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1047 // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1048 let literal_count = exprs
1049 .iter()
1050 .filter(|e| {
1051 matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1052 })
1053 .count();
1054 if literal_count <= 1 || literal_count < exprs.len() {
1055 return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1056 }
1057
1058 let mut trie = if self.is_reverse() {
1059 LiteralTrie::reverse()
1060 } else {
1061 LiteralTrie::forward()
1062 };
1063 for expr in exprs.iter() {
1064 let literal = match *expr.kind() {
1065 hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1066 _ => unreachable!(),
1067 };
1068 trie.add(literal)?;
1069 }
1070 trie.compile(&mut self.builder.borrow_mut())
1071 }
1072
1073 /// Compile an alternation, where each element yielded by the given
1074 /// iterator represents an item in the alternation. If the iterator yields
1075 /// no elements, then this compiles down to a "fail" state.
1076 ///
1077 /// In an alternation, expressions appearing earlier are "preferred" at
1078 /// match time over expressions appearing later. At least, this is true
1079 /// when using "leftmost first" match semantics. (If "leftmost longest" are
1080 /// ever added in the future, then this preference order of priority would
1081 /// not apply in that mode.)
1082 fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1083 where
1084 I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1085 {
1086 let first = match it.next() {
1087 None => return self.c_fail(),
1088 Some(result) => result?,
1089 };
1090 let second = match it.next() {
1091 None => return Ok(first),
1092 Some(result) => result?,
1093 };
1094
1095 let union = self.add_union()?;
1096 let end = self.add_empty()?;
1097 self.patch(union, first.start)?;
1098 self.patch(first.end, end)?;
1099 self.patch(union, second.start)?;
1100 self.patch(second.end, end)?;
1101 for result in it {
1102 let compiled = result?;
1103 self.patch(union, compiled.start)?;
1104 self.patch(compiled.end, end)?;
1105 }
1106 Ok(ThompsonRef { start: union, end })
1107 }
1108
1109 /// Compile the given capture sub-expression. `expr` should be the
1110 /// sub-expression contained inside the capture. If "capture" states are
1111 /// enabled, then they are added as appropriate.
1112 ///
1113 /// This accepts the pieces of a capture instead of a `hir::Capture` so
1114 /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1115 /// adding the entire pattern as if it were a group in order to create
1116 /// appropriate "capture" states in the NFA.
1117 fn c_cap(
1118 &self,
1119 index: u32,
1120 name: Option<&str>,
1121 expr: &Hir,
1122 ) -> Result<ThompsonRef, BuildError> {
1123 match self.config.get_which_captures() {
1124 // No capture states means we always skip them.
1125 WhichCaptures::None => return self.c(expr),
1126 // Implicit captures states means we only add when index==0 since
1127 // index==0 implies the group is implicit.
1128 WhichCaptures::Implicit if index > 0 => return self.c(expr),
1129 _ => {}
1130 }
1131
1132 let start = self.add_capture_start(index, name)?;
1133 let inner = self.c(expr)?;
1134 let end = self.add_capture_end(index)?;
1135 self.patch(start, inner.start)?;
1136 self.patch(inner.end, end)?;
1137 Ok(ThompsonRef { start, end })
1138 }
1139
1140 /// Compile the given repetition expression. This handles all types of
1141 /// repetitions and greediness.
1142 fn c_repetition(
1143 &self,
1144 rep: &hir::Repetition,
1145 ) -> Result<ThompsonRef, BuildError> {
1146 match (rep.min, rep.max) {
1147 (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1148 (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1149 (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1150 (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1151 }
1152 }
1153
1154 /// Compile the given expression such that it matches at least `min` times,
1155 /// but no more than `max` times.
1156 ///
1157 /// When `greedy` is true, then the preference is for the expression to
1158 /// match as much as possible. Otherwise, it will match as little as
1159 /// possible.
1160 fn c_bounded(
1161 &self,
1162 expr: &Hir,
1163 greedy: bool,
1164 min: u32,
1165 max: u32,
1166 ) -> Result<ThompsonRef, BuildError> {
1167 let prefix = self.c_exactly(expr, min)?;
1168 if min == max {
1169 return Ok(prefix);
1170 }
1171
1172 // It is tempting here to compile the rest here as a concatenation
1173 // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
1174 // were `aaa?a?a?`. The problem here is that it leads to this program:
1175 //
1176 // >000000: 61 => 01
1177 // 000001: 61 => 02
1178 // 000002: union(03, 04)
1179 // 000003: 61 => 04
1180 // 000004: union(05, 06)
1181 // 000005: 61 => 06
1182 // 000006: union(07, 08)
1183 // 000007: 61 => 08
1184 // 000008: MATCH
1185 //
1186 // And effectively, once you hit state 2, the epsilon closure will
1187 // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
1188 // to instead compile it like so:
1189 //
1190 // >000000: 61 => 01
1191 // 000001: 61 => 02
1192 // 000002: union(03, 08)
1193 // 000003: 61 => 04
1194 // 000004: union(05, 08)
1195 // 000005: 61 => 06
1196 // 000006: union(07, 08)
1197 // 000007: 61 => 08
1198 // 000008: MATCH
1199 //
1200 // So that the epsilon closure of state 2 is now just 3 and 8.
1201 let empty = self.add_empty()?;
1202 let mut prev_end = prefix.end;
1203 for _ in min..max {
1204 let union = if greedy {
1205 self.add_union()
1206 } else {
1207 self.add_union_reverse()
1208 }?;
1209 let compiled = self.c(expr)?;
1210 self.patch(prev_end, union)?;
1211 self.patch(union, compiled.start)?;
1212 self.patch(union, empty)?;
1213 prev_end = compiled.end;
1214 }
1215 self.patch(prev_end, empty)?;
1216 Ok(ThompsonRef { start: prefix.start, end: empty })
1217 }
1218
1219 /// Compile the given expression such that it may be matched `n` or more
1220 /// times, where `n` can be any integer. (Although a particularly large
1221 /// integer is likely to run afoul of any configured size limits.)
1222 ///
1223 /// When `greedy` is true, then the preference is for the expression to
1224 /// match as much as possible. Otherwise, it will match as little as
1225 /// possible.
1226 fn c_at_least(
1227 &self,
1228 expr: &Hir,
1229 greedy: bool,
1230 n: u32,
1231 ) -> Result<ThompsonRef, BuildError> {
1232 if n == 0 {
1233 // When the expression cannot match the empty string, then we
1234 // can get away with something much simpler: just one 'alt'
1235 // instruction that optionally repeats itself. But if the expr
1236 // can match the empty string... see below.
1237 if expr.properties().minimum_len().map_or(false, |len| len > 0) {
1238 let union = if greedy {
1239 self.add_union()
1240 } else {
1241 self.add_union_reverse()
1242 }?;
1243 let compiled = self.c(expr)?;
1244 self.patch(union, compiled.start)?;
1245 self.patch(compiled.end, union)?;
1246 return Ok(ThompsonRef { start: union, end: union });
1247 }
1248
1249 // What's going on here? Shouldn't x* be simpler than this? It
1250 // turns out that when implementing leftmost-first (Perl-like)
1251 // match semantics, x* results in an incorrect preference order
1252 // when computing the transitive closure of states if and only if
1253 // 'x' can match the empty string. So instead, we compile x* as
1254 // (x+)?, which preserves the correct preference order.
1255 //
1256 // See: https://github.com/rust-lang/regex/issues/779
1257 let compiled = self.c(expr)?;
1258 let plus = if greedy {
1259 self.add_union()
1260 } else {
1261 self.add_union_reverse()
1262 }?;
1263 self.patch(compiled.end, plus)?;
1264 self.patch(plus, compiled.start)?;
1265
1266 let question = if greedy {
1267 self.add_union()
1268 } else {
1269 self.add_union_reverse()
1270 }?;
1271 let empty = self.add_empty()?;
1272 self.patch(question, compiled.start)?;
1273 self.patch(question, empty)?;
1274 self.patch(plus, empty)?;
1275 Ok(ThompsonRef { start: question, end: empty })
1276 } else if n == 1 {
1277 let compiled = self.c(expr)?;
1278 let union = if greedy {
1279 self.add_union()
1280 } else {
1281 self.add_union_reverse()
1282 }?;
1283 self.patch(compiled.end, union)?;
1284 self.patch(union, compiled.start)?;
1285 Ok(ThompsonRef { start: compiled.start, end: union })
1286 } else {
1287 let prefix = self.c_exactly(expr, n - 1)?;
1288 let last = self.c(expr)?;
1289 let union = if greedy {
1290 self.add_union()
1291 } else {
1292 self.add_union_reverse()
1293 }?;
1294 self.patch(prefix.end, last.start)?;
1295 self.patch(last.end, union)?;
1296 self.patch(union, last.start)?;
1297 Ok(ThompsonRef { start: prefix.start, end: union })
1298 }
1299 }
1300
1301 /// Compile the given expression such that it may be matched zero or one
1302 /// times.
1303 ///
1304 /// When `greedy` is true, then the preference is for the expression to
1305 /// match as much as possible. Otherwise, it will match as little as
1306 /// possible.
1307 fn c_zero_or_one(
1308 &self,
1309 expr: &Hir,
1310 greedy: bool,
1311 ) -> Result<ThompsonRef, BuildError> {
1312 let union =
1313 if greedy { self.add_union() } else { self.add_union_reverse() }?;
1314 let compiled = self.c(expr)?;
1315 let empty = self.add_empty()?;
1316 self.patch(union, compiled.start)?;
1317 self.patch(union, empty)?;
1318 self.patch(compiled.end, empty)?;
1319 Ok(ThompsonRef { start: union, end: empty })
1320 }
1321
1322 /// Compile the given HIR expression exactly `n` times.
1323 fn c_exactly(
1324 &self,
1325 expr: &Hir,
1326 n: u32,
1327 ) -> Result<ThompsonRef, BuildError> {
1328 let it = (0..n).map(|_| self.c(expr));
1329 self.c_concat(it)
1330 }
1331
1332 /// Compile the given byte oriented character class.
1333 ///
1334 /// This uses "sparse" states to represent an alternation between ranges in
1335 /// this character class. We can use "sparse" states instead of stitching
1336 /// together a "union" state because all ranges in a character class have
1337 /// equal priority *and* are non-overlapping (thus, only one can match, so
1338 /// there's never a question of priority in the first place). This saves a
1339 /// fair bit of overhead when traversing an NFA.
1340 ///
1341 /// This routine compiles an empty character class into a "fail" state.
1342 fn c_byte_class(
1343 &self,
1344 cls: &hir::ClassBytes,
1345 ) -> Result<ThompsonRef, BuildError> {
1346 let end = self.add_empty()?;
1347 let mut trans = Vec::with_capacity(cls.ranges().len());
1348 for r in cls.iter() {
1349 trans.push(Transition {
1350 start: r.start(),
1351 end: r.end(),
1352 next: end,
1353 });
1354 }
1355 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1356 }
1357
    /// Compile the given Unicode character class.
    ///
    /// This routine specifically tries to use various types of compression,
    /// since UTF-8 automata of large classes can get quite large. The specific
    /// type of compression used depends on forward vs reverse compilation, and
    /// whether NFA shrinking is enabled or not.
    ///
    /// Aside from repetitions causing lots of repeat groups, this is likely
    /// the single most expensive part of regex compilation. Therefore, a large
    /// part of the expense of compilation may be reduced by disabling Unicode
    /// in the pattern.
    ///
    /// This routine compiles an empty character class into a "fail" state.
    fn c_unicode_class(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // If all we have are ASCII ranges wrapped in a Unicode package, then
        // there is zero reason to bring out the big guns. We can fit all ASCII
        // ranges within a single sparse state.
        if cls.is_ascii() {
            let end = self.add_empty()?;
            let mut trans = Vec::with_capacity(cls.ranges().len());
            for r in cls.iter() {
                // The unwraps below are OK because we've verified that this
                // class only contains ASCII codepoints.
                trans.push(Transition {
                    // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
                    start: u8::try_from(u32::from(r.start())).unwrap(),
                    end: u8::try_from(u32::from(r.end())).unwrap(),
                    next: end,
                });
            }
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
        } else if self.is_reverse() {
            if !self.config.get_shrink() {
                // When we don't want to spend the extra time shrinking, we
                // compile the UTF-8 automaton in reverse using something like
                // the "naive" approach, but will attempt to re-use common
                // suffixes.
                self.c_unicode_class_reverse_with_suffix(cls)
            } else {
                // When we want to shrink our NFA for reverse UTF-8 automata,
                // we cannot feed UTF-8 sequences directly to the UTF-8
                // compiler, since the UTF-8 compiler requires all sequences
                // to be lexicographically sorted. Instead, we organize our
                // sequences into a range trie, which can then output our
                // sequences in the correct order. Unfortunately, building the
                // range trie is fairly expensive (but not nearly as expensive
                // as building a DFA). Hence the reason why the 'shrink' option
                // exists, so that this path can be toggled off. For example,
                // we might want to turn this off if we know we won't be
                // compiling a DFA.
                let mut trie = self.trie_state.borrow_mut();
                trie.clear();

                // Reversing each UTF-8 sequence before inserting it is what
                // makes this compile a *reverse* automaton.
                for rng in cls.iter() {
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
                        seq.reverse();
                        trie.insert(seq.as_slice());
                    }
                }
                let mut builder = self.builder.borrow_mut();
                let mut utf8_state = self.utf8_state.borrow_mut();
                let mut utf8c =
                    Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
                // The trie yields the reversed sequences in sorted order,
                // which is exactly what the UTF-8 compiler needs.
                trie.iter(|seq| {
                    utf8c.add(&seq)?;
                    Ok(())
                })?;
                utf8c.finish()
            }
        } else {
            // In the forward direction, we always shrink our UTF-8 automata
            // because we can stream it right into the UTF-8 compiler. There
            // is almost no downside (in either memory or time) to using this
            // approach.
            let mut builder = self.builder.borrow_mut();
            let mut utf8_state = self.utf8_state.borrow_mut();
            let mut utf8c =
                Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
            for rng in cls.iter() {
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
                    utf8c.add(seq.as_slice())?;
                }
            }
            utf8c.finish()
        }

        // For reference, the code below is the "naive" version of compiling a
        // UTF-8 automaton. It is deliciously simple (and works for both the
        // forward and reverse cases), but will unfortunately produce very
        // large NFAs. When compiling a forward automaton, the size difference
        // can sometimes be an order of magnitude. For example, the '\w' regex
        // will generate about ~3000 NFA states using the naive approach below,
        // but only 283 states when using the approach above. This is because
        // the approach above actually compiles a *minimal* (or near minimal,
        // because of the bounded hashmap for reusing equivalent states) UTF-8
        // automaton.
        //
        // The code below is kept as a reference point in order to make it
        // easier to understand the higher level goal here. Although, it will
        // almost certainly bit-rot, so keep that in mind. Also, if you try to
        // use it, some of the tests in this module will fail because they look
        // for terser byte code produced by the more optimized handling above.
        // But the integration test suite should still pass.
        //
        // One good example of the substantial difference this can make is to
        // compare and contrast performance of the Pike VM when the code below
        // is active vs the code above. Here's an example to try:
        //
        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
        //
        // With Unicode classes generated below, this search takes about 45s on
        // my machine. But with the compressed version above, the search takes
        // only around 1.4s. The NFA is also 20% smaller. This is in part due
        // to the compression, but also because of the utilization of 'sparse'
        // NFA states. They lead to much less state shuffling during the NFA
        // search.
        /*
        let it = cls
            .iter()
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
            .map(|seq| {
                let it = seq
                    .as_slice()
                    .iter()
                    .map(|rng| self.c_range(rng.start, rng.end));
                self.c_concat(it)
            });
        self.c_alt_iter(it)
        */
    }
1491
1492 /// Compile the given Unicode character class in reverse with suffix
1493 /// caching.
1494 ///
1495 /// This is a "quick" way to compile large Unicode classes into reverse
1496 /// UTF-8 automata while doing a small amount of compression on that
1497 /// automata by reusing common suffixes.
1498 ///
1499 /// A more comprehensive compression scheme can be accomplished by using
1500 /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
1501 /// rqanges, and then use Daciuk's algorithm via `Utf8Compiler`.
1502 ///
1503 /// This is the technique used when "NFA shrinking" is disabled.
1504 ///
1505 /// (This also tries to use "sparse" states where possible, just like
1506 /// `c_byte_class` does.)
1507 fn c_unicode_class_reverse_with_suffix(
1508 &self,
1509 cls: &hir::ClassUnicode,
1510 ) -> Result<ThompsonRef, BuildError> {
1511 // N.B. It would likely be better to cache common *prefixes* in the
1512 // reverse direction, but it's not quite clear how to do that. The
1513 // advantage of caching suffixes is that it does give us a win, and
1514 // has a very small additional overhead.
1515 let mut cache = self.utf8_suffix.borrow_mut();
1516 cache.clear();
1517
1518 let union = self.add_union()?;
1519 let alt_end = self.add_empty()?;
1520 for urng in cls.iter() {
1521 for seq in Utf8Sequences::new(urng.start(), urng.end()) {
1522 let mut end = alt_end;
1523 for brng in seq.as_slice() {
1524 let key = Utf8SuffixKey {
1525 from: end,
1526 start: brng.start,
1527 end: brng.end,
1528 };
1529 let hash = cache.hash(&key);
1530 if let Some(id) = cache.get(&key, hash) {
1531 end = id;
1532 continue;
1533 }
1534
1535 let compiled = self.c_range(brng.start, brng.end)?;
1536 self.patch(compiled.end, end)?;
1537 end = compiled.start;
1538 cache.set(key, hash, end);
1539 }
1540 self.patch(union, end)?;
1541 }
1542 }
1543 Ok(ThompsonRef { start: union, end: alt_end })
1544 }
1545
    /// Compile the given HIR look-around assertion to an NFA look-around
    /// assertion.
    ///
    /// Note that flipping the assertion when compiling in reverse mode
    /// (e.g., `Start` becomes `End`) is handled by `add_look`, not here.
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
        let look = match *anchor {
            hir::Look::Start => Look::Start,
            hir::Look::End => Look::End,
            hir::Look::StartLF => Look::StartLF,
            hir::Look::EndLF => Look::EndLF,
            hir::Look::StartCRLF => Look::StartCRLF,
            hir::Look::EndCRLF => Look::EndCRLF,
            hir::Look::WordAscii => Look::WordAscii,
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
            hir::Look::WordUnicode => Look::WordUnicode,
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            hir::Look::WordStartAscii => Look::WordStartAscii,
            hir::Look::WordEndAscii => Look::WordEndAscii,
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
        };
        let id = self.add_look(look)?;
        Ok(ThompsonRef { start: id, end: id })
    }
1572
1573 /// Compile the given byte string to a concatenation of bytes.
1574 fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1575 self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1576 }
1577
1578 /// Compile a "range" state with one transition that may only be followed
1579 /// if the input byte is in the (inclusive) range given.
1580 ///
1581 /// Both the `start` and `end` locations point to the state created.
1582 /// Callers will likely want to keep the `start`, but patch the `end` to
1583 /// point to some other state.
1584 fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1585 let id = self.add_range(start, end)?;
1586 Ok(ThompsonRef { start: id, end: id })
1587 }
1588
1589 /// Compile an "empty" state with one unconditional epsilon transition.
1590 ///
1591 /// Both the `start` and `end` locations point to the state created.
1592 /// Callers will likely want to keep the `start`, but patch the `end` to
1593 /// point to some other state.
1594 fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1595 let id = self.add_empty()?;
1596 Ok(ThompsonRef { start: id, end: id })
1597 }
1598
1599 /// Compile a "fail" state that can never have any outgoing transitions.
1600 fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1601 let id = self.add_fail()?;
1602 Ok(ThompsonRef { start: id, end: id })
1603 }
1604
1605 // The below helpers are meant to be simple wrappers around the
1606 // corresponding Builder methods. For the most part, they let us write
1607 // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1608 // the latter is a mouthful. Some of the methods do inject a little bit
1609 // of extra logic. e.g., Flipping look-around operators when compiling in
1610 // reverse mode.
1611
1612 fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
1613 self.builder.borrow_mut().patch(from, to)
1614 }
1615
1616 fn start_pattern(&self) -> Result<PatternID, BuildError> {
1617 self.builder.borrow_mut().start_pattern()
1618 }
1619
1620 fn finish_pattern(
1621 &self,
1622 start_id: StateID,
1623 ) -> Result<PatternID, BuildError> {
1624 self.builder.borrow_mut().finish_pattern(start_id)
1625 }
1626
1627 fn add_empty(&self) -> Result<StateID, BuildError> {
1628 self.builder.borrow_mut().add_empty()
1629 }
1630
1631 fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1632 self.builder.borrow_mut().add_range(Transition {
1633 start,
1634 end,
1635 next: StateID::ZERO,
1636 })
1637 }
1638
1639 fn add_sparse(
1640 &self,
1641 ranges: Vec<Transition>,
1642 ) -> Result<StateID, BuildError> {
1643 self.builder.borrow_mut().add_sparse(ranges)
1644 }
1645
1646 fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1647 if self.is_reverse() {
1648 look = look.reversed();
1649 }
1650 self.builder.borrow_mut().add_look(StateID::ZERO, look)
1651 }
1652
1653 fn add_union(&self) -> Result<StateID, BuildError> {
1654 self.builder.borrow_mut().add_union(vec![])
1655 }
1656
1657 fn add_union_reverse(&self) -> Result<StateID, BuildError> {
1658 self.builder.borrow_mut().add_union_reverse(vec![])
1659 }
1660
1661 fn add_capture_start(
1662 &self,
1663 capture_index: u32,
1664 name: Option<&str>,
1665 ) -> Result<StateID, BuildError> {
1666 let name = name.map(|n| Arc::from(n));
1667 self.builder.borrow_mut().add_capture_start(
1668 StateID::ZERO,
1669 capture_index,
1670 name,
1671 )
1672 }
1673
1674 fn add_capture_end(
1675 &self,
1676 capture_index: u32,
1677 ) -> Result<StateID, BuildError> {
1678 self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1679 }
1680
1681 fn add_fail(&self) -> Result<StateID, BuildError> {
1682 self.builder.borrow_mut().add_fail()
1683 }
1684
1685 fn add_match(&self) -> Result<StateID, BuildError> {
1686 self.builder.borrow_mut().add_match()
1687 }
1688
    /// Returns true when this compiler is configured to build a reverse NFA.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
1692}
1693
/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial state of this sub-graph.
    pub(crate) start: StateID,
    /// The final state of this sub-graph. Callers typically 'patch' this to
    /// point at whatever comes next.
    pub(crate) end: StateID,
}
1702
/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
/// minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Reusable scratch space (a bounded cache of compiled states plus a
    /// stack of uncompiled nodes), cleared on construction.
    state: &'a mut Utf8State,
    /// An "empty" state created at construction time. (NOTE(review): this
    /// appears to be the shared final state of the compiled automaton —
    /// confirm against the rest of the `Utf8Compiler` impl.)
    target: StateID,
}
1729
/// Reusable scratch state for compiling one UTF-8 automaton at a time.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache of previously compiled states, keyed by their
    /// transitions. This is what enables structure sharing between distinct
    /// UTF-8 sequences.
    compiled: Utf8BoundedMap,
    /// A stack of nodes that have not yet been compiled into NFA states.
    uncompiled: Vec<Utf8Node>,
}
1735
/// A single not-yet-compiled node in the UTF-8 automaton under construction.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Completed transitions out of this node, each with a known
    /// destination state.
    trans: Vec<Transition>,
    /// A pending transition whose destination state is not yet known. It is
    /// completed via `set_last_transition` once the destination is compiled.
    last: Option<Utf8LastTransition>,
}
1741
/// A byte-range transition that has not yet been assigned a destination
/// state.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    /// The inclusive start of the byte range.
    start: u8,
    /// The inclusive end of the byte range.
    end: u8,
}
1747
1748impl Utf8State {
1749 fn new() -> Utf8State {
1750 Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1751 }
1752
1753 fn clear(&mut self) {
1754 self.compiled.clear();
1755 self.uncompiled.clear();
1756 }
1757}
1758
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that writes its states into `builder`.
    ///
    /// This allocates the shared "target" state (the final state that every
    /// compiled UTF-8 sequence transitions to), resets the scratch state and
    /// seeds the uncompiled stack with the root node.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        // The root node stays on the stack until `finish` pops it.
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Finish compilation by freezing and compiling every remaining node,
    /// and return the sub-graph spanning the compiled root and the shared
    /// target state.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 sequence (a sequence of byte ranges) to the automaton.
    ///
    /// Sequences must be added in lexicographic order: the prefix shared
    /// with the previously added sequence is left uncompiled, everything
    /// below it on the stack is compiled, and the new suffix is pushed.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading ranges coincide with the pending "last"
        // transitions currently on the uncompiled stack.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        // Sequences are never duplicated, so at least one trailing range
        // must differ from the previously added sequence.
        assert!(prefix_len < ranges.len());
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile all uncompiled nodes deeper than `from`, bottom-up. Each
    /// compiled node becomes the destination of its parent's pending
    /// transition.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        // The deepest pending transition points at the shared target state.
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Compile a frozen node into a sparse NFA state, reusing an existing
    /// state when an equivalent one is found in the bounded cache.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given (non-empty) suffix of byte ranges onto the stack. The
    /// first range becomes the pending transition of the current top node;
    /// each subsequent range starts a fresh node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        // The top node must have had its previous pending transition frozen
        // by `compile_from` before a new one can be attached.
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions onto the uncompiled stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top uncompiled node, completing its pending transition with
    /// the given destination, and return its frozen transitions.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. It must be the only node remaining and must have
    /// no pending transition (it was frozen by the final `compile_from`).
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Complete the pending transition of the top uncompiled node with the
    /// given destination state, without popping the node.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1864
1865impl Utf8Node {
1866 fn set_last_transition(&mut self, next: StateID) {
1867 if let Some(last) = self.last.take() {
1868 self.trans.push(Transition {
1869 start: last.start,
1870 end: last.end,
1871 next,
1872 });
1873 }
1874 }
1875}
1876
#[cfg(test)]
mod tests {
    use alloc::{vec, vec::Vec};

    use crate::{
        nfa::thompson::{SparseTransitions, State, Transition, NFA},
        util::primitives::{PatternID, SmallIndex, StateID},
    };

    use super::*;

    /// Builds an NFA from the given pattern with capture states and the
    /// unanchored prefix disabled, so that tests can assert exact state
    /// lists.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    /// Shorthand for creating a `PatternID`. Panics if `id` is invalid.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    /// Shorthand for creating a `StateID`. Panics if `id` is invalid.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    /// A state matching exactly one byte, transitioning to `next`.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    /// A state matching any byte in `start..=end`, transitioning to `next`.
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    /// A state with several byte-range transitions, given as
    /// `(start, end, next)` triples.
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    /// An unconditional epsilon split with exactly two alternates.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    /// An unconditional epsilon split with any number of alternates.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    /// A capture state recording the current position into the given slot
    /// before transitioning to `next`.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    /// A state that can never lead to a match.
    fn s_fail() -> State {
        State::Fail
    }

    /// A match state for the given pattern ID.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    // An empty pattern compiles to a lone match state.
    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    // Literals compile to chains of single-byte states, including the
    // multi-byte UTF-8 encoding of non-ASCII codepoints.
    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        assert_eq!(
            build("ā˜ƒ").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    // ASCII classes compile to a single range state, or a sparse state when
    // the class has multiple disjoint ranges.
    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    // Unicode classes compile to UTF-8 automata with shared suffix states.
    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        assert_eq!(
            build(r"[a-zā˜ƒ]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    // Greedy `a?` tries the 'a' branch first; lazy `a??` tries skipping
    // first. The two compilations differ only in alternate order.
    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    // With captures disabled, groups add no states of their own.
    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    // Alternations of single bytes collapse into range/sparse states;
    // otherwise they compile to binary unions.
    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    // Multi-pattern builds get a single shared start state that unions the
    // start states of each pattern.
    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // With `WhichCaptures::All`, both the implicit group 0 and the explicit
    // group get start/end capture states.
    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    // With `WhichCaptures::Implicit`, only group 0 gets capture states.
    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    // With `WhichCaptures::None`, no capture states are added at all.
    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}
2266