1use core::{borrow::Borrow, cell::RefCell};
2
3use alloc::{sync::Arc, vec, vec::Vec};
4
5use regex_syntax::{
6 hir::{self, Hir},
7 utf8::{Utf8Range, Utf8Sequences},
8 ParserBuilder,
9};
10
11use crate::{
12 nfa::thompson::{
13 builder::Builder,
14 error::BuildError,
15 literal_trie::LiteralTrie,
16 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17 nfa::{Transition, NFA},
18 range_trie::RangeTrie,
19 },
20 util::{
21 look::{Look, LookMatcher},
22 primitives::{PatternID, StateID},
23 },
24};
25
/// The configuration used for a Thompson NFA compiler.
#[derive(Clone, Debug, Default)]
pub struct Config {
    // Every field is an `Option` so that `Config::overwrite` can distinguish
    // "explicitly set" from "left at the default".
    /// UTF-8 mode: forbid zero-width matches that split a codepoint.
    /// Defaults to enabled (see `get_utf8`).
    utf8: Option<bool>,
    /// Whether to compile the NFA in reverse, for matching from the end of a
    /// haystack. Defaults to disabled.
    reverse: Option<bool>,
    /// Approximate heap limit, in bytes, for the compiled NFA. The outer
    /// `Option` records whether the setting was given at all; the inner one
    /// is the limit itself, where `None` means "no limit".
    nfa_size_limit: Option<Option<usize>>,
    /// Whether to spend extra time/memory shrinking the NFA. Defaults to
    /// disabled.
    shrink: Option<bool>,
    /// Which kinds of capture states to compile into the NFA. Defaults to
    /// `WhichCaptures::All`.
    which_captures: Option<WhichCaptures>,
    /// The matcher used to interpret look-around assertions.
    look_matcher: Option<LookMatcher>,
    /// Test-only knob controlling whether an unanchored prefix is compiled
    /// into the NFA. Defaults to enabled.
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}
38
39impl Config {
40 /// Return a new default Thompson NFA compiler configuration.
41 pub fn new() -> Config {
42 Config::default()
43 }
44
45 /// Whether to enable UTF-8 mode during search or not.
46 ///
47 /// A regex engine is said to be in UTF-8 mode when it guarantees that
48 /// all matches returned by it have spans consisting of only valid UTF-8.
49 /// That is, it is impossible for a match span to be returned that
50 /// contains any invalid UTF-8.
51 ///
52 /// UTF-8 mode generally consists of two things:
53 ///
54 /// 1. Whether the NFA's states are constructed such that all paths to a
55 /// match state that consume at least one byte always correspond to valid
56 /// UTF-8.
57 /// 2. Whether all paths to a match state that do _not_ consume any bytes
58 /// should always correspond to valid UTF-8 boundaries.
59 ///
60 /// (1) is a guarantee made by whoever constructs the NFA.
61 /// If you're parsing a regex from its concrete syntax, then
62 /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63 /// this guarantee for you. It does it by returning an error if the regex
 /// pattern could ever report a non-empty match span that contains invalid
65 /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66 /// successfully parses, then you're guaranteed that the corresponding NFA
67 /// will only ever report non-empty match spans containing valid UTF-8.
68 ///
69 /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70 /// state graph itself. Consider, for example, the regex `a*`. It matches
71 /// the empty strings in `ā˜ƒ` at positions `0`, `1`, `2` and `3`, where
72 /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73 /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74 /// guarantee must be made at a higher level than the NFA state graph
75 /// itself. This crate deals with this case in each regex engine. Namely,
76 /// when a zero-width match that splits a codepoint is found and UTF-8
77 /// mode enabled, then it is ignored and the engine moves on looking for
78 /// the next match.
79 ///
80 /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81 /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82 /// engines that empty matches that split codepoints should be banned.
83 ///
84 /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85 /// it only makes sense to enable this option when you *know* your haystack
86 /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87 /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88 /// behavior**.
89 ///
90 /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91 /// simultaneously *disabling* this option. That would ensure all non-empty
92 /// match spans are valid UTF-8, but that empty match spans may still split
93 /// a codepoint or match at other places that aren't valid UTF-8.
94 ///
95 /// In general, this mode is only relevant if your regex can match the
96 /// empty string. Most regexes don't.
97 ///
98 /// This is enabled by default.
99 ///
100 /// # Example
101 ///
102 /// This example shows how UTF-8 mode can impact the match spans that may
103 /// be reported in certain cases.
104 ///
105 /// ```
106 /// use regex_automata::{
107 /// nfa::thompson::{self, pikevm::PikeVM},
108 /// Match, Input,
109 /// };
110 ///
111 /// let re = PikeVM::new("")?;
112 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113 ///
114 /// // UTF-8 mode is enabled by default.
115 /// let mut input = Input::new("ā˜ƒ");
116 /// re.search(&mut cache, &input, &mut caps);
117 /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118 ///
119 /// // Even though an empty regex matches at 1..1, our next match is
120 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121 /// // three bytes long).
122 /// input.set_start(1);
123 /// re.search(&mut cache, &input, &mut caps);
124 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125 ///
126 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127 /// let re = PikeVM::builder()
128 /// .thompson(thompson::Config::new().utf8(false))
129 /// .build("")?;
130 /// re.search(&mut cache, &input, &mut caps);
131 /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132 ///
133 /// input.set_start(2);
134 /// re.search(&mut cache, &input, &mut caps);
135 /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136 ///
137 /// input.set_start(3);
138 /// re.search(&mut cache, &input, &mut caps);
139 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140 ///
141 /// input.set_start(4);
142 /// re.search(&mut cache, &input, &mut caps);
143 /// assert_eq!(None, caps.get_match());
144 ///
145 /// # Ok::<(), Box<dyn std::error::Error>>(())
146 /// ```
147 pub fn utf8(mut self, yes: bool) -> Config {
148 self.utf8 = Some(yes);
149 self
150 }
151
152 /// Reverse the NFA.
153 ///
154 /// A NFA reversal is performed by reversing all of the concatenated
155 /// sub-expressions in the original pattern, recursively. (Look around
156 /// operators are also inverted.) The resulting NFA can be used to match
157 /// the pattern starting from the end of a string instead of the beginning
158 /// of a string.
159 ///
160 /// Reversing the NFA is useful for building a reverse DFA, which is most
161 /// useful for finding the start of a match after its ending position has
162 /// been found. NFA execution engines typically do not work on reverse
163 /// NFAs. For example, currently, the Pike VM reports the starting location
164 /// of matches without a reverse NFA.
165 ///
166 /// Currently, enabling this setting requires disabling the
167 /// [`captures`](Config::captures) setting. If both are enabled, then the
168 /// compiler will return an error. It is expected that this limitation will
169 /// be lifted in the future.
170 ///
171 /// This is disabled by default.
172 ///
173 /// # Example
174 ///
175 /// This example shows how to build a DFA from a reverse NFA, and then use
176 /// the DFA to search backwards.
177 ///
178 /// ```
179 /// use regex_automata::{
180 /// dfa::{self, Automaton},
181 /// nfa::thompson::{NFA, WhichCaptures},
182 /// HalfMatch, Input,
183 /// };
184 ///
185 /// let dfa = dfa::dense::Builder::new()
186 /// .thompson(NFA::config()
187 /// .which_captures(WhichCaptures::None)
188 /// .reverse(true)
189 /// )
190 /// .build("baz[0-9]+")?;
191 /// let expected = Some(HalfMatch::must(0, 3));
192 /// assert_eq!(
193 /// expected,
194 /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195 /// );
196 ///
197 /// # Ok::<(), Box<dyn std::error::Error>>(())
198 /// ```
199 pub fn reverse(mut self, yes: bool) -> Config {
200 self.reverse = Some(yes);
201 self
202 }
203
204 /// Sets an approximate size limit on the total heap used by the NFA being
205 /// compiled.
206 ///
207 /// This permits imposing constraints on the size of a compiled NFA. This
208 /// may be useful in contexts where the regex pattern is untrusted and one
209 /// wants to avoid using too much memory.
210 ///
211 /// This size limit does not apply to auxiliary heap used during
212 /// compilation that is not part of the built NFA.
213 ///
214 /// Note that this size limit is applied during compilation in order for
215 /// the limit to prevent too much heap from being used. However, the
216 /// implementation may use an intermediate NFA representation that is
217 /// otherwise slightly bigger than the final public form. Since the size
218 /// limit may be applied to an intermediate representation, there is not
219 /// necessarily a precise correspondence between the configured size limit
220 /// and the heap usage of the final NFA.
221 ///
222 /// There is no size limit by default.
223 ///
224 /// # Example
225 ///
226 /// This example demonstrates how Unicode mode can greatly increase the
227 /// size of the NFA.
228 ///
229 /// ```
230 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231 /// use regex_automata::nfa::thompson::NFA;
232 ///
233 /// // 300KB isn't enough!
234 /// NFA::compiler()
235 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
236 /// .build(r"\w{20}")
237 /// .unwrap_err();
238 ///
239 /// // ... but 400KB probably is.
240 /// let nfa = NFA::compiler()
241 /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
242 /// .build(r"\w{20}")?;
243 ///
244 /// assert_eq!(nfa.pattern_len(), 1);
245 ///
246 /// # Ok::<(), Box<dyn std::error::Error>>(())
247 /// ```
248 pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249 self.nfa_size_limit = Some(bytes);
250 self
251 }
252
253 /// Apply best effort heuristics to shrink the NFA at the expense of more
254 /// time/memory.
255 ///
256 /// Generally speaking, if one is using an NFA to compile a DFA, then the
257 /// extra time used to shrink the NFA will be more than made up for during
258 /// DFA construction (potentially by a lot). In other words, enabling this
259 /// can substantially decrease the overall amount of time it takes to build
260 /// a DFA.
261 ///
262 /// A reason to keep this disabled is if you want to compile an NFA and
263 /// start using it as quickly as possible without needing to build a DFA,
264 /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265 /// an NFA simulation or for a lazy DFA.
266 ///
267 /// NFA shrinking is currently most useful when compiling a reverse
268 /// NFA with large Unicode character classes. In particular, it trades
269 /// additional CPU time during NFA compilation in favor of generating fewer
270 /// NFA states.
271 ///
272 /// This is disabled by default because it can increase compile times
273 /// quite a bit if you aren't building a full DFA.
274 ///
275 /// # Example
276 ///
277 /// This example shows that NFA shrinking can lead to substantial space
278 /// savings in some cases. Notice that, as noted above, we build a reverse
279 /// DFA and use a pattern with a large Unicode character class.
280 ///
281 /// ```
282 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284 ///
285 /// // Currently we have to disable captures when enabling reverse NFA.
286 /// let config = NFA::config()
287 /// .which_captures(WhichCaptures::None)
288 /// .reverse(true);
289 /// let not_shrunk = NFA::compiler()
290 /// .configure(config.clone().shrink(false))
291 /// .build(r"\w")?;
292 /// let shrunk = NFA::compiler()
293 /// .configure(config.clone().shrink(true))
294 /// .build(r"\w")?;
295 ///
296 /// // While a specific shrink factor is not guaranteed, the savings can be
297 /// // considerable in some cases.
298 /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299 ///
300 /// # Ok::<(), Box<dyn std::error::Error>>(())
301 /// ```
302 pub fn shrink(mut self, yes: bool) -> Config {
303 self.shrink = Some(yes);
304 self
305 }
306
307 /// Whether to include 'Capture' states in the NFA.
308 ///
309 /// Currently, enabling this setting requires disabling the
310 /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311 /// compiler will return an error. It is expected that this limitation will
312 /// be lifted in the future.
313 ///
314 /// This is enabled by default.
315 ///
316 /// # Example
317 ///
318 /// This example demonstrates that some regex engines, like the Pike VM,
319 /// require capturing states to be present in the NFA to report match
320 /// offsets.
321 ///
322 /// (Note that since this method is deprecated, the example below uses
323 /// [`Config::which_captures`] to disable capture states.)
324 ///
325 /// ```
326 /// use regex_automata::nfa::thompson::{
327 /// pikevm::PikeVM,
328 /// NFA,
329 /// WhichCaptures,
330 /// };
331 ///
332 /// let re = PikeVM::builder()
333 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
334 /// .build(r"[a-z]+")?;
335 /// let mut cache = re.create_cache();
336 ///
337 /// assert!(re.is_match(&mut cache, "abc"));
338 /// assert_eq!(None, re.find(&mut cache, "abc"));
339 ///
340 /// # Ok::<(), Box<dyn std::error::Error>>(())
341 /// ```
342 #[deprecated(since = "0.3.5", note = "use which_captures instead")]
343 pub fn captures(self, yes: bool) -> Config {
344 self.which_captures(if yes {
345 WhichCaptures::All
346 } else {
347 WhichCaptures::None
348 })
349 }
350
351 /// Configures what kinds of capture groups are compiled into
352 /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353 /// Thompson NFA.
354 ///
355 /// Currently, using any option except for [`WhichCaptures::None`] requires
356 /// disabling the [`reverse`](Config::reverse) setting. If both are
357 /// enabled, then the compiler will return an error. It is expected that
358 /// this limitation will be lifted in the future.
359 ///
360 /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
361 /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
362 /// overhead of capture states for explicit groups. Usually this occurs
363 /// when one wants to use the `PikeVM` only for determining the overall
364 /// match. Otherwise, the `PikeVM` could use much more memory than is
365 /// necessary.
366 ///
367 /// # Example
368 ///
369 /// This example demonstrates that some regex engines, like the Pike VM,
370 /// require capturing states to be present in the NFA to report match
371 /// offsets.
372 ///
373 /// ```
374 /// use regex_automata::nfa::thompson::{
375 /// pikevm::PikeVM,
376 /// NFA,
377 /// WhichCaptures,
378 /// };
379 ///
380 /// let re = PikeVM::builder()
381 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
382 /// .build(r"[a-z]+")?;
383 /// let mut cache = re.create_cache();
384 ///
385 /// assert!(re.is_match(&mut cache, "abc"));
386 /// assert_eq!(None, re.find(&mut cache, "abc"));
387 ///
388 /// # Ok::<(), Box<dyn std::error::Error>>(())
389 /// ```
390 ///
391 /// The same applies to the bounded backtracker:
392 ///
393 /// ```
394 /// use regex_automata::nfa::thompson::{
395 /// backtrack::BoundedBacktracker,
396 /// NFA,
397 /// WhichCaptures,
398 /// };
399 ///
400 /// let re = BoundedBacktracker::builder()
401 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
402 /// .build(r"[a-z]+")?;
403 /// let mut cache = re.create_cache();
404 ///
405 /// assert!(re.try_is_match(&mut cache, "abc")?);
406 /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407 ///
408 /// # Ok::<(), Box<dyn std::error::Error>>(())
409 /// ```
410 pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411 self.which_captures = Some(which_captures);
412 self
413 }
414
415 /// Sets the look-around matcher that should be used with this NFA.
416 ///
417 /// A look-around matcher determines how to match look-around assertions.
418 /// In particular, some assertions are configurable. For example, the
419 /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420 /// from the default of `\n` to any other byte.
421 ///
422 /// # Example
423 ///
424 /// This shows how to change the line terminator for multi-line assertions.
425 ///
426 /// ```
427 /// use regex_automata::{
428 /// nfa::thompson::{self, pikevm::PikeVM},
429 /// util::look::LookMatcher,
430 /// Match, Input,
431 /// };
432 ///
433 /// let mut lookm = LookMatcher::new();
434 /// lookm.set_line_terminator(b'\x00');
435 ///
436 /// let re = PikeVM::builder()
437 /// .thompson(thompson::Config::new().look_matcher(lookm))
438 /// .build(r"(?m)^[a-z]+$")?;
439 /// let mut cache = re.create_cache();
440 ///
441 /// // Multi-line assertions now use NUL as a terminator.
442 /// assert_eq!(
443 /// Some(Match::must(0, 1..4)),
444 /// re.find(&mut cache, b"\x00abc\x00"),
445 /// );
446 /// // ... and \n is no longer recognized as a terminator.
447 /// assert_eq!(
448 /// None,
449 /// re.find(&mut cache, b"\nabc\n"),
450 /// );
451 ///
452 /// # Ok::<(), Box<dyn std::error::Error>>(())
453 /// ```
454 pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455 self.look_matcher = Some(m);
456 self
457 }
458
459 /// Whether to compile an unanchored prefix into this NFA.
460 ///
461 /// This is enabled by default. It is made available for tests only to make
462 /// it easier to unit test the output of the compiler.
463 #[cfg(test)]
464 fn unanchored_prefix(mut self, yes: bool) -> Config {
465 self.unanchored_prefix = Some(yes);
466 self
467 }
468
469 /// Returns whether this configuration has enabled UTF-8 mode.
470 pub fn get_utf8(&self) -> bool {
471 self.utf8.unwrap_or(true)
472 }
473
474 /// Returns whether this configuration has enabled reverse NFA compilation.
475 pub fn get_reverse(&self) -> bool {
476 self.reverse.unwrap_or(false)
477 }
478
479 /// Return the configured NFA size limit, if it exists, in the number of
480 /// bytes of heap used.
481 pub fn get_nfa_size_limit(&self) -> Option<usize> {
482 self.nfa_size_limit.unwrap_or(None)
483 }
484
485 /// Return whether NFA shrinking is enabled.
486 pub fn get_shrink(&self) -> bool {
487 self.shrink.unwrap_or(false)
488 }
489
490 /// Return whether NFA compilation is configured to produce capture states.
491 #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
492 pub fn get_captures(&self) -> bool {
493 self.get_which_captures().is_any()
494 }
495
496 /// Return what kinds of capture states will be compiled into an NFA.
497 pub fn get_which_captures(&self) -> WhichCaptures {
498 self.which_captures.unwrap_or(WhichCaptures::All)
499 }
500
501 /// Return the look-around matcher for this NFA.
502 pub fn get_look_matcher(&self) -> LookMatcher {
503 self.look_matcher.clone().unwrap_or(LookMatcher::default())
504 }
505
506 /// Return whether NFA compilation is configured to include an unanchored
507 /// prefix.
508 ///
509 /// This is always false when not in test mode.
    fn get_unanchored_prefix(&self) -> bool {
        // In test builds, the prefix can be suppressed via the test-only
        // `Config::unanchored_prefix` setter; it defaults to enabled.
        #[cfg(test)]
        {
            self.unanchored_prefix.unwrap_or(true)
        }
        // Outside of test builds the option does not exist at all, so the
        // unanchored prefix is unconditionally enabled.
        #[cfg(not(test))]
        {
            true
        }
    }
520
521 /// Overwrite the default configuration such that the options in `o` are
522 /// always used. If an option in `o` is not set, then the corresponding
523 /// option in `self` is used. If it's not set in `self` either, then it
524 /// remains not set.
    pub(crate) fn overwrite(&self, o: Config) -> Config {
        // For each option: prefer the override `o` when set, otherwise fall
        // back to `self`; if neither is set, the option remains unset.
        Config {
            utf8: o.utf8.or(self.utf8),
            reverse: o.reverse.or(self.reverse),
            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
            shrink: o.shrink.or(self.shrink),
            which_captures: o.which_captures.or(self.which_captures),
            // `or_else` defers the clone so it only happens when the
            // override does not provide a matcher.
            look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
            // This field only exists in test builds.
            #[cfg(test)]
            unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
        }
    }
537}
538
539/// A configuration indicating which kinds of
540/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
541///
542/// This configuration can be used with [`Config::which_captures`] to control
543/// which capture states are compiled into a Thompson NFA.
544///
545/// The default configuration is [`WhichCaptures::All`].
#[derive(Clone, Copy, Debug)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    ///
    /// This is also the variant produced by the `Default` impl below.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA).
    None,
}
568
569impl Default for WhichCaptures {
570 fn default() -> WhichCaptures {
571 WhichCaptures::All
572 }
573}
574
575impl WhichCaptures {
576 /// Returns true if this configuration indicates that no capture states
577 /// should be produced in an NFA.
578 pub fn is_none(&self) -> bool {
579 matches!(*self, WhichCaptures::None)
580 }
581
582 /// Returns true if this configuration indicates that some capture states
583 /// should be added to an NFA. Note that this might only include capture
584 /// states for implicit capture groups.
585 pub fn is_any(&self) -> bool {
586 !self.is_none()
587 }
588}
589
590/*
591This compiler below uses Thompson's construction algorithm. The compiler takes
a `regex_syntax::Hir` as input and emits an NFA graph as output. The NFA graph
593is structured in a way that permits it to be executed by a virtual machine and
594also used to efficiently build a DFA.
595
The compiler deals with a slightly larger set of NFA states than what appears
in a final NFA (as exhibited by builder::State and nfa::State). Notably a
598compiler state includes an empty node that has exactly one unconditional
599epsilon transition to the next state. In other words, it's a "goto" instruction
600if one views Thompson's NFA as a set of bytecode instructions. These goto
601instructions are removed in a subsequent phase before returning the NFA to the
602caller. The purpose of these empty nodes is that they make the construction
603algorithm substantially simpler to implement. We remove them before returning
604to the caller because they can represent substantial overhead when traversing
605the NFA graph (either while searching using the NFA directly or while building
606a DFA).
607
608In the future, it would be nice to provide a Glushkov compiler as well, as it
609would work well as a bit-parallel NFA for smaller regexes. But the Thompson
610construction is one I'm more familiar with and seems more straight-forward to
611deal with when it comes to large Unicode character classes.
612
613Internally, the compiler uses interior mutability to improve composition in the
614face of the borrow checker. In particular, we'd really like to be able to write
615things like this:
616
617 self.c_concat(exprs.iter().map(|e| self.c(e)))
618
619Which elegantly uses iterators to build up a sequence of compiled regex
620sub-expressions and then hands it off to the concatenating compiler routine.
621Without interior mutability, the borrow checker won't let us borrow `self`
622mutably both inside and outside the closure at the same time.
623*/
624
625/// A builder for compiling an NFA from a regex's high-level intermediate
626/// representation (HIR).
627///
628/// This compiler provides a way to translate a parsed regex pattern into an
629/// NFA state graph. The NFA state graph can either be used directly to execute
630/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
631///
632/// This compiler provides APIs both for compiling regex patterns directly from
633/// their concrete syntax, or via a [`regex_syntax::hir::Hir`].
634///
635/// This compiler has various options that may be configured via
636/// [`thompson::Config`](Config).
637///
638/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
639/// A `Builder` provides a lower level API that is uncoupled from a regex
640/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
641/// together an NFA by hand. See its docs for examples.
642///
643/// # Example: compilation from concrete syntax
644///
645/// This shows how to compile an NFA from a pattern string while setting a size
646/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
647///
648/// ```
649/// use regex_automata::{
650/// nfa::thompson::{NFA, pikevm::PikeVM},
651/// Match,
652/// };
653///
654/// let config = NFA::config().nfa_size_limit(Some(1_000));
655/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
656///
657/// let re = PikeVM::new_from_nfa(nfa)?;
658/// let mut cache = re.create_cache();
659/// let mut caps = re.create_captures();
660/// let expected = Some(Match::must(0, 3..4));
661/// re.captures(&mut cache, "!@#A#@!", &mut caps);
662/// assert_eq!(expected, caps.get_match());
663///
664/// # Ok::<(), Box<dyn std::error::Error>>(())
665/// ```
666///
667/// # Example: compilation from HIR
668///
669/// This shows how to hand assemble a regular expression via its HIR, and then
670/// compile an NFA directly from it.
671///
672/// ```
673/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
674/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
675///
676/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
677/// ClassBytesRange::new(b'0', b'9'),
678/// ClassBytesRange::new(b'A', b'Z'),
679/// ClassBytesRange::new(b'_', b'_'),
680/// ClassBytesRange::new(b'a', b'z'),
681/// ])));
682///
683/// let config = NFA::config().nfa_size_limit(Some(1_000));
684/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
685///
686/// let re = PikeVM::new_from_nfa(nfa)?;
687/// let mut cache = re.create_cache();
688/// let mut caps = re.create_captures();
689/// let expected = Some(Match::must(0, 3..4));
690/// re.captures(&mut cache, "!@#A#@!", &mut caps);
691/// assert_eq!(expected, caps.get_match());
692///
693/// # Ok::<(), Box<dyn std::error::Error>>(())
694/// ```
#[derive(Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    ///
    /// Wrapped in a `RefCell` (as are the scratch-state fields below) so
    /// that compilation routines can share `&self` while still mutating
    /// state — see the module comment on interior mutability.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}
715
716impl Compiler {
717 /// Create a new NFA builder with its default configuration.
718 pub fn new() -> Compiler {
719 Compiler {
720 parser: ParserBuilder::new(),
721 config: Config::default(),
722 builder: RefCell::new(Builder::new()),
723 utf8_state: RefCell::new(Utf8State::new()),
724 trie_state: RefCell::new(RangeTrie::new()),
725 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
726 }
727 }
728
729 /// Compile the given regular expression pattern into an NFA.
730 ///
731 /// If there was a problem parsing the regex, then that error is returned.
732 ///
733 /// Otherwise, if there was a problem building the NFA, then an error is
734 /// returned. The only error that can occur is if the compiled regex would
735 /// exceed the size limits configured on this builder, or if any part of
736 /// the NFA would exceed the integer representations used. (For example,
737 /// too many states might plausibly occur on a 16-bit target.)
738 ///
739 /// # Example
740 ///
741 /// ```
742 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
743 ///
744 /// let config = NFA::config().nfa_size_limit(Some(1_000));
745 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
746 ///
747 /// let re = PikeVM::new_from_nfa(nfa)?;
748 /// let mut cache = re.create_cache();
749 /// let mut caps = re.create_captures();
750 /// let expected = Some(Match::must(0, 3..4));
751 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
752 /// assert_eq!(expected, caps.get_match());
753 ///
754 /// # Ok::<(), Box<dyn std::error::Error>>(())
755 /// ```
756 pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
757 self.build_many(&[pattern])
758 }
759
760 /// Compile the given regular expression patterns into a single NFA.
761 ///
762 /// When matches are returned, the pattern ID corresponds to the index of
763 /// the pattern in the slice given.
764 ///
765 /// # Example
766 ///
767 /// ```
768 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
769 ///
770 /// let config = NFA::config().nfa_size_limit(Some(1_000));
771 /// let nfa = NFA::compiler().configure(config).build_many(&[
772 /// r"(?-u)\s",
773 /// r"(?-u)\w",
774 /// ])?;
775 ///
776 /// let re = PikeVM::new_from_nfa(nfa)?;
777 /// let mut cache = re.create_cache();
778 /// let mut caps = re.create_captures();
779 /// let expected = Some(Match::must(1, 1..2));
780 /// re.captures(&mut cache, "!A! !A!", &mut caps);
781 /// assert_eq!(expected, caps.get_match());
782 ///
783 /// # Ok::<(), Box<dyn std::error::Error>>(())
784 /// ```
785 pub fn build_many<P: AsRef<str>>(
786 &self,
787 patterns: &[P],
788 ) -> Result<NFA, BuildError> {
789 let mut hirs = vec![];
790 for p in patterns {
791 hirs.push(
792 self.parser
793 .build()
794 .parse(p.as_ref())
795 .map_err(BuildError::syntax)?,
796 );
797 debug!("parsed: {:?}", p.as_ref());
798 }
799 self.build_many_from_hir(&hirs)
800 }
801
802 /// Compile the given high level intermediate representation of a regular
803 /// expression into an NFA.
804 ///
805 /// If there was a problem building the NFA, then an error is returned. The
806 /// only error that can occur is if the compiled regex would exceed the
807 /// size limits configured on this builder, or if any part of the NFA would
808 /// exceed the integer representations used. (For example, too many states
809 /// might plausibly occur on a 16-bit target.)
810 ///
811 /// # Example
812 ///
813 /// ```
814 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
815 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
816 ///
817 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
818 /// ClassBytesRange::new(b'0', b'9'),
819 /// ClassBytesRange::new(b'A', b'Z'),
820 /// ClassBytesRange::new(b'_', b'_'),
821 /// ClassBytesRange::new(b'a', b'z'),
822 /// ])));
823 ///
824 /// let config = NFA::config().nfa_size_limit(Some(1_000));
825 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
826 ///
827 /// let re = PikeVM::new_from_nfa(nfa)?;
828 /// let mut cache = re.create_cache();
829 /// let mut caps = re.create_captures();
830 /// let expected = Some(Match::must(0, 3..4));
831 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
832 /// assert_eq!(expected, caps.get_match());
833 ///
834 /// # Ok::<(), Box<dyn std::error::Error>>(())
835 /// ```
836 pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
837 self.build_many_from_hir(&[expr])
838 }
839
840 /// Compile the given high level intermediate representations of regular
841 /// expressions into a single NFA.
842 ///
843 /// When matches are returned, the pattern ID corresponds to the index of
844 /// the pattern in the slice given.
845 ///
846 /// # Example
847 ///
848 /// ```
849 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
850 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
851 ///
852 /// let hirs = &[
853 /// Hir::class(Class::Bytes(ClassBytes::new(vec![
854 /// ClassBytesRange::new(b'\t', b'\r'),
855 /// ClassBytesRange::new(b' ', b' '),
856 /// ]))),
857 /// Hir::class(Class::Bytes(ClassBytes::new(vec![
858 /// ClassBytesRange::new(b'0', b'9'),
859 /// ClassBytesRange::new(b'A', b'Z'),
860 /// ClassBytesRange::new(b'_', b'_'),
861 /// ClassBytesRange::new(b'a', b'z'),
862 /// ]))),
863 /// ];
864 ///
865 /// let config = NFA::config().nfa_size_limit(Some(1_000));
866 /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
867 ///
868 /// let re = PikeVM::new_from_nfa(nfa)?;
869 /// let mut cache = re.create_cache();
870 /// let mut caps = re.create_captures();
871 /// let expected = Some(Match::must(1, 1..2));
872 /// re.captures(&mut cache, "!A! !A!", &mut caps);
873 /// assert_eq!(expected, caps.get_match());
874 ///
875 /// # Ok::<(), Box<dyn std::error::Error>>(())
876 /// ```
877 pub fn build_many_from_hir<H: Borrow<Hir>>(
878 &self,
879 exprs: &[H],
880 ) -> Result<NFA, BuildError> {
881 self.compile(exprs)
882 }
883
884 /// Apply the given NFA configuration options to this builder.
885 ///
886 /// # Example
887 ///
888 /// ```
889 /// use regex_automata::nfa::thompson::NFA;
890 ///
891 /// let config = NFA::config().nfa_size_limit(Some(1_000));
892 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
893 /// assert_eq!(nfa.pattern_len(), 1);
894 ///
895 /// # Ok::<(), Box<dyn std::error::Error>>(())
896 /// ```
897 pub fn configure(&mut self, config: Config) -> &mut Compiler {
898 self.config = self.config.overwrite(config);
899 self
900 }
901
902 /// Set the syntax configuration for this builder using
903 /// [`syntax::Config`](crate::util::syntax::Config).
904 ///
905 /// This permits setting things like case insensitivity, Unicode and multi
906 /// line mode.
907 ///
908 /// This syntax configuration only applies when an NFA is built directly
909 /// from a pattern string. If an NFA is built from an HIR, then all syntax
910 /// settings are ignored.
911 ///
912 /// # Example
913 ///
914 /// ```
915 /// use regex_automata::{nfa::thompson::NFA, util::syntax};
916 ///
917 /// let syntax_config = syntax::Config::new().unicode(false);
918 /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
919 /// // If Unicode were enabled, the number of states would be much bigger.
920 /// assert!(nfa.states().len() < 15);
921 ///
922 /// # Ok::<(), Box<dyn std::error::Error>>(())
923 /// ```
924 pub fn syntax(
925 &mut self,
926 config: crate::util::syntax::Config,
927 ) -> &mut Compiler {
928 config.apply(&mut self.parser);
929 self
930 }
931}
932
933impl Compiler {
934 /// Compile the sequence of HIR expressions given. Pattern IDs are
935 /// allocated starting from 0, in correspondence with the slice given.
936 ///
937 /// It is legal to provide an empty slice. In that case, the NFA returned
938 /// has no patterns and will never match anything.
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
        // Each pattern gets its own PatternID, so the number of patterns is
        // capped by the pattern ID space.
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        // Reverse compilation with capture states enabled is unsupported, so
        // reject the combination up front.
        if self.config.get_reverse()
            && self.config.get_which_captures().is_any()
        {
            return Err(BuildError::unsupported_captures());
        }

        // Reset the shared builder and push the relevant config down into it.
        self.builder.borrow_mut().clear();
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
        self.builder
            .borrow_mut()
            .set_look_matcher(self.config.get_look_matcher());
        self.builder
            .borrow_mut()
            .set_size_limit(self.config.get_nfa_size_limit())?;

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            let props = e.borrow().properties();
            if self.config.get_reverse() {
                // In reverse mode, searching proceeds from the end of the
                // haystack, so anchoring is determined by a trailing '$'.
                props.look_set_suffix().contains(hir::Look::End)
            } else {
                props.look_set_prefix().contains(hir::Look::Start)
            }
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // Equivalent to a non-greedy '(?s-u:.)*?' prefix.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Compile each pattern into its own sub-graph ending at a match
        // state, then join all patterns with a single alternation.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            // Wrap each pattern in group 0 so that implicit capture states
            // are created when captures are enabled.
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }
995
996 /// Compile an arbitrary HIR expression.
997 fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
998 use regex_syntax::hir::{Class, HirKind::*};
999
1000 match *expr.kind() {
1001 Empty => self.c_empty(),
1002 Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1003 Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1004 Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1005 Look(ref look) => self.c_look(look),
1006 Repetition(ref rep) => self.c_repetition(rep),
1007 Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1008 Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1009 Alternation(ref es) => self.c_alt_slice(es),
1010 }
1011 }
1012
1013 /// Compile a concatenation of the sub-expressions yielded by the given
1014 /// iterator. If the iterator yields no elements, then this compiles down
1015 /// to an "empty" state that always matches.
1016 ///
1017 /// If the compiler is in reverse mode, then the expressions given are
1018 /// automatically compiled in reverse.
1019 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1020 where
1021 I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1022 {
1023 let first = if self.is_reverse() { it.next_back() } else { it.next() };
1024 let ThompsonRef { start, mut end } = match first {
1025 Some(result) => result?,
1026 None => return self.c_empty(),
1027 };
1028 loop {
1029 let next =
1030 if self.is_reverse() { it.next_back() } else { it.next() };
1031 let compiled = match next {
1032 Some(result) => result?,
1033 None => break,
1034 };
1035 self.patch(end, compiled.start)?;
1036 end = compiled.end;
1037 }
1038 Ok(ThompsonRef { start, end })
1039 }
1040
1041 /// Compile an alternation of the given HIR values.
1042 ///
1043 /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1044 /// of an iterator of compiled NFA subgraphs. The point of accepting a
1045 /// slice here is that it opens up some optimization opportunities. For
1046 /// example, if all of the HIR values are literals, then this routine might
1047 /// re-shuffle them to make NFA epsilon closures substantially faster.
1048 fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1049 // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1050 let literal_count = exprs
1051 .iter()
1052 .filter(|e| {
1053 matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1054 })
1055 .count();
1056 if literal_count <= 1 || literal_count < exprs.len() {
1057 return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1058 }
1059
1060 let mut trie = if self.is_reverse() {
1061 LiteralTrie::reverse()
1062 } else {
1063 LiteralTrie::forward()
1064 };
1065 for expr in exprs.iter() {
1066 let literal = match *expr.kind() {
1067 hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1068 _ => unreachable!(),
1069 };
1070 trie.add(literal)?;
1071 }
1072 trie.compile(&mut self.builder.borrow_mut())
1073 }
1074
1075 /// Compile an alternation, where each element yielded by the given
1076 /// iterator represents an item in the alternation. If the iterator yields
1077 /// no elements, then this compiles down to a "fail" state.
1078 ///
1079 /// In an alternation, expressions appearing earlier are "preferred" at
1080 /// match time over expressions appearing later. At least, this is true
1081 /// when using "leftmost first" match semantics. (If "leftmost longest" are
1082 /// ever added in the future, then this preference order of priority would
1083 /// not apply in that mode.)
1084 fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1085 where
1086 I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1087 {
1088 let first = match it.next() {
1089 None => return self.c_fail(),
1090 Some(result) => result?,
1091 };
1092 let second = match it.next() {
1093 None => return Ok(first),
1094 Some(result) => result?,
1095 };
1096
1097 let union = self.add_union()?;
1098 let end = self.add_empty()?;
1099 self.patch(union, first.start)?;
1100 self.patch(first.end, end)?;
1101 self.patch(union, second.start)?;
1102 self.patch(second.end, end)?;
1103 for result in it {
1104 let compiled = result?;
1105 self.patch(union, compiled.start)?;
1106 self.patch(compiled.end, end)?;
1107 }
1108 Ok(ThompsonRef { start: union, end })
1109 }
1110
1111 /// Compile the given capture sub-expression. `expr` should be the
1112 /// sub-expression contained inside the capture. If "capture" states are
1113 /// enabled, then they are added as appropriate.
1114 ///
1115 /// This accepts the pieces of a capture instead of a `hir::Capture` so
1116 /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1117 /// adding the entire pattern as if it were a group in order to create
1118 /// appropriate "capture" states in the NFA.
1119 fn c_cap(
1120 &self,
1121 index: u32,
1122 name: Option<&str>,
1123 expr: &Hir,
1124 ) -> Result<ThompsonRef, BuildError> {
1125 match self.config.get_which_captures() {
1126 // No capture states means we always skip them.
1127 WhichCaptures::None => return self.c(expr),
1128 // Implicit captures states means we only add when index==0 since
1129 // index==0 implies the group is implicit.
1130 WhichCaptures::Implicit if index > 0 => return self.c(expr),
1131 _ => {}
1132 }
1133
1134 let start = self.add_capture_start(index, name)?;
1135 let inner = self.c(expr)?;
1136 let end = self.add_capture_end(index)?;
1137 self.patch(start, inner.start)?;
1138 self.patch(inner.end, end)?;
1139 Ok(ThompsonRef { start, end })
1140 }
1141
1142 /// Compile the given repetition expression. This handles all types of
1143 /// repetitions and greediness.
1144 fn c_repetition(
1145 &self,
1146 rep: &hir::Repetition,
1147 ) -> Result<ThompsonRef, BuildError> {
1148 match (rep.min, rep.max) {
1149 (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1150 (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1151 (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1152 (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1153 }
1154 }
1155
1156 /// Compile the given expression such that it matches at least `min` times,
1157 /// but no more than `max` times.
1158 ///
1159 /// When `greedy` is true, then the preference is for the expression to
1160 /// match as much as possible. Otherwise, it will match as little as
1161 /// possible.
    fn c_bounded(
        &self,
        expr: &Hir,
        greedy: bool,
        min: u32,
        max: u32,
    ) -> Result<ThompsonRef, BuildError> {
        // The mandatory part: 'min' copies concatenated together.
        let prefix = self.c_exactly(expr, min)?;
        if min == max {
            return Ok(prefix);
        }

        // It is tempting here to compile the rest here as a concatenation
        // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
        // were `aaa?a?a?`. The problem here is that it leads to this program:
        //
        // >000000: 61 => 01
        // 000001: 61 => 02
        // 000002: union(03, 04)
        // 000003: 61 => 04
        // 000004: union(05, 06)
        // 000005: 61 => 06
        // 000006: union(07, 08)
        // 000007: 61 => 08
        // 000008: MATCH
        //
        // And effectively, once you hit state 2, the epsilon closure will
        // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
        // to instead compile it like so:
        //
        // >000000: 61 => 01
        // 000001: 61 => 02
        // 000002: union(03, 08)
        // 000003: 61 => 04
        // 000004: union(05, 08)
        // 000005: 61 => 06
        // 000006: union(07, 08)
        // 000007: 61 => 08
        // 000008: MATCH
        //
        // So that the epsilon closure of state 2 is now just 3 and 8.
        let empty = self.add_empty()?;
        let mut prev_end = prefix.end;
        // One optional copy per remaining repetition, each of which may
        // short-circuit straight to the shared 'empty' exit state.
        for _ in min..max {
            // Greedy repetitions prefer matching another copy; non-greedy
            // ones prefer stopping early.
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let compiled = self.c(expr)?;
            self.patch(prev_end, union)?;
            self.patch(union, compiled.start)?;
            self.patch(union, empty)?;
            prev_end = compiled.end;
        }
        // The final optional copy falls through to the shared exit.
        self.patch(prev_end, empty)?;
        Ok(ThompsonRef { start: prefix.start, end: empty })
    }
1220
1221 /// Compile the given expression such that it may be matched `n` or more
1222 /// times, where `n` can be any integer. (Although a particularly large
1223 /// integer is likely to run afoul of any configured size limits.)
1224 ///
1225 /// When `greedy` is true, then the preference is for the expression to
1226 /// match as much as possible. Otherwise, it will match as little as
1227 /// possible.
    fn c_at_least(
        &self,
        expr: &Hir,
        greedy: bool,
        n: u32,
    ) -> Result<ThompsonRef, BuildError> {
        if n == 0 {
            // When the expression cannot match the empty string, then we
            // can get away with something much simpler: just one 'alt'
            // instruction that optionally repeats itself. But if the expr
            // can match the empty string... see below.
            if expr.properties().minimum_len().map_or(false, |len| len > 0) {
                let union = if greedy {
                    self.add_union()
                } else {
                    self.add_union_reverse()
                }?;
                let compiled = self.c(expr)?;
                self.patch(union, compiled.start)?;
                self.patch(compiled.end, union)?;
                // The union serves as both entry and exit: it either enters
                // 'x' (again) or proceeds past the repetition.
                return Ok(ThompsonRef { start: union, end: union });
            }

            // What's going on here? Shouldn't x* be simpler than this? It
            // turns out that when implementing leftmost-first (Perl-like)
            // match semantics, x* results in an incorrect preference order
            // when computing the transitive closure of states if and only if
            // 'x' can match the empty string. So instead, we compile x* as
            // (x+)?, which preserves the correct preference order.
            //
            // See: https://github.com/rust-lang/regex/issues/779
            let compiled = self.c(expr)?;
            // 'plus' loops back into 'x', forming the 'x+' part.
            let plus = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, plus)?;
            self.patch(plus, compiled.start)?;

            // 'question' makes the whole 'x+' optional.
            let question = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            let empty = self.add_empty()?;
            self.patch(question, compiled.start)?;
            self.patch(question, empty)?;
            self.patch(plus, empty)?;
            Ok(ThompsonRef { start: question, end: empty })
        } else if n == 1 {
            // 'x+': compile 'x' once and loop back into it via a union.
            let compiled = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(compiled.end, union)?;
            self.patch(union, compiled.start)?;
            Ok(ThompsonRef { start: compiled.start, end: union })
        } else {
            // 'x{n,}': compile as 'x{n-1}' followed by 'x+'.
            let prefix = self.c_exactly(expr, n - 1)?;
            let last = self.c(expr)?;
            let union = if greedy {
                self.add_union()
            } else {
                self.add_union_reverse()
            }?;
            self.patch(prefix.end, last.start)?;
            self.patch(last.end, union)?;
            self.patch(union, last.start)?;
            Ok(ThompsonRef { start: prefix.start, end: union })
        }
    }
1302
1303 /// Compile the given expression such that it may be matched zero or one
1304 /// times.
1305 ///
1306 /// When `greedy` is true, then the preference is for the expression to
1307 /// match as much as possible. Otherwise, it will match as little as
1308 /// possible.
1309 fn c_zero_or_one(
1310 &self,
1311 expr: &Hir,
1312 greedy: bool,
1313 ) -> Result<ThompsonRef, BuildError> {
1314 let union =
1315 if greedy { self.add_union() } else { self.add_union_reverse() }?;
1316 let compiled = self.c(expr)?;
1317 let empty = self.add_empty()?;
1318 self.patch(union, compiled.start)?;
1319 self.patch(union, empty)?;
1320 self.patch(compiled.end, empty)?;
1321 Ok(ThompsonRef { start: union, end: empty })
1322 }
1323
1324 /// Compile the given HIR expression exactly `n` times.
1325 fn c_exactly(
1326 &self,
1327 expr: &Hir,
1328 n: u32,
1329 ) -> Result<ThompsonRef, BuildError> {
1330 let it = (0..n).map(|_| self.c(expr));
1331 self.c_concat(it)
1332 }
1333
1334 /// Compile the given byte oriented character class.
1335 ///
1336 /// This uses "sparse" states to represent an alternation between ranges in
1337 /// this character class. We can use "sparse" states instead of stitching
1338 /// together a "union" state because all ranges in a character class have
1339 /// equal priority *and* are non-overlapping (thus, only one can match, so
1340 /// there's never a question of priority in the first place). This saves a
1341 /// fair bit of overhead when traversing an NFA.
1342 ///
1343 /// This routine compiles an empty character class into a "fail" state.
1344 fn c_byte_class(
1345 &self,
1346 cls: &hir::ClassBytes,
1347 ) -> Result<ThompsonRef, BuildError> {
1348 let end = self.add_empty()?;
1349 let mut trans = Vec::with_capacity(cls.ranges().len());
1350 for r in cls.iter() {
1351 trans.push(Transition {
1352 start: r.start(),
1353 end: r.end(),
1354 next: end,
1355 });
1356 }
1357 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1358 }
1359
1360 /// Compile the given Unicode character class.
1361 ///
1362 /// This routine specifically tries to use various types of compression,
1363 /// since UTF-8 automata of large classes can get quite large. The specific
1364 /// type of compression used depends on forward vs reverse compilation, and
1365 /// whether NFA shrinking is enabled or not.
1366 ///
    /// Aside from repetitions causing lots of repeat groups, this is like
    /// the single most expensive part of regex compilation. Therefore, a
    /// large part of the expense of compilation may be reduced by disabling
    /// Unicode in the pattern.
1371 ///
1372 /// This routine compiles an empty character class into a "fail" state.
    fn c_unicode_class(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // If all we have are ASCII ranges wrapped in a Unicode package, then
        // there is zero reason to bring out the big guns. We can fit all ASCII
        // ranges within a single sparse state.
        if cls.is_ascii() {
            let end = self.add_empty()?;
            let mut trans = Vec::with_capacity(cls.ranges().len());
            for r in cls.iter() {
                // The unwraps below are OK because we've verified that this
                // class only contains ASCII codepoints.
                trans.push(Transition {
                    // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
                    start: u8::try_from(u32::from(r.start())).unwrap(),
                    end: u8::try_from(u32::from(r.end())).unwrap(),
                    next: end,
                });
            }
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
        } else if self.is_reverse() {
            if !self.config.get_shrink() {
                // When we don't want to spend the extra time shrinking, we
                // compile the UTF-8 automaton in reverse using something like
                // the "naive" approach, but will attempt to re-use common
                // suffixes.
                self.c_unicode_class_reverse_with_suffix(cls)
            } else {
                // When we want to shrink our NFA for reverse UTF-8 automata,
                // we cannot feed UTF-8 sequences directly to the UTF-8
                // compiler, since the UTF-8 compiler requires all sequences
                // to be lexicographically sorted. Instead, we organize our
                // sequences into a range trie, which can then output our
                // sequences in the correct order. Unfortunately, building the
                // range trie is fairly expensive (but not nearly as expensive
                // as building a DFA). Hence the reason why the 'shrink' option
                // exists, so that this path can be toggled off. For example,
                // we might want to turn this off if we know we won't be
                // compiling a DFA.
                let mut trie = self.trie_state.borrow_mut();
                trie.clear();

                // Reverse each UTF-8 sequence before inserting it so that
                // the trie emits them in the order the reverse automaton
                // needs.
                for rng in cls.iter() {
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
                        seq.reverse();
                        trie.insert(seq.as_slice());
                    }
                }
                let mut builder = self.builder.borrow_mut();
                let mut utf8_state = self.utf8_state.borrow_mut();
                let mut utf8c =
                    Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
                trie.iter(|seq| {
                    utf8c.add(&seq)?;
                    Ok(())
                })?;
                utf8c.finish()
            }
        } else {
            // In the forward direction, we always shrink our UTF-8 automata
            // because we can stream it right into the UTF-8 compiler. There
            // is almost no downside (in either memory or time) to using this
            // approach.
            let mut builder = self.builder.borrow_mut();
            let mut utf8_state = self.utf8_state.borrow_mut();
            let mut utf8c =
                Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
            for rng in cls.iter() {
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
                    utf8c.add(seq.as_slice())?;
                }
            }
            utf8c.finish()
        }

        // For reference, the code below is the "naive" version of compiling a
        // UTF-8 automaton. It is deliciously simple (and works for both the
        // forward and reverse cases), but will unfortunately produce very
        // large NFAs. When compiling a forward automaton, the size difference
        // can sometimes be an order of magnitude. For example, the '\w' regex
        // will generate about ~3000 NFA states using the naive approach below,
        // but only 283 states when using the approach above. This is because
        // the approach above actually compiles a *minimal* (or near minimal,
        // because of the bounded hashmap for reusing equivalent states) UTF-8
        // automaton.
        //
        // The code below is kept as a reference point in order to make it
        // easier to understand the higher level goal here. Although, it will
        // almost certainly bit-rot, so keep that in mind. Also, if you try to
        // use it, some of the tests in this module will fail because they look
        // for terser byte code produced by the more optimized handling above.
        // But the integration test suite should still pass.
        //
        // One good example of the substantial difference this can make is to
        // compare and contrast performance of the Pike VM when the code below
        // is active vs the code above. Here's an example to try:
        //
        // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
        //
        // With Unicode classes generated below, this search takes about 45s on
        // my machine. But with the compressed version above, the search takes
        // only around 1.4s. The NFA is also 20% smaller. This is in part due
        // to the compression, but also because of the utilization of 'sparse'
        // NFA states. They lead to much less state shuffling during the NFA
        // search.
        /*
        let it = cls
            .iter()
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
            .map(|seq| {
                let it = seq
                    .as_slice()
                    .iter()
                    .map(|rng| self.c_range(rng.start, rng.end));
                self.c_concat(it)
            });
        self.c_alt_iter(it)
        */
    }
1493
1494 /// Compile the given Unicode character class in reverse with suffix
1495 /// caching.
1496 ///
1497 /// This is a "quick" way to compile large Unicode classes into reverse
1498 /// UTF-8 automata while doing a small amount of compression on that
1499 /// automata by reusing common suffixes.
1500 ///
1501 /// A more comprehensive compression scheme can be accomplished by using
1502 /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
    /// ranges, and then use Daciuk's algorithm via `Utf8Compiler`.
1504 ///
1505 /// This is the technique used when "NFA shrinking" is disabled.
1506 ///
1507 /// (This also tries to use "sparse" states where possible, just like
1508 /// `c_byte_class` does.)
    fn c_unicode_class_reverse_with_suffix(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // N.B. It would likely be better to cache common *prefixes* in the
        // reverse direction, but it's not quite clear how to do that. The
        // advantage of caching suffixes is that it does give us a win, and
        // has a very small additional overhead.
        let mut cache = self.utf8_suffix.borrow_mut();
        cache.clear();

        let union = self.add_union()?;
        let alt_end = self.add_empty()?;
        for urng in cls.iter() {
            for seq in Utf8Sequences::new(urng.start(), urng.end()) {
                // Build this sequence's chain outward from the shared exit
                // state. Each new range state is linked so that it points at
                // the chain built so far, which means the automaton consumes
                // the sequence's bytes in reverse order (as a reverse
                // automaton requires) without an explicit seq.reverse().
                let mut end = alt_end;
                for brng in seq.as_slice() {
                    // The key includes the downstream state, so only ranges
                    // with identical suffixes (same trailing chain) reuse a
                    // previously compiled state.
                    let key = Utf8SuffixKey {
                        from: end,
                        start: brng.start,
                        end: brng.end,
                    };
                    let hash = cache.hash(&key);
                    if let Some(id) = cache.get(&key, hash) {
                        end = id;
                        continue;
                    }

                    let compiled = self.c_range(brng.start, brng.end)?;
                    self.patch(compiled.end, end)?;
                    end = compiled.start;
                    cache.set(key, hash, end);
                }
                // 'end' is now the entry state for this whole sequence.
                self.patch(union, end)?;
            }
        }
        Ok(ThompsonRef { start: union, end: alt_end })
    }
1547
1548 /// Compile the given HIR look-around assertion to an NFA look-around
1549 /// assertion.
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
        // A 1:1 translation from the HIR's look-around variants to this
        // crate's own 'Look' type.
        let look = match *anchor {
            hir::Look::Start => Look::Start,
            hir::Look::End => Look::End,
            hir::Look::StartLF => Look::StartLF,
            hir::Look::EndLF => Look::EndLF,
            hir::Look::StartCRLF => Look::StartCRLF,
            hir::Look::EndCRLF => Look::EndCRLF,
            hir::Look::WordAscii => Look::WordAscii,
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
            hir::Look::WordUnicode => Look::WordUnicode,
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            hir::Look::WordStartAscii => Look::WordStartAscii,
            hir::Look::WordEndAscii => Look::WordEndAscii,
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
        };
        // 'add_look' handles reverse mode by reversing the assertion itself.
        let id = self.add_look(look)?;
        Ok(ThompsonRef { start: id, end: id })
    }
1574
1575 /// Compile the given byte string to a concatenation of bytes.
1576 fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1577 self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1578 }
1579
1580 /// Compile a "range" state with one transition that may only be followed
1581 /// if the input byte is in the (inclusive) range given.
1582 ///
1583 /// Both the `start` and `end` locations point to the state created.
1584 /// Callers will likely want to keep the `start`, but patch the `end` to
1585 /// point to some other state.
1586 fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1587 let id = self.add_range(start, end)?;
1588 Ok(ThompsonRef { start: id, end: id })
1589 }
1590
1591 /// Compile an "empty" state with one unconditional epsilon transition.
1592 ///
1593 /// Both the `start` and `end` locations point to the state created.
1594 /// Callers will likely want to keep the `start`, but patch the `end` to
1595 /// point to some other state.
1596 fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1597 let id = self.add_empty()?;
1598 Ok(ThompsonRef { start: id, end: id })
1599 }
1600
1601 /// Compile a "fail" state that can never have any outgoing transitions.
1602 fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1603 let id = self.add_fail()?;
1604 Ok(ThompsonRef { start: id, end: id })
1605 }
1606
1607 // The below helpers are meant to be simple wrappers around the
1608 // corresponding Builder methods. For the most part, they let us write
1609 // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1610 // the latter is a mouthful. Some of the methods do inject a little bit
1611 // of extra logic. e.g., Flipping look-around operators when compiling in
1612 // reverse mode.
1613
1614 fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
1615 self.builder.borrow_mut().patch(from, to)
1616 }
1617
1618 fn start_pattern(&self) -> Result<PatternID, BuildError> {
1619 self.builder.borrow_mut().start_pattern()
1620 }
1621
1622 fn finish_pattern(
1623 &self,
1624 start_id: StateID,
1625 ) -> Result<PatternID, BuildError> {
1626 self.builder.borrow_mut().finish_pattern(start_id)
1627 }
1628
1629 fn add_empty(&self) -> Result<StateID, BuildError> {
1630 self.builder.borrow_mut().add_empty()
1631 }
1632
1633 fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1634 self.builder.borrow_mut().add_range(Transition {
1635 start,
1636 end,
1637 next: StateID::ZERO,
1638 })
1639 }
1640
1641 fn add_sparse(
1642 &self,
1643 ranges: Vec<Transition>,
1644 ) -> Result<StateID, BuildError> {
1645 self.builder.borrow_mut().add_sparse(ranges)
1646 }
1647
1648 fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1649 if self.is_reverse() {
1650 look = look.reversed();
1651 }
1652 self.builder.borrow_mut().add_look(StateID::ZERO, look)
1653 }
1654
1655 fn add_union(&self) -> Result<StateID, BuildError> {
1656 self.builder.borrow_mut().add_union(vec![])
1657 }
1658
1659 fn add_union_reverse(&self) -> Result<StateID, BuildError> {
1660 self.builder.borrow_mut().add_union_reverse(vec![])
1661 }
1662
1663 fn add_capture_start(
1664 &self,
1665 capture_index: u32,
1666 name: Option<&str>,
1667 ) -> Result<StateID, BuildError> {
1668 let name = name.map(|n| Arc::from(n));
1669 self.builder.borrow_mut().add_capture_start(
1670 StateID::ZERO,
1671 capture_index,
1672 name,
1673 )
1674 }
1675
1676 fn add_capture_end(
1677 &self,
1678 capture_index: u32,
1679 ) -> Result<StateID, BuildError> {
1680 self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1681 }
1682
1683 fn add_fail(&self) -> Result<StateID, BuildError> {
1684 self.builder.borrow_mut().add_fail()
1685 }
1686
1687 fn add_match(&self) -> Result<StateID, BuildError> {
1688 self.builder.borrow_mut().add_match()
1689 }
1690
1691 fn is_reverse(&self) -> bool {
1692 self.config.get_reverse()
1693 }
1694}
1695
1696/// A value that represents the result of compiling a sub-expression of a
1697/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
1698/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial state of this sub-graph.
    pub(crate) start: StateID,
    /// The final state of this sub-graph. Callers typically 'patch' this
    /// onto the start of the next sub-graph.
    pub(crate) end: StateID,
}
1704
/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
1706/// from a lexicographically sorted sequence of strings in linear time.
1707///
1708/// The trick here is that any Unicode codepoint range can be converted to
1709/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
1710/// together via an alternation is trivial, and indeed, it works. However,
1711/// there is a lot of redundant structure in many UTF-8 automatons. Since our
1712/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
1713/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
1714/// minimal because we use a bounded cache of previously build DFA states.)
1715///
1716/// The drawback is that this sadly doesn't work for reverse automata, since
1717/// the ranges are no longer in lexicographic order. For that, we invented the
1718/// range trie (which gets its own module). Once a range trie is built, we then
1719/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
1720///
1721/// The high level idea is described here:
1722/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
1723///
1724/// There is also another implementation of this in the `fst` crate.
1725#[derive(Debug)]
1726struct Utf8Compiler<'a> {
1727 builder: &'a mut Builder,
1728 state: &'a mut Utf8State,
1729 target: StateID,
1730}
1731
/// The reusable scratch state of a UTF-8 compiler: a bounded cache of
/// previously compiled states and a stack of nodes not yet compiled.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded map from sets of transitions to the state IDs they were
    /// compiled to, used to reuse states instead of adding duplicates.
    compiled: Utf8BoundedMap,
    /// A stack of nodes along the current UTF-8 sequence path that have not
    /// yet been frozen into NFA states.
    uncompiled: Vec<Utf8Node>,
}
1737
/// A single node on the uncompiled stack.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions out of this node whose targets are already known.
    trans: Vec<Transition>,
    /// The most recently added byte range, whose target state has not yet
    /// been determined. `None` once it has been frozen into `trans`.
    last: Option<Utf8LastTransition>,
}
1743
/// A byte range for a pending transition whose target state is not yet
/// known. It is turned into a full `Transition` by
/// `Utf8Node::set_last_transition`.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    /// The inclusive start of the byte range.
    start: u8,
    /// The inclusive end of the byte range.
    end: u8,
}
1749
1750impl Utf8State {
1751 fn new() -> Utf8State {
1752 Utf8State { compiled: Utf8BoundedMap::new(capacity:10_000), uncompiled: vec![] }
1753 }
1754
1755 fn clear(&mut self) {
1756 self.compiled.clear();
1757 self.uncompiled.clear();
1758 }
1759}
1760
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that writes its states into the given
    /// builder. A single `target` (final) state is added up front, and the
    /// scratch state is cleared and seeded with an empty root node.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Finish compilation: flush all remaining uncompiled nodes, compile
    /// the root node and return the resulting sub-graph's start and end
    /// states.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add a single UTF-8 byte-range sequence to the automaton under
    /// construction. The sequence may share a prefix with the previously
    /// added sequence; only the divergent suffix of the previous path is
    /// compiled before the new suffix is pushed.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading ranges are identical to the pending
        // ("last") transitions of the nodes already on the stack.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        // A full-sequence match would mean a duplicate sequence was added.
        assert!(prefix_len < ranges.len());
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile (freeze) every uncompiled node deeper than `from`, working
    /// from the deepest node upward. Each frozen node's pending transition
    /// points at the state compiled from the node below it, bottoming out
    /// at the overall `target` state.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Compile a set of transitions into a sparse NFA state, reusing a
    /// previously compiled identical state from the bounded cache when
    /// possible.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given (non-empty) range suffix onto the uncompiled stack:
    /// the first range becomes the pending transition of the current top
    /// node, and each subsequent range becomes a new node of its own.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push an empty node (no transitions, no pending transition) onto the
    /// uncompiled stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top uncompiled node, freeze its pending transition to point
    /// at `next`, and return its full transition set for compilation.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node, which must be the only node remaining and must
    /// have no pending transition, and return its transitions.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Freeze the pending transition of the top uncompiled node so that it
    /// points at `next`, without popping the node.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1866
1867impl Utf8Node {
1868 fn set_last_transition(&mut self, next: StateID) {
1869 if let Some(last: Utf8LastTransition) = self.last.take() {
1870 self.trans.push(Transition {
1871 start: last.start,
1872 end: last.end,
1873 next,
1874 });
1875 }
1876 }
1877}
1878
#[cfg(test)]
mod tests {
    use alloc::vec;

    use crate::{
        nfa::thompson::{SparseTransitions, State},
        util::primitives::SmallIndex,
    };

    use super::*;

    // Build an NFA from a pattern with captures and the unanchored prefix
    // disabled, so tests can assert the exact state list.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    // Shorthand for constructing a PatternID from a usize.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    // Shorthand for constructing a StateID from a usize.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    // A state matching exactly one byte, transitioning to `next`.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    // A state matching an inclusive byte range, transitioning to `next`.
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    // A sparse state built from `(start, end, next)` triples.
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    // A look-around assertion state.
    fn s_look(look: Look, next: usize) -> State {
        let next = sid(next);
        State::Look { look, next }
    }

    // A binary union (2-way unconditional epsilon split) state.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    // An n-way union state.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    // A capture state for the given pattern/group/slot.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    // A fail state (matches nothing).
    fn s_fail() -> State {
        State::Fail
    }

    // A match state for the given pattern ID.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_no_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)]
        );
    }

    #[test]
    fn compile_yes_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_look(Look::End, 4),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_yes_reverse_unanchored_prefix_with_start_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"^a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                // Anchors get flipped in a reverse automaton.
                s_look(Look::End, 4),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_no_reverse_unanchored_prefix_with_end_anchor() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .reverse(true)
                    .which_captures(WhichCaptures::None),
            )
            .build(r"a$")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                // Anchors get flipped in a reverse automaton.
                s_look(Look::Start, 1),
                s_byte(b'a', 2),
                s_match(0),
            ],
        );
    }

    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        assert_eq!(
            build("ā˜ƒ").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        assert_eq!(
            build(r"[a-zā˜ƒ]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}
2347