1use core::{borrow::Borrow, cell::RefCell};
2
3use alloc::{sync::Arc, vec, vec::Vec};
4
5use regex_syntax::{
6 hir::{self, Hir},
7 utf8::{Utf8Range, Utf8Sequences},
8 ParserBuilder,
9};
10
11use crate::{
12 nfa::thompson::{
13 builder::Builder,
14 error::BuildError,
15 literal_trie::LiteralTrie,
16 map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
17 nfa::{Transition, NFA},
18 range_trie::RangeTrie,
19 },
20 util::{
21 look::{Look, LookMatcher},
22 primitives::{PatternID, StateID},
23 },
24};
25
26/// The configuration used for a Thompson NFA compiler.
/// The configuration used for a Thompson NFA compiler.
///
/// Every option is `None` until explicitly set; the `get_*` methods fall
/// back to the documented defaults. Keeping "unset" distinct from "set to
/// the default" is what lets `overwrite` merge two configurations.
#[derive(Clone, Debug, Default)]
pub struct Config {
    /// UTF-8 mode for searches. Default: true.
    utf8: Option<bool>,
    /// Whether to compile the NFA in reverse. Default: false.
    reverse: Option<bool>,
    /// Approximate heap limit for the compiled NFA, in bytes. The outer
    /// `Option` tracks whether the option was set at all; the inner one is
    /// the limit itself. Default: no limit.
    nfa_size_limit: Option<Option<usize>>,
    /// Whether to spend extra effort shrinking the NFA. Default: false.
    shrink: Option<bool>,
    /// Which kinds of capture states to compile in. Default: all.
    which_captures: Option<WhichCaptures>,
    /// The matcher used for look-around assertions.
    look_matcher: Option<LookMatcher>,
    /// Test-only switch controlling the unanchored prefix. Default: true.
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}
38
39impl Config {
    /// Return a new default Thompson NFA compiler configuration.
    ///
    /// All options start out unset and therefore resolve to their
    /// documented defaults.
    pub fn new() -> Config {
        Config::default()
    }
44
45 /// Whether to enable UTF-8 mode during search or not.
46 ///
47 /// A regex engine is said to be in UTF-8 mode when it guarantees that
48 /// all matches returned by it have spans consisting of only valid UTF-8.
49 /// That is, it is impossible for a match span to be returned that
50 /// contains any invalid UTF-8.
51 ///
52 /// UTF-8 mode generally consists of two things:
53 ///
54 /// 1. Whether the NFA's states are constructed such that all paths to a
55 /// match state that consume at least one byte always correspond to valid
56 /// UTF-8.
57 /// 2. Whether all paths to a match state that do _not_ consume any bytes
58 /// should always correspond to valid UTF-8 boundaries.
59 ///
60 /// (1) is a guarantee made by whoever constructs the NFA.
61 /// If you're parsing a regex from its concrete syntax, then
62 /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make
63 /// this guarantee for you. It does it by returning an error if the regex
64 /// pattern could every report a non-empty match span that contains invalid
65 /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex
66 /// successfully parses, then you're guaranteed that the corresponding NFA
67 /// will only ever report non-empty match spans containing valid UTF-8.
68 ///
69 /// (2) is a trickier guarantee because it cannot be enforced by the NFA
70 /// state graph itself. Consider, for example, the regex `a*`. It matches
71 /// the empty strings in `ā˜ƒ` at positions `0`, `1`, `2` and `3`, where
72 /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint,
73 /// and thus correspond to invalid UTF-8 boundaries. Therefore, this
74 /// guarantee must be made at a higher level than the NFA state graph
75 /// itself. This crate deals with this case in each regex engine. Namely,
76 /// when a zero-width match that splits a codepoint is found and UTF-8
77 /// mode enabled, then it is ignored and the engine moves on looking for
78 /// the next match.
79 ///
80 /// Thus, UTF-8 mode is both a promise that the NFA built only reports
81 /// non-empty matches that are valid UTF-8, and an *instruction* to regex
82 /// engines that empty matches that split codepoints should be banned.
83 ///
84 /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans,
85 /// it only makes sense to enable this option when you *know* your haystack
86 /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and
87 /// searching a haystack that contains invalid UTF-8 leads to **unspecified
88 /// behavior**.
89 ///
90 /// Therefore, it may make sense to enable `syntax::Config::utf8` while
91 /// simultaneously *disabling* this option. That would ensure all non-empty
92 /// match spans are valid UTF-8, but that empty match spans may still split
93 /// a codepoint or match at other places that aren't valid UTF-8.
94 ///
95 /// In general, this mode is only relevant if your regex can match the
96 /// empty string. Most regexes don't.
97 ///
98 /// This is enabled by default.
99 ///
100 /// # Example
101 ///
102 /// This example shows how UTF-8 mode can impact the match spans that may
103 /// be reported in certain cases.
104 ///
105 /// ```
106 /// use regex_automata::{
107 /// nfa::thompson::{self, pikevm::PikeVM},
108 /// Match, Input,
109 /// };
110 ///
111 /// let re = PikeVM::new("")?;
112 /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
113 ///
114 /// // UTF-8 mode is enabled by default.
115 /// let mut input = Input::new("ā˜ƒ");
116 /// re.search(&mut cache, &input, &mut caps);
117 /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
118 ///
119 /// // Even though an empty regex matches at 1..1, our next match is
120 /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
121 /// // three bytes long).
122 /// input.set_start(1);
123 /// re.search(&mut cache, &input, &mut caps);
124 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
125 ///
126 /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
127 /// let re = PikeVM::builder()
128 /// .thompson(thompson::Config::new().utf8(false))
129 /// .build("")?;
130 /// re.search(&mut cache, &input, &mut caps);
131 /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
132 ///
133 /// input.set_start(2);
134 /// re.search(&mut cache, &input, &mut caps);
135 /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
136 ///
137 /// input.set_start(3);
138 /// re.search(&mut cache, &input, &mut caps);
139 /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
140 ///
141 /// input.set_start(4);
142 /// re.search(&mut cache, &input, &mut caps);
143 /// assert_eq!(None, caps.get_match());
144 ///
145 /// # Ok::<(), Box<dyn std::error::Error>>(())
146 /// ```
147 pub fn utf8(mut self, yes: bool) -> Config {
148 self.utf8 = Some(yes);
149 self
150 }
151
152 /// Reverse the NFA.
153 ///
154 /// A NFA reversal is performed by reversing all of the concatenated
155 /// sub-expressions in the original pattern, recursively. (Look around
156 /// operators are also inverted.) The resulting NFA can be used to match
157 /// the pattern starting from the end of a string instead of the beginning
158 /// of a string.
159 ///
160 /// Reversing the NFA is useful for building a reverse DFA, which is most
161 /// useful for finding the start of a match after its ending position has
162 /// been found. NFA execution engines typically do not work on reverse
163 /// NFAs. For example, currently, the Pike VM reports the starting location
164 /// of matches without a reverse NFA.
165 ///
166 /// Currently, enabling this setting requires disabling the
167 /// [`captures`](Config::captures) setting. If both are enabled, then the
168 /// compiler will return an error. It is expected that this limitation will
169 /// be lifted in the future.
170 ///
171 /// This is disabled by default.
172 ///
173 /// # Example
174 ///
175 /// This example shows how to build a DFA from a reverse NFA, and then use
176 /// the DFA to search backwards.
177 ///
178 /// ```
179 /// use regex_automata::{
180 /// dfa::{self, Automaton},
181 /// nfa::thompson::{NFA, WhichCaptures},
182 /// HalfMatch, Input,
183 /// };
184 ///
185 /// let dfa = dfa::dense::Builder::new()
186 /// .thompson(NFA::config()
187 /// .which_captures(WhichCaptures::None)
188 /// .reverse(true)
189 /// )
190 /// .build("baz[0-9]+")?;
191 /// let expected = Some(HalfMatch::must(0, 3));
192 /// assert_eq!(
193 /// expected,
194 /// dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
195 /// );
196 ///
197 /// # Ok::<(), Box<dyn std::error::Error>>(())
198 /// ```
199 pub fn reverse(mut self, yes: bool) -> Config {
200 self.reverse = Some(yes);
201 self
202 }
203
204 /// Sets an approximate size limit on the total heap used by the NFA being
205 /// compiled.
206 ///
207 /// This permits imposing constraints on the size of a compiled NFA. This
208 /// may be useful in contexts where the regex pattern is untrusted and one
209 /// wants to avoid using too much memory.
210 ///
211 /// This size limit does not apply to auxiliary heap used during
212 /// compilation that is not part of the built NFA.
213 ///
214 /// Note that this size limit is applied during compilation in order for
215 /// the limit to prevent too much heap from being used. However, the
216 /// implementation may use an intermediate NFA representation that is
217 /// otherwise slightly bigger than the final public form. Since the size
218 /// limit may be applied to an intermediate representation, there is not
219 /// necessarily a precise correspondence between the configured size limit
220 /// and the heap usage of the final NFA.
221 ///
222 /// There is no size limit by default.
223 ///
224 /// # Example
225 ///
226 /// This example demonstrates how Unicode mode can greatly increase the
227 /// size of the NFA.
228 ///
229 /// ```
230 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
231 /// use regex_automata::nfa::thompson::NFA;
232 ///
233 /// // 300KB isn't enough!
234 /// NFA::compiler()
235 /// .configure(NFA::config().nfa_size_limit(Some(300_000)))
236 /// .build(r"\w{20}")
237 /// .unwrap_err();
238 ///
239 /// // ... but 400KB probably is.
240 /// let nfa = NFA::compiler()
241 /// .configure(NFA::config().nfa_size_limit(Some(400_000)))
242 /// .build(r"\w{20}")?;
243 ///
244 /// assert_eq!(nfa.pattern_len(), 1);
245 ///
246 /// # Ok::<(), Box<dyn std::error::Error>>(())
247 /// ```
248 pub fn nfa_size_limit(mut self, bytes: Option<usize>) -> Config {
249 self.nfa_size_limit = Some(bytes);
250 self
251 }
252
253 /// Apply best effort heuristics to shrink the NFA at the expense of more
254 /// time/memory.
255 ///
256 /// Generally speaking, if one is using an NFA to compile a DFA, then the
257 /// extra time used to shrink the NFA will be more than made up for during
258 /// DFA construction (potentially by a lot). In other words, enabling this
259 /// can substantially decrease the overall amount of time it takes to build
260 /// a DFA.
261 ///
262 /// A reason to keep this disabled is if you want to compile an NFA and
263 /// start using it as quickly as possible without needing to build a DFA,
264 /// and you don't mind using a bit of extra memory for the NFA. e.g., for
265 /// an NFA simulation or for a lazy DFA.
266 ///
267 /// NFA shrinking is currently most useful when compiling a reverse
268 /// NFA with large Unicode character classes. In particular, it trades
269 /// additional CPU time during NFA compilation in favor of generating fewer
270 /// NFA states.
271 ///
272 /// This is disabled by default because it can increase compile times
273 /// quite a bit if you aren't building a full DFA.
274 ///
275 /// # Example
276 ///
277 /// This example shows that NFA shrinking can lead to substantial space
278 /// savings in some cases. Notice that, as noted above, we build a reverse
279 /// DFA and use a pattern with a large Unicode character class.
280 ///
281 /// ```
282 /// # if cfg!(miri) { return Ok(()); } // miri takes too long
283 /// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
284 ///
285 /// // Currently we have to disable captures when enabling reverse NFA.
286 /// let config = NFA::config()
287 /// .which_captures(WhichCaptures::None)
288 /// .reverse(true);
289 /// let not_shrunk = NFA::compiler()
290 /// .configure(config.clone().shrink(false))
291 /// .build(r"\w")?;
292 /// let shrunk = NFA::compiler()
293 /// .configure(config.clone().shrink(true))
294 /// .build(r"\w")?;
295 ///
296 /// // While a specific shrink factor is not guaranteed, the savings can be
297 /// // considerable in some cases.
298 /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len());
299 ///
300 /// # Ok::<(), Box<dyn std::error::Error>>(())
301 /// ```
302 pub fn shrink(mut self, yes: bool) -> Config {
303 self.shrink = Some(yes);
304 self
305 }
306
307 /// Whether to include 'Capture' states in the NFA.
308 ///
309 /// Currently, enabling this setting requires disabling the
310 /// [`reverse`](Config::reverse) setting. If both are enabled, then the
311 /// compiler will return an error. It is expected that this limitation will
312 /// be lifted in the future.
313 ///
314 /// This is enabled by default.
315 ///
316 /// # Example
317 ///
318 /// This example demonstrates that some regex engines, like the Pike VM,
319 /// require capturing states to be present in the NFA to report match
320 /// offsets.
321 ///
322 /// (Note that since this method is deprecated, the example below uses
323 /// [`Config::which_captures`] to disable capture states.)
324 ///
325 /// ```
326 /// use regex_automata::nfa::thompson::{
327 /// pikevm::PikeVM,
328 /// NFA,
329 /// WhichCaptures,
330 /// };
331 ///
332 /// let re = PikeVM::builder()
333 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
334 /// .build(r"[a-z]+")?;
335 /// let mut cache = re.create_cache();
336 ///
337 /// assert!(re.is_match(&mut cache, "abc"));
338 /// assert_eq!(None, re.find(&mut cache, "abc"));
339 ///
340 /// # Ok::<(), Box<dyn std::error::Error>>(())
341 /// ```
342 #[deprecated(since = "0.3.5", note = "use which_captures instead")]
343 pub fn captures(self, yes: bool) -> Config {
344 self.which_captures(if yes {
345 WhichCaptures::All
346 } else {
347 WhichCaptures::None
348 })
349 }
350
351 /// Configures what kinds of capture groups are compiled into
352 /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
353 /// Thompson NFA.
354 ///
355 /// Currently, using any option except for [`WhichCaptures::None`] requires
356 /// disabling the [`reverse`](Config::reverse) setting. If both are
357 /// enabled, then the compiler will return an error. It is expected that
358 /// this limitation will be lifted in the future.
359 ///
360 /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
361 /// use [`WhichCaptures::Implicit`] in cases where one wants avoid the
362 /// overhead of capture states for explicit groups. Usually this occurs
363 /// when one wants to use the `PikeVM` only for determining the overall
364 /// match. Otherwise, the `PikeVM` could use much more memory than is
365 /// necessary.
366 ///
367 /// # Example
368 ///
369 /// This example demonstrates that some regex engines, like the Pike VM,
370 /// require capturing states to be present in the NFA to report match
371 /// offsets.
372 ///
373 /// ```
374 /// use regex_automata::nfa::thompson::{
375 /// pikevm::PikeVM,
376 /// NFA,
377 /// WhichCaptures,
378 /// };
379 ///
380 /// let re = PikeVM::builder()
381 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
382 /// .build(r"[a-z]+")?;
383 /// let mut cache = re.create_cache();
384 ///
385 /// assert!(re.is_match(&mut cache, "abc"));
386 /// assert_eq!(None, re.find(&mut cache, "abc"));
387 ///
388 /// # Ok::<(), Box<dyn std::error::Error>>(())
389 /// ```
390 ///
391 /// The same applies to the bounded backtracker:
392 ///
393 /// ```
394 /// use regex_automata::nfa::thompson::{
395 /// backtrack::BoundedBacktracker,
396 /// NFA,
397 /// WhichCaptures,
398 /// };
399 ///
400 /// let re = BoundedBacktracker::builder()
401 /// .thompson(NFA::config().which_captures(WhichCaptures::None))
402 /// .build(r"[a-z]+")?;
403 /// let mut cache = re.create_cache();
404 ///
405 /// assert!(re.try_is_match(&mut cache, "abc")?);
406 /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
407 ///
408 /// # Ok::<(), Box<dyn std::error::Error>>(())
409 /// ```
410 pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
411 self.which_captures = Some(which_captures);
412 self
413 }
414
415 /// Sets the look-around matcher that should be used with this NFA.
416 ///
417 /// A look-around matcher determines how to match look-around assertions.
418 /// In particular, some assertions are configurable. For example, the
419 /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
420 /// from the default of `\n` to any other byte.
421 ///
422 /// # Example
423 ///
424 /// This shows how to change the line terminator for multi-line assertions.
425 ///
426 /// ```
427 /// use regex_automata::{
428 /// nfa::thompson::{self, pikevm::PikeVM},
429 /// util::look::LookMatcher,
430 /// Match, Input,
431 /// };
432 ///
433 /// let mut lookm = LookMatcher::new();
434 /// lookm.set_line_terminator(b'\x00');
435 ///
436 /// let re = PikeVM::builder()
437 /// .thompson(thompson::Config::new().look_matcher(lookm))
438 /// .build(r"(?m)^[a-z]+$")?;
439 /// let mut cache = re.create_cache();
440 ///
441 /// // Multi-line assertions now use NUL as a terminator.
442 /// assert_eq!(
443 /// Some(Match::must(0, 1..4)),
444 /// re.find(&mut cache, b"\x00abc\x00"),
445 /// );
446 /// // ... and \n is no longer recognized as a terminator.
447 /// assert_eq!(
448 /// None,
449 /// re.find(&mut cache, b"\nabc\n"),
450 /// );
451 ///
452 /// # Ok::<(), Box<dyn std::error::Error>>(())
453 /// ```
454 pub fn look_matcher(mut self, m: LookMatcher) -> Config {
455 self.look_matcher = Some(m);
456 self
457 }
458
459 /// Whether to compile an unanchored prefix into this NFA.
460 ///
461 /// This is enabled by default. It is made available for tests only to make
462 /// it easier to unit test the output of the compiler.
463 #[cfg(test)]
464 fn unanchored_prefix(mut self, yes: bool) -> Config {
465 self.unanchored_prefix = Some(yes);
466 self
467 }
468
469 /// Returns whether this configuration has enabled UTF-8 mode.
470 pub fn get_utf8(&self) -> bool {
471 self.utf8.unwrap_or(true)
472 }
473
474 /// Returns whether this configuration has enabled reverse NFA compilation.
475 pub fn get_reverse(&self) -> bool {
476 self.reverse.unwrap_or(false)
477 }
478
479 /// Return the configured NFA size limit, if it exists, in the number of
480 /// bytes of heap used.
481 pub fn get_nfa_size_limit(&self) -> Option<usize> {
482 self.nfa_size_limit.unwrap_or(None)
483 }
484
485 /// Return whether NFA shrinking is enabled.
486 pub fn get_shrink(&self) -> bool {
487 self.shrink.unwrap_or(false)
488 }
489
490 /// Return whether NFA compilation is configured to produce capture states.
491 #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
492 pub fn get_captures(&self) -> bool {
493 self.get_which_captures().is_any()
494 }
495
496 /// Return what kinds of capture states will be compiled into an NFA.
497 pub fn get_which_captures(&self) -> WhichCaptures {
498 self.which_captures.unwrap_or(WhichCaptures::All)
499 }
500
501 /// Return the look-around matcher for this NFA.
502 pub fn get_look_matcher(&self) -> LookMatcher {
503 self.look_matcher.clone().unwrap_or(LookMatcher::default())
504 }
505
    /// Return whether NFA compilation is configured to include an
    /// unanchored prefix.
    ///
    /// Outside of test builds this always returns true, because the
    /// `unanchored_prefix` knob only exists under `#[cfg(test)]`. (In test
    /// builds it defaults to true as well unless explicitly disabled.)
    fn get_unanchored_prefix(&self) -> bool {
        #[cfg(test)]
        {
            self.unanchored_prefix.unwrap_or(true)
        }
        #[cfg(not(test))]
        {
            true
        }
    }
520
521 /// Overwrite the default configuration such that the options in `o` are
522 /// always used. If an option in `o` is not set, then the corresponding
523 /// option in `self` is used. If it's not set in `self` either, then it
524 /// remains not set.
525 pub(crate) fn overwrite(&self, o: Config) -> Config {
526 Config {
527 utf8: o.utf8.or(self.utf8),
528 reverse: o.reverse.or(self.reverse),
529 nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
530 shrink: o.shrink.or(self.shrink),
531 which_captures: o.which_captures.or(self.which_captures),
532 look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()),
533 #[cfg(test)]
534 unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix),
535 }
536 }
537}
538
/// A configuration indicating which kinds of
/// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include.
///
/// This configuration can be used with [`Config::which_captures`] to control
/// which capture states are compiled into a Thompson NFA.
///
/// The default configuration is [`WhichCaptures::All`].
#[derive(Clone, Copy, Debug)]
pub enum WhichCaptures {
    /// All capture states, including those corresponding to both implicit and
    /// explicit capture groups, are included in the Thompson NFA.
    All,
    /// Only capture states corresponding to implicit capture groups are
    /// included. Implicit capture groups appear in every pattern implicitly
    /// and correspond to the overall match of a pattern.
    ///
    /// This is useful when one only cares about the overall match of a
    /// pattern. By excluding capture states from explicit capture groups,
    /// one might be able to reduce the memory usage of a multi-pattern regex
    /// substantially if it was otherwise written to have many explicit capture
    /// groups.
    Implicit,
    /// No capture states are compiled into the Thompson NFA.
    ///
    /// This is useful when capture states are either not needed (for example,
    /// if one is only trying to build a DFA) or if they aren't supported (for
    /// example, a reverse NFA).
    None,
}
568
impl Default for WhichCaptures {
    /// The default is to compile all capture states, matching the fallback
    /// used by [`Config::get_which_captures`].
    fn default() -> WhichCaptures {
        WhichCaptures::All
    }
}
574
575impl WhichCaptures {
576 /// Returns true if this configuration indicates that no capture states
577 /// should be produced in an NFA.
578 pub fn is_none(&self) -> bool {
579 matches!(*self, WhichCaptures::None)
580 }
581
582 /// Returns true if this configuration indicates that some capture states
583 /// should be added to an NFA. Note that this might only include capture
584 /// states for implicit capture groups.
585 pub fn is_any(&self) -> bool {
586 !self.is_none()
587 }
588}
589
590/*
591This compiler below uses Thompson's construction algorithm. The compiler takes
592a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph
593is structured in a way that permits it to be executed by a virtual machine and
594also used to efficiently build a DFA.
595
The compiler deals with a slightly expanded set of NFA states compared to
what is in a final NFA (as exhibited by builder::State and nfa::State).
Notably, a
598compiler state includes an empty node that has exactly one unconditional
599epsilon transition to the next state. In other words, it's a "goto" instruction
600if one views Thompson's NFA as a set of bytecode instructions. These goto
601instructions are removed in a subsequent phase before returning the NFA to the
602caller. The purpose of these empty nodes is that they make the construction
603algorithm substantially simpler to implement. We remove them before returning
604to the caller because they can represent substantial overhead when traversing
605the NFA graph (either while searching using the NFA directly or while building
606a DFA).
607
608In the future, it would be nice to provide a Glushkov compiler as well, as it
609would work well as a bit-parallel NFA for smaller regexes. But the Thompson
610construction is one I'm more familiar with and seems more straight-forward to
611deal with when it comes to large Unicode character classes.
612
613Internally, the compiler uses interior mutability to improve composition in the
614face of the borrow checker. In particular, we'd really like to be able to write
615things like this:
616
617 self.c_concat(exprs.iter().map(|e| self.c(e)))
618
619Which elegantly uses iterators to build up a sequence of compiled regex
620sub-expressions and then hands it off to the concatenating compiler routine.
621Without interior mutability, the borrow checker won't let us borrow `self`
622mutably both inside and outside the closure at the same time.
623*/
624
625/// A builder for compiling an NFA from a regex's high-level intermediate
626/// representation (HIR).
627///
628/// This compiler provides a way to translate a parsed regex pattern into an
629/// NFA state graph. The NFA state graph can either be used directly to execute
630/// a search (e.g., with a Pike VM), or it can be further used to build a DFA.
631///
/// This compiler provides APIs for compiling regex patterns either directly
/// from their concrete syntax, or via a [`regex_syntax::hir::Hir`].
634///
635/// This compiler has various options that may be configured via
636/// [`thompson::Config`](Config).
637///
638/// Note that a compiler is not the same as a [`thompson::Builder`](Builder).
639/// A `Builder` provides a lower level API that is uncoupled from a regex
640/// pattern's concrete syntax or even its HIR. Instead, it permits stitching
641/// together an NFA by hand. See its docs for examples.
642///
643/// # Example: compilation from concrete syntax
644///
645/// This shows how to compile an NFA from a pattern string while setting a size
646/// limit on how big the NFA is allowed to be (in terms of bytes of heap used).
647///
648/// ```
649/// use regex_automata::{
650/// nfa::thompson::{NFA, pikevm::PikeVM},
651/// Match,
652/// };
653///
654/// let config = NFA::config().nfa_size_limit(Some(1_000));
655/// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
656///
657/// let re = PikeVM::new_from_nfa(nfa)?;
658/// let mut cache = re.create_cache();
659/// let mut caps = re.create_captures();
660/// let expected = Some(Match::must(0, 3..4));
661/// re.captures(&mut cache, "!@#A#@!", &mut caps);
662/// assert_eq!(expected, caps.get_match());
663///
664/// # Ok::<(), Box<dyn std::error::Error>>(())
665/// ```
666///
667/// # Example: compilation from HIR
668///
669/// This shows how to hand assemble a regular expression via its HIR, and then
670/// compile an NFA directly from it.
671///
672/// ```
673/// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
674/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
675///
676/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
677/// ClassBytesRange::new(b'0', b'9'),
678/// ClassBytesRange::new(b'A', b'Z'),
679/// ClassBytesRange::new(b'_', b'_'),
680/// ClassBytesRange::new(b'a', b'z'),
681/// ])));
682///
683/// let config = NFA::config().nfa_size_limit(Some(1_000));
684/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
685///
686/// let re = PikeVM::new_from_nfa(nfa)?;
687/// let mut cache = re.create_cache();
688/// let mut caps = re.create_captures();
689/// let expected = Some(Match::must(0, 3..4));
690/// re.captures(&mut cache, "!@#A#@!", &mut caps);
691/// assert_eq!(expected, caps.get_match());
692///
693/// # Ok::<(), Box<dyn std::error::Error>>(())
694/// ```
#[derive(Clone, Debug)]
pub struct Compiler {
    /// A regex parser, used when compiling an NFA directly from a pattern
    /// string.
    parser: ParserBuilder,
    /// The compiler configuration.
    config: Config,
    /// The builder for actually constructing an NFA. This provides a
    /// convenient abstraction for writing a compiler.
    ///
    /// Wrapped in `RefCell` (interior mutability) so compilation routines
    /// can compose without fighting the borrow checker; see the module
    /// comment above for the rationale.
    builder: RefCell<Builder>,
    /// State used for compiling character classes to UTF-8 byte automata.
    /// State is not retained between character class compilations. This just
    /// serves to amortize allocation to the extent possible.
    utf8_state: RefCell<Utf8State>,
    /// State used for arranging character classes in reverse into a trie.
    trie_state: RefCell<RangeTrie>,
    /// State used for caching common suffixes when compiling reverse UTF-8
    /// automata (for Unicode character classes).
    utf8_suffix: RefCell<Utf8SuffixMap>,
}
715
716impl Compiler {
717 /// Create a new NFA builder with its default configuration.
718 pub fn new() -> Compiler {
719 Compiler {
720 parser: ParserBuilder::new(),
721 config: Config::default(),
722 builder: RefCell::new(Builder::new()),
723 utf8_state: RefCell::new(Utf8State::new()),
724 trie_state: RefCell::new(RangeTrie::new()),
725 utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)),
726 }
727 }
728
729 /// Compile the given regular expression pattern into an NFA.
730 ///
731 /// If there was a problem parsing the regex, then that error is returned.
732 ///
733 /// Otherwise, if there was a problem building the NFA, then an error is
734 /// returned. The only error that can occur is if the compiled regex would
735 /// exceed the size limits configured on this builder, or if any part of
736 /// the NFA would exceed the integer representations used. (For example,
737 /// too many states might plausibly occur on a 16-bit target.)
738 ///
739 /// # Example
740 ///
741 /// ```
742 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
743 ///
744 /// let config = NFA::config().nfa_size_limit(Some(1_000));
745 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
746 ///
747 /// let re = PikeVM::new_from_nfa(nfa)?;
748 /// let mut cache = re.create_cache();
749 /// let mut caps = re.create_captures();
750 /// let expected = Some(Match::must(0, 3..4));
751 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
752 /// assert_eq!(expected, caps.get_match());
753 ///
754 /// # Ok::<(), Box<dyn std::error::Error>>(())
755 /// ```
756 pub fn build(&self, pattern: &str) -> Result<NFA, BuildError> {
757 self.build_many(&[pattern])
758 }
759
760 /// Compile the given regular expression patterns into a single NFA.
761 ///
762 /// When matches are returned, the pattern ID corresponds to the index of
763 /// the pattern in the slice given.
764 ///
765 /// # Example
766 ///
767 /// ```
768 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
769 ///
770 /// let config = NFA::config().nfa_size_limit(Some(1_000));
771 /// let nfa = NFA::compiler().configure(config).build_many(&[
772 /// r"(?-u)\s",
773 /// r"(?-u)\w",
774 /// ])?;
775 ///
776 /// let re = PikeVM::new_from_nfa(nfa)?;
777 /// let mut cache = re.create_cache();
778 /// let mut caps = re.create_captures();
779 /// let expected = Some(Match::must(1, 1..2));
780 /// re.captures(&mut cache, "!A! !A!", &mut caps);
781 /// assert_eq!(expected, caps.get_match());
782 ///
783 /// # Ok::<(), Box<dyn std::error::Error>>(())
784 /// ```
785 pub fn build_many<P: AsRef<str>>(
786 &self,
787 patterns: &[P],
788 ) -> Result<NFA, BuildError> {
789 let mut hirs = vec![];
790 for p in patterns {
791 hirs.push(
792 self.parser
793 .build()
794 .parse(p.as_ref())
795 .map_err(BuildError::syntax)?,
796 );
797 debug!("parsed: {:?}", p.as_ref());
798 }
799 self.build_many_from_hir(&hirs)
800 }
801
802 /// Compile the given high level intermediate representation of a regular
803 /// expression into an NFA.
804 ///
805 /// If there was a problem building the NFA, then an error is returned. The
806 /// only error that can occur is if the compiled regex would exceed the
807 /// size limits configured on this builder, or if any part of the NFA would
808 /// exceed the integer representations used. (For example, too many states
809 /// might plausibly occur on a 16-bit target.)
810 ///
811 /// # Example
812 ///
813 /// ```
814 /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
815 /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
816 ///
817 /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
818 /// ClassBytesRange::new(b'0', b'9'),
819 /// ClassBytesRange::new(b'A', b'Z'),
820 /// ClassBytesRange::new(b'_', b'_'),
821 /// ClassBytesRange::new(b'a', b'z'),
822 /// ])));
823 ///
824 /// let config = NFA::config().nfa_size_limit(Some(1_000));
825 /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
826 ///
827 /// let re = PikeVM::new_from_nfa(nfa)?;
828 /// let mut cache = re.create_cache();
829 /// let mut caps = re.create_captures();
830 /// let expected = Some(Match::must(0, 3..4));
831 /// re.captures(&mut cache, "!@#A#@!", &mut caps);
832 /// assert_eq!(expected, caps.get_match());
833 ///
834 /// # Ok::<(), Box<dyn std::error::Error>>(())
835 /// ```
836 pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
837 self.build_many_from_hir(&[expr])
838 }
839
    /// Compile the given high level intermediate representations of regular
    /// expressions into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// Note that since the input here is already parsed, the syntax
    /// configuration set via [`Compiler::syntax`] has no effect on this
    /// routine.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hirs = &[
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'\t', b'\r'),
    ///         ClassBytesRange::new(b' ', b' '),
    ///     ]))),
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'0', b'9'),
    ///         ClassBytesRange::new(b'A', b'Z'),
    ///         ClassBytesRange::new(b'_', b'_'),
    ///         ClassBytesRange::new(b'a', b'z'),
    ///     ]))),
    /// ];
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many_from_hir<H: Borrow<Hir>>(
        &self,
        exprs: &[H],
    ) -> Result<NFA, BuildError> {
        // Pure delegation to the internal compiler entry point.
        self.compile(exprs)
    }
883
884 /// Apply the given NFA configuration options to this builder.
885 ///
886 /// # Example
887 ///
888 /// ```
889 /// use regex_automata::nfa::thompson::NFA;
890 ///
891 /// let config = NFA::config().nfa_size_limit(Some(1_000));
892 /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
893 /// assert_eq!(nfa.pattern_len(), 1);
894 ///
895 /// # Ok::<(), Box<dyn std::error::Error>>(())
896 /// ```
897 pub fn configure(&mut self, config: Config) -> &mut Compiler {
898 self.config = self.config.overwrite(config);
899 self
900 }
901
    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    ///
    /// This syntax configuration only applies when an NFA is built directly
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
    /// settings are ignored.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
    ///
    /// let syntax_config = syntax::Config::new().unicode(false);
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
    /// // If Unicode were enabled, the number of states would be much bigger.
    /// assert!(nfa.states().len() < 15);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Compiler {
        // Mutates the parser builder in place; subsequent 'build()' calls on
        // the parser (see 'build_many') pick up the new settings.
        config.apply(&mut self.parser);
        self
    }
931}
932
933impl Compiler {
    /// Compile the sequence of HIR expressions given. Pattern IDs are
    /// allocated starting from 0, in correspondence with the slice given.
    ///
    /// It is legal to provide an empty slice. In that case, the NFA returned
    /// has no patterns and will never match anything.
    fn compile<H: Borrow<Hir>>(&self, exprs: &[H]) -> Result<NFA, BuildError> {
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        // Capture states are rejected when compiling in reverse.
        if self.config.get_reverse()
            && self.config.get_which_captures().is_any()
        {
            return Err(BuildError::unsupported_captures());
        }

        // The builder is reused across compilations, so reset it and push
        // down the relevant bits of this compiler's configuration.
        self.builder.borrow_mut().clear();
        self.builder.borrow_mut().set_utf8(self.config.get_utf8());
        self.builder.borrow_mut().set_reverse(self.config.get_reverse());
        self.builder
            .borrow_mut()
            .set_look_matcher(self.config.get_look_matcher());
        self.builder
            .borrow_mut()
            .set_size_limit(self.config.get_nfa_size_limit())?;

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            e.borrow()
                .properties()
                .look_set_prefix()
                .contains(hir::Look::Start)
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // i.e., a non-greedy "match any byte, zero or more" prefix.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Compile each pattern: each gets its own implicit capture group
        // (index 0) and its own match state.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }
993
994 /// Compile an arbitrary HIR expression.
995 fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
996 use regex_syntax::hir::{Class, HirKind::*};
997
998 match *expr.kind() {
999 Empty => self.c_empty(),
1000 Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
1001 Class(Class::Bytes(ref c)) => self.c_byte_class(c),
1002 Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
1003 Look(ref look) => self.c_look(look),
1004 Repetition(ref rep) => self.c_repetition(rep),
1005 Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
1006 Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
1007 Alternation(ref es) => self.c_alt_slice(es),
1008 }
1009 }
1010
1011 /// Compile a concatenation of the sub-expressions yielded by the given
1012 /// iterator. If the iterator yields no elements, then this compiles down
1013 /// to an "empty" state that always matches.
1014 ///
1015 /// If the compiler is in reverse mode, then the expressions given are
1016 /// automatically compiled in reverse.
1017 fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1018 where
1019 I: DoubleEndedIterator<Item = Result<ThompsonRef, BuildError>>,
1020 {
1021 let first = if self.is_reverse() { it.next_back() } else { it.next() };
1022 let ThompsonRef { start, mut end } = match first {
1023 Some(result) => result?,
1024 None => return self.c_empty(),
1025 };
1026 loop {
1027 let next =
1028 if self.is_reverse() { it.next_back() } else { it.next() };
1029 let compiled = match next {
1030 Some(result) => result?,
1031 None => break,
1032 };
1033 self.patch(end, compiled.start)?;
1034 end = compiled.end;
1035 }
1036 Ok(ThompsonRef { start, end })
1037 }
1038
1039 /// Compile an alternation of the given HIR values.
1040 ///
1041 /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead
1042 /// of an iterator of compiled NFA subgraphs. The point of accepting a
1043 /// slice here is that it opens up some optimization opportunities. For
1044 /// example, if all of the HIR values are literals, then this routine might
1045 /// re-shuffle them to make NFA epsilon closures substantially faster.
1046 fn c_alt_slice(&self, exprs: &[Hir]) -> Result<ThompsonRef, BuildError> {
1047 // self.c_alt_iter(exprs.iter().map(|e| self.c(e)))
1048 let literal_count = exprs
1049 .iter()
1050 .filter(|e| {
1051 matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_)))
1052 })
1053 .count();
1054 if literal_count <= 1 || literal_count < exprs.len() {
1055 return self.c_alt_iter(exprs.iter().map(|e| self.c(e)));
1056 }
1057
1058 let mut trie = if self.is_reverse() {
1059 LiteralTrie::reverse()
1060 } else {
1061 LiteralTrie::forward()
1062 };
1063 for expr in exprs.iter() {
1064 let literal = match *expr.kind() {
1065 hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes,
1066 _ => unreachable!(),
1067 };
1068 trie.add(literal)?;
1069 }
1070 trie.compile(&mut self.builder.borrow_mut())
1071 }
1072
1073 /// Compile an alternation, where each element yielded by the given
1074 /// iterator represents an item in the alternation. If the iterator yields
1075 /// no elements, then this compiles down to a "fail" state.
1076 ///
1077 /// In an alternation, expressions appearing earlier are "preferred" at
1078 /// match time over expressions appearing later. At least, this is true
1079 /// when using "leftmost first" match semantics. (If "leftmost longest" are
1080 /// ever added in the future, then this preference order of priority would
1081 /// not apply in that mode.)
1082 fn c_alt_iter<I>(&self, mut it: I) -> Result<ThompsonRef, BuildError>
1083 where
1084 I: Iterator<Item = Result<ThompsonRef, BuildError>>,
1085 {
1086 let first = match it.next() {
1087 None => return self.c_fail(),
1088 Some(result) => result?,
1089 };
1090 let second = match it.next() {
1091 None => return Ok(first),
1092 Some(result) => result?,
1093 };
1094
1095 let union = self.add_union()?;
1096 let end = self.add_empty()?;
1097 self.patch(union, first.start)?;
1098 self.patch(first.end, end)?;
1099 self.patch(union, second.start)?;
1100 self.patch(second.end, end)?;
1101 for result in it {
1102 let compiled = result?;
1103 self.patch(union, compiled.start)?;
1104 self.patch(compiled.end, end)?;
1105 }
1106 Ok(ThompsonRef { start: union, end })
1107 }
1108
1109 /// Compile the given capture sub-expression. `expr` should be the
1110 /// sub-expression contained inside the capture. If "capture" states are
1111 /// enabled, then they are added as appropriate.
1112 ///
1113 /// This accepts the pieces of a capture instead of a `hir::Capture` so
1114 /// that it's easy to manufacture a "fake" group when necessary, e.g., for
1115 /// adding the entire pattern as if it were a group in order to create
1116 /// appropriate "capture" states in the NFA.
1117 fn c_cap(
1118 &self,
1119 index: u32,
1120 name: Option<&str>,
1121 expr: &Hir,
1122 ) -> Result<ThompsonRef, BuildError> {
1123 match self.config.get_which_captures() {
1124 // No capture states means we always skip them.
1125 WhichCaptures::None => return self.c(expr),
1126 // Implicit captures states means we only add when index==0 since
1127 // index==0 implies the group is implicit.
1128 WhichCaptures::Implicit if index > 0 => return self.c(expr),
1129 _ => {}
1130 }
1131
1132 let start = self.add_capture_start(index, name)?;
1133 let inner = self.c(expr)?;
1134 let end = self.add_capture_end(index)?;
1135 self.patch(start, inner.start)?;
1136 self.patch(inner.end, end)?;
1137 Ok(ThompsonRef { start, end })
1138 }
1139
1140 /// Compile the given repetition expression. This handles all types of
1141 /// repetitions and greediness.
1142 fn c_repetition(
1143 &self,
1144 rep: &hir::Repetition,
1145 ) -> Result<ThompsonRef, BuildError> {
1146 match (rep.min, rep.max) {
1147 (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy),
1148 (min, None) => self.c_at_least(&rep.sub, rep.greedy, min),
1149 (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min),
1150 (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max),
1151 }
1152 }
1153
1154 /// Compile the given expression such that it matches at least `min` times,
1155 /// but no more than `max` times.
1156 ///
1157 /// When `greedy` is true, then the preference is for the expression to
1158 /// match as much as possible. Otherwise, it will match as little as
1159 /// possible.
1160 fn c_bounded(
1161 &self,
1162 expr: &Hir,
1163 greedy: bool,
1164 min: u32,
1165 max: u32,
1166 ) -> Result<ThompsonRef, BuildError> {
1167 let prefix = self.c_exactly(expr, min)?;
1168 if min == max {
1169 return Ok(prefix);
1170 }
1171
1172 // It is tempting here to compile the rest here as a concatenation
1173 // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it
1174 // were `aaa?a?a?`. The problem here is that it leads to this program:
1175 //
1176 // >000000: 61 => 01
1177 // 000001: 61 => 02
1178 // 000002: union(03, 04)
1179 // 000003: 61 => 04
1180 // 000004: union(05, 06)
1181 // 000005: 61 => 06
1182 // 000006: union(07, 08)
1183 // 000007: 61 => 08
1184 // 000008: MATCH
1185 //
1186 // And effectively, once you hit state 2, the epsilon closure will
1187 // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better
1188 // to instead compile it like so:
1189 //
1190 // >000000: 61 => 01
1191 // 000001: 61 => 02
1192 // 000002: union(03, 08)
1193 // 000003: 61 => 04
1194 // 000004: union(05, 08)
1195 // 000005: 61 => 06
1196 // 000006: union(07, 08)
1197 // 000007: 61 => 08
1198 // 000008: MATCH
1199 //
1200 // So that the epsilon closure of state 2 is now just 3 and 8.
1201 let empty = self.add_empty()?;
1202 let mut prev_end = prefix.end;
1203 for _ in min..max {
1204 let union = if greedy {
1205 self.add_union()
1206 } else {
1207 self.add_union_reverse()
1208 }?;
1209 let compiled = self.c(expr)?;
1210 self.patch(prev_end, union)?;
1211 self.patch(union, compiled.start)?;
1212 self.patch(union, empty)?;
1213 prev_end = compiled.end;
1214 }
1215 self.patch(prev_end, empty)?;
1216 Ok(ThompsonRef { start: prefix.start, end: empty })
1217 }
1218
1219 /// Compile the given expression such that it may be matched `n` or more
1220 /// times, where `n` can be any integer. (Although a particularly large
1221 /// integer is likely to run afoul of any configured size limits.)
1222 ///
1223 /// When `greedy` is true, then the preference is for the expression to
1224 /// match as much as possible. Otherwise, it will match as little as
1225 /// possible.
1226 fn c_at_least(
1227 &self,
1228 expr: &Hir,
1229 greedy: bool,
1230 n: u32,
1231 ) -> Result<ThompsonRef, BuildError> {
1232 if n == 0 {
1233 // When the expression cannot match the empty string, then we
1234 // can get away with something much simpler: just one 'alt'
1235 // instruction that optionally repeats itself. But if the expr
1236 // can match the empty string... see below.
1237 if expr.properties().minimum_len().map_or(false, |len| len > 0) {
1238 let union = if greedy {
1239 self.add_union()
1240 } else {
1241 self.add_union_reverse()
1242 }?;
1243 let compiled = self.c(expr)?;
1244 self.patch(union, compiled.start)?;
1245 self.patch(compiled.end, union)?;
1246 return Ok(ThompsonRef { start: union, end: union });
1247 }
1248
1249 // What's going on here? Shouldn't x* be simpler than this? It
1250 // turns out that when implementing leftmost-first (Perl-like)
1251 // match semantics, x* results in an incorrect preference order
1252 // when computing the transitive closure of states if and only if
1253 // 'x' can match the empty string. So instead, we compile x* as
1254 // (x+)?, which preserves the correct preference order.
1255 //
1256 // See: https://github.com/rust-lang/regex/issues/779
1257 let compiled = self.c(expr)?;
1258 let plus = if greedy {
1259 self.add_union()
1260 } else {
1261 self.add_union_reverse()
1262 }?;
1263 self.patch(compiled.end, plus)?;
1264 self.patch(plus, compiled.start)?;
1265
1266 let question = if greedy {
1267 self.add_union()
1268 } else {
1269 self.add_union_reverse()
1270 }?;
1271 let empty = self.add_empty()?;
1272 self.patch(question, compiled.start)?;
1273 self.patch(question, empty)?;
1274 self.patch(plus, empty)?;
1275 Ok(ThompsonRef { start: question, end: empty })
1276 } else if n == 1 {
1277 let compiled = self.c(expr)?;
1278 let union = if greedy {
1279 self.add_union()
1280 } else {
1281 self.add_union_reverse()
1282 }?;
1283 self.patch(compiled.end, union)?;
1284 self.patch(union, compiled.start)?;
1285 Ok(ThompsonRef { start: compiled.start, end: union })
1286 } else {
1287 let prefix = self.c_exactly(expr, n - 1)?;
1288 let last = self.c(expr)?;
1289 let union = if greedy {
1290 self.add_union()
1291 } else {
1292 self.add_union_reverse()
1293 }?;
1294 self.patch(prefix.end, last.start)?;
1295 self.patch(last.end, union)?;
1296 self.patch(union, last.start)?;
1297 Ok(ThompsonRef { start: prefix.start, end: union })
1298 }
1299 }
1300
1301 /// Compile the given expression such that it may be matched zero or one
1302 /// times.
1303 ///
1304 /// When `greedy` is true, then the preference is for the expression to
1305 /// match as much as possible. Otherwise, it will match as little as
1306 /// possible.
1307 fn c_zero_or_one(
1308 &self,
1309 expr: &Hir,
1310 greedy: bool,
1311 ) -> Result<ThompsonRef, BuildError> {
1312 let union =
1313 if greedy { self.add_union() } else { self.add_union_reverse() }?;
1314 let compiled = self.c(expr)?;
1315 let empty = self.add_empty()?;
1316 self.patch(union, compiled.start)?;
1317 self.patch(union, empty)?;
1318 self.patch(compiled.end, empty)?;
1319 Ok(ThompsonRef { start: union, end: empty })
1320 }
1321
1322 /// Compile the given HIR expression exactly `n` times.
1323 fn c_exactly(
1324 &self,
1325 expr: &Hir,
1326 n: u32,
1327 ) -> Result<ThompsonRef, BuildError> {
1328 let it = (0..n).map(|_| self.c(expr));
1329 self.c_concat(it)
1330 }
1331
1332 /// Compile the given byte oriented character class.
1333 ///
1334 /// This uses "sparse" states to represent an alternation between ranges in
1335 /// this character class. We can use "sparse" states instead of stitching
1336 /// together a "union" state because all ranges in a character class have
1337 /// equal priority *and* are non-overlapping (thus, only one can match, so
1338 /// there's never a question of priority in the first place). This saves a
1339 /// fair bit of overhead when traversing an NFA.
1340 ///
1341 /// This routine compiles an empty character class into a "fail" state.
1342 fn c_byte_class(
1343 &self,
1344 cls: &hir::ClassBytes,
1345 ) -> Result<ThompsonRef, BuildError> {
1346 let end = self.add_empty()?;
1347 let mut trans = Vec::with_capacity(cls.ranges().len());
1348 for r in cls.iter() {
1349 trans.push(Transition {
1350 start: r.start(),
1351 end: r.end(),
1352 next: end,
1353 });
1354 }
1355 Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
1356 }
1357
    /// Compile the given Unicode character class.
    ///
    /// This routine specifically tries to use various types of compression,
    /// since UTF-8 automata of large classes can get quite large. The specific
    /// type of compression used depends on forward vs reverse compilation, and
    /// whether NFA shrinking is enabled or not.
    ///
    /// Aside from repetitions causing lots of repeat groups, this is likely
    /// the single most expensive part of regex compilation. Therefore, a large
    /// part of the expense of compilation may be reduced by disabling Unicode
    /// in the pattern.
    ///
    /// This routine compiles an empty character class into a "fail" state.
    fn c_unicode_class(
        &self,
        cls: &hir::ClassUnicode,
    ) -> Result<ThompsonRef, BuildError> {
        // If all we have are ASCII ranges wrapped in a Unicode package, then
        // there is zero reason to bring out the big guns. We can fit all ASCII
        // ranges within a single sparse state.
        if cls.is_ascii() {
            let end = self.add_empty()?;
            let mut trans = Vec::with_capacity(cls.ranges().len());
            for r in cls.iter() {
                // The unwraps below are OK because we've verified that this
                // class only contains ASCII codepoints.
                trans.push(Transition {
                    // FIXME(1.59): use the 'TryFrom<char> for u8' impl.
                    start: u8::try_from(u32::from(r.start())).unwrap(),
                    end: u8::try_from(u32::from(r.end())).unwrap(),
                    next: end,
                });
            }
            Ok(ThompsonRef { start: self.add_sparse(trans)?, end })
        } else if self.is_reverse() {
            if !self.config.get_shrink() {
                // When we don't want to spend the extra time shrinking, we
                // compile the UTF-8 automaton in reverse using something like
                // the "naive" approach, but will attempt to re-use common
                // suffixes.
                self.c_unicode_class_reverse_with_suffix(cls)
            } else {
                // When we want to shrink our NFA for reverse UTF-8 automata,
                // we cannot feed UTF-8 sequences directly to the UTF-8
                // compiler, since the UTF-8 compiler requires all sequences
                // to be lexicographically sorted. Instead, we organize our
                // sequences into a range trie, which can then output our
                // sequences in the correct order. Unfortunately, building the
                // range trie is fairly expensive (but not nearly as expensive
                // as building a DFA). Hence the reason why the 'shrink' option
                // exists, so that this path can be toggled off. For example,
                // we might want to turn this off if we know we won't be
                // compiling a DFA.
                let mut trie = self.trie_state.borrow_mut();
                trie.clear();

                // Reversing each UTF-8 sequence before inserting it is what
                // makes this compile a *reverse* automaton.
                for rng in cls.iter() {
                    for mut seq in Utf8Sequences::new(rng.start(), rng.end()) {
                        seq.reverse();
                        trie.insert(seq.as_slice());
                    }
                }
                let mut builder = self.builder.borrow_mut();
                let mut utf8_state = self.utf8_state.borrow_mut();
                let mut utf8c =
                    Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
                // The trie yields the reversed sequences in sorted order,
                // which is exactly what the UTF-8 compiler needs.
                trie.iter(|seq| {
                    utf8c.add(&seq)?;
                    Ok(())
                })?;
                utf8c.finish()
            }
        } else {
            // In the forward direction, we always shrink our UTF-8 automata
            // because we can stream it right into the UTF-8 compiler. There
            // is almost no downside (in either memory or time) to using this
            // approach.
            let mut builder = self.builder.borrow_mut();
            let mut utf8_state = self.utf8_state.borrow_mut();
            let mut utf8c =
                Utf8Compiler::new(&mut *builder, &mut *utf8_state)?;
            for rng in cls.iter() {
                for seq in Utf8Sequences::new(rng.start(), rng.end()) {
                    utf8c.add(seq.as_slice())?;
                }
            }
            utf8c.finish()
        }

        // For reference, the code below is the "naive" version of compiling a
        // UTF-8 automaton. It is deliciously simple (and works for both the
        // forward and reverse cases), but will unfortunately produce very
        // large NFAs. When compiling a forward automaton, the size difference
        // can sometimes be an order of magnitude. For example, the '\w' regex
        // will generate about ~3000 NFA states using the naive approach below,
        // but only 283 states when using the approach above. This is because
        // the approach above actually compiles a *minimal* (or near minimal,
        // because of the bounded hashmap for reusing equivalent states) UTF-8
        // automaton.
        //
        // The code below is kept as a reference point in order to make it
        // easier to understand the higher level goal here. Although, it will
        // almost certainly bit-rot, so keep that in mind. Also, if you try to
        // use it, some of the tests in this module will fail because they look
        // for terser byte code produced by the more optimized handling above.
        // But the integration test suite should still pass.
        //
        // One good example of the substantial difference this can make is to
        // compare and contrast performance of the Pike VM when the code below
        // is active vs the code above. Here's an example to try:
        //
        //   regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file
        //
        // With Unicode classes generated below, this search takes about 45s on
        // my machine. But with the compressed version above, the search takes
        // only around 1.4s. The NFA is also 20% smaller. This is in part due
        // to the compression, but also because of the utilization of 'sparse'
        // NFA states. They lead to much less state shuffling during the NFA
        // search.
        /*
        let it = cls
            .iter()
            .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end()))
            .map(|seq| {
                let it = seq
                    .as_slice()
                    .iter()
                    .map(|rng| self.c_range(rng.start, rng.end));
                self.c_concat(it)
            });
        self.c_alt_iter(it)
        */
    }
1491
1492 /// Compile the given Unicode character class in reverse with suffix
1493 /// caching.
1494 ///
1495 /// This is a "quick" way to compile large Unicode classes into reverse
1496 /// UTF-8 automata while doing a small amount of compression on that
1497 /// automata by reusing common suffixes.
1498 ///
1499 /// A more comprehensive compression scheme can be accomplished by using
1500 /// a range trie to efficiently sort a reverse sequence of UTF-8 byte
1501 /// rqanges, and then use Daciuk's algorithm via `Utf8Compiler`.
1502 ///
1503 /// This is the technique used when "NFA shrinking" is disabled.
1504 ///
1505 /// (This also tries to use "sparse" states where possible, just like
1506 /// `c_byte_class` does.)
1507 fn c_unicode_class_reverse_with_suffix(
1508 &self,
1509 cls: &hir::ClassUnicode,
1510 ) -> Result<ThompsonRef, BuildError> {
1511 // N.B. It would likely be better to cache common *prefixes* in the
1512 // reverse direction, but it's not quite clear how to do that. The
1513 // advantage of caching suffixes is that it does give us a win, and
1514 // has a very small additional overhead.
1515 let mut cache = self.utf8_suffix.borrow_mut();
1516 cache.clear();
1517
1518 let union = self.add_union()?;
1519 let alt_end = self.add_empty()?;
1520 for urng in cls.iter() {
1521 for seq in Utf8Sequences::new(urng.start(), urng.end()) {
1522 let mut end = alt_end;
1523 for brng in seq.as_slice() {
1524 let key = Utf8SuffixKey {
1525 from: end,
1526 start: brng.start,
1527 end: brng.end,
1528 };
1529 let hash = cache.hash(&key);
1530 if let Some(id) = cache.get(&key, hash) {
1531 end = id;
1532 continue;
1533 }
1534
1535 let compiled = self.c_range(brng.start, brng.end)?;
1536 self.patch(compiled.end, end)?;
1537 end = compiled.start;
1538 cache.set(key, hash, end);
1539 }
1540 self.patch(union, end)?;
1541 }
1542 }
1543 Ok(ThompsonRef { start: union, end: alt_end })
1544 }
1545
    /// Compile the given HIR look-around assertion to an NFA look-around
    /// assertion.
    ///
    /// Note that flipping the assertion when compiling in reverse mode
    /// (e.g., `Start` becomes `End`) is handled by `add_look`, not here.
    fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
        let look = match *anchor {
            hir::Look::Start => Look::Start,
            hir::Look::End => Look::End,
            hir::Look::StartLF => Look::StartLF,
            hir::Look::EndLF => Look::EndLF,
            hir::Look::StartCRLF => Look::StartCRLF,
            hir::Look::EndCRLF => Look::EndCRLF,
            hir::Look::WordAscii => Look::WordAscii,
            hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
            hir::Look::WordUnicode => Look::WordUnicode,
            hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            hir::Look::WordStartAscii => Look::WordStartAscii,
            hir::Look::WordEndAscii => Look::WordEndAscii,
            hir::Look::WordStartUnicode => Look::WordStartUnicode,
            hir::Look::WordEndUnicode => Look::WordEndUnicode,
            hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
            hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
            hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
            hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
        };
        let id = self.add_look(look)?;
        Ok(ThompsonRef { start: id, end: id })
    }
1572
1573 /// Compile the given byte string to a concatenation of bytes.
1574 fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
1575 self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
1576 }
1577
1578 /// Compile a "range" state with one transition that may only be followed
1579 /// if the input byte is in the (inclusive) range given.
1580 ///
1581 /// Both the `start` and `end` locations point to the state created.
1582 /// Callers will likely want to keep the `start`, but patch the `end` to
1583 /// point to some other state.
1584 fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
1585 let id = self.add_range(start, end)?;
1586 Ok(ThompsonRef { start: id, end: id })
1587 }
1588
1589 /// Compile an "empty" state with one unconditional epsilon transition.
1590 ///
1591 /// Both the `start` and `end` locations point to the state created.
1592 /// Callers will likely want to keep the `start`, but patch the `end` to
1593 /// point to some other state.
1594 fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
1595 let id = self.add_empty()?;
1596 Ok(ThompsonRef { start: id, end: id })
1597 }
1598
1599 /// Compile a "fail" state that can never have any outgoing transitions.
1600 fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
1601 let id = self.add_fail()?;
1602 Ok(ThompsonRef { start: id, end: id })
1603 }
1604
1605 // The below helpers are meant to be simple wrappers around the
1606 // corresponding Builder methods. For the most part, they let us write
1607 // 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
1608 // the latter is a mouthful. Some of the methods do inject a little bit
1609 // of extra logic. e.g., Flipping look-around operators when compiling in
1610 // reverse mode.
1611
1612 fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
1613 self.builder.borrow_mut().patch(from, to)
1614 }
1615
1616 fn start_pattern(&self) -> Result<PatternID, BuildError> {
1617 self.builder.borrow_mut().start_pattern()
1618 }
1619
1620 fn finish_pattern(
1621 &self,
1622 start_id: StateID,
1623 ) -> Result<PatternID, BuildError> {
1624 self.builder.borrow_mut().finish_pattern(start_id)
1625 }
1626
1627 fn add_empty(&self) -> Result<StateID, BuildError> {
1628 self.builder.borrow_mut().add_empty()
1629 }
1630
1631 fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
1632 self.builder.borrow_mut().add_range(Transition {
1633 start,
1634 end,
1635 next: StateID::ZERO,
1636 })
1637 }
1638
1639 fn add_sparse(
1640 &self,
1641 ranges: Vec<Transition>,
1642 ) -> Result<StateID, BuildError> {
1643 self.builder.borrow_mut().add_sparse(ranges)
1644 }
1645
1646 fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
1647 if self.is_reverse() {
1648 look = look.reversed();
1649 }
1650 self.builder.borrow_mut().add_look(StateID::ZERO, look)
1651 }
1652
1653 fn add_union(&self) -> Result<StateID, BuildError> {
1654 self.builder.borrow_mut().add_union(vec![])
1655 }
1656
1657 fn add_union_reverse(&self) -> Result<StateID, BuildError> {
1658 self.builder.borrow_mut().add_union_reverse(vec![])
1659 }
1660
1661 fn add_capture_start(
1662 &self,
1663 capture_index: u32,
1664 name: Option<&str>,
1665 ) -> Result<StateID, BuildError> {
1666 let name = name.map(|n| Arc::from(n));
1667 self.builder.borrow_mut().add_capture_start(
1668 StateID::ZERO,
1669 capture_index,
1670 name,
1671 )
1672 }
1673
1674 fn add_capture_end(
1675 &self,
1676 capture_index: u32,
1677 ) -> Result<StateID, BuildError> {
1678 self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
1679 }
1680
1681 fn add_fail(&self) -> Result<StateID, BuildError> {
1682 self.builder.borrow_mut().add_fail()
1683 }
1684
1685 fn add_match(&self) -> Result<StateID, BuildError> {
1686 self.builder.borrow_mut().add_match()
1687 }
1688
    /// Returns true when this compiler is configured to build a reverse NFA.
    fn is_reverse(&self) -> bool {
        self.config.get_reverse()
    }
1692}
1693
/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    /// The initial state of this sub-graph.
    pub(crate) start: StateID,
    /// The final state of this sub-graph. Callers typically 'patch' this to
    /// point at whatever comes next.
    pub(crate) end: StateID,
}
1702
/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are guaranteed to be
/// minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Reusable scratch space (a bounded cache of compiled states plus a
    /// stack of uncompiled nodes), cleared on construction.
    state: &'a mut Utf8State,
    /// An "empty" state created at construction time. (NOTE(review): this
    /// appears to be the shared final state of the compiled automaton —
    /// confirm against the rest of the `Utf8Compiler` impl.)
    target: StateID,
}
1729
/// Reusable scratch state for compiling one UTF-8 automaton at a time.
#[derive(Clone, Debug)]
struct Utf8State {
    /// A bounded cache of previously compiled states, keyed by their
    /// transitions. This is what enables structure sharing between distinct
    /// UTF-8 sequences.
    compiled: Utf8BoundedMap,
    /// A stack of nodes that have not yet been compiled into NFA states.
    uncompiled: Vec<Utf8Node>,
}
1735
/// A single not-yet-compiled node in the UTF-8 automaton under construction.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Completed transitions out of this node, each with a known
    /// destination state.
    trans: Vec<Transition>,
    /// A pending transition whose destination state is not yet known. It is
    /// completed via `set_last_transition` once the destination is compiled.
    last: Option<Utf8LastTransition>,
}
1741
/// A byte-range transition that has not yet been assigned a destination
/// state.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    /// The inclusive start of the byte range.
    start: u8,
    /// The inclusive end of the byte range.
    end: u8,
}
1747
1748impl Utf8State {
1749 fn new() -> Utf8State {
1750 Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
1751 }
1752
1753 fn clear(&mut self) {
1754 self.compiled.clear();
1755 self.uncompiled.clear();
1756 }
1757}
1758
impl<'a> Utf8Compiler<'a> {
    /// Create a new UTF-8 compiler that writes its states into `builder`.
    ///
    /// This allocates the shared "target" state (the final state that every
    /// compiled UTF-8 sequence transitions to), resets the scratch state and
    /// seeds the uncompiled stack with the root node.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        // The root node stays on the stack until `finish` pops it.
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Finish compilation by freezing and compiling every remaining node,
    /// and return the sub-graph spanning the compiled root and the shared
    /// target state.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Add one UTF-8 sequence (a sequence of byte ranges) to the automaton.
    ///
    /// Sequences must be added in lexicographic order: the prefix shared
    /// with the previously added sequence is left uncompiled, everything
    /// below it on the stack is compiled, and the new suffix is pushed.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Count how many leading ranges coincide with the pending "last"
        // transitions currently on the uncompiled stack.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        // Sequences are never duplicated, so at least one trailing range
        // must differ from the previously added sequence.
        assert!(prefix_len < ranges.len());
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Compile all uncompiled nodes deeper than `from`, bottom-up. Each
    /// compiled node becomes the destination of its parent's pending
    /// transition.
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        // The deepest pending transition points at the shared target state.
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Compile a frozen node into a sparse NFA state, reusing an existing
    /// state when an equivalent one is found in the bounded cache.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Push the given (non-empty) suffix of byte ranges onto the stack. The
    /// first range becomes the pending transition of the current top node;
    /// each subsequent range starts a fresh node.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        // The top node must have had its previous pending transition frozen
        // by `compile_from` before a new one can be attached.
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Push a fresh node with no transitions onto the uncompiled stack.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pop the top uncompiled node, completing its pending transition with
    /// the given destination, and return its frozen transitions.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pop the root node. It must be the only node remaining and must have
    /// no pending transition (it was frozen by the final `compile_from`).
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Complete the pending transition of the top uncompiled node with the
    /// given destination state, without popping the node.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}
1864
1865impl Utf8Node {
1866 fn set_last_transition(&mut self, next: StateID) {
1867 if let Some(last) = self.last.take() {
1868 self.trans.push(Transition {
1869 start: last.start,
1870 end: last.end,
1871 next,
1872 });
1873 }
1874 }
1875}
1876
#[cfg(test)]
mod tests {
    use alloc::{vec, vec::Vec};

    use crate::{
        nfa::thompson::{SparseTransitions, State, Transition, NFA},
        util::primitives::{PatternID, SmallIndex, StateID},
    };

    use super::*;

    /// Builds an NFA from the given pattern with capture states and the
    /// unanchored prefix disabled, so that tests can assert exact state
    /// lists.
    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    /// Shorthand for creating a `PatternID`. Panics if `id` is invalid.
    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    /// Shorthand for creating a `StateID`. Panics if `id` is invalid.
    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    /// A state matching exactly one byte, transitioning to `next`.
    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    /// A state matching any byte in `start..=end`, transitioning to `next`.
    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    /// A state with several byte-range transitions, given as
    /// `(start, end, next)` triples.
    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end, next)| Transition {
                start,
                end,
                next: sid(next),
            })
            .collect();
        State::Sparse(SparseTransitions { transitions })
    }

    /// An unconditional epsilon split with exactly two alternates.
    fn s_bin_union(alt1: usize, alt2: usize) -> State {
        State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) }
    }

    /// An unconditional epsilon split with any number of alternates.
    fn s_union(alts: &[usize]) -> State {
        State::Union {
            alternates: alts
                .iter()
                .map(|&id| sid(id))
                .collect::<Vec<StateID>>()
                .into_boxed_slice(),
        }
    }

    /// A capture state recording the current position into the given slot
    /// before transitioning to `next`.
    fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State {
        State::Capture {
            next: sid(next),
            pattern_id: pid(pattern),
            group_index: SmallIndex::new(index).unwrap(),
            slot: SmallIndex::new(slot).unwrap(),
        }
    }

    /// A state that can never lead to a match.
    fn s_fail() -> State {
        State::Fail
    }

    /// A match state for the given pattern ID.
    fn s_match(id: usize) -> State {
        State::Match { pattern_id: pid(id) }
    }

    // Test that building an unanchored NFA has an appropriate `(?s:.)*?`
    // prefix.
    #[test]
    fn compile_unanchored_prefix() {
        let nfa = NFA::compiler()
            .configure(NFA::config().which_captures(WhichCaptures::None))
            .build(r"a")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_bin_union(2, 1),
                s_range(0, 255, 0),
                s_byte(b'a', 3),
                s_match(0),
            ]
        );
    }

    // An empty pattern compiles to a lone match state.
    #[test]
    fn compile_empty() {
        assert_eq!(build("").states(), &[s_match(0),]);
    }

    // Literals compile to chains of single-byte states, including the
    // multi-byte UTF-8 encoding of non-ASCII codepoints.
    #[test]
    fn compile_literal() {
        assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]);
        assert_eq!(
            build("ab").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),]
        );
        assert_eq!(
            build("ā˜ƒ").states(),
            &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)]
        );

        // Check that non-UTF-8 literals work.
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .syntax(crate::util::syntax::Config::new().utf8(false))
            .build(r"(?-u)\xFF")
            .unwrap();
        assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]);
    }

    // ASCII classes compile to a single range state, or a sparse state when
    // the class has multiple disjoint ranges.
    #[test]
    fn compile_class_ascii() {
        assert_eq!(
            build(r"[a-z]").states(),
            &[s_range(b'a', b'z', 1), s_match(0),]
        );
        assert_eq!(
            build(r"[x-za-c]").states(),
            &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)]
        );
    }

    // Unicode classes compile to UTF-8 automata with shared suffix states.
    #[test]
    #[cfg(not(miri))]
    fn compile_class_unicode() {
        assert_eq!(
            build(r"[\u03B1-\u03B4]").states(),
            &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)]
        );
        assert_eq!(
            build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(),
            &[
                s_range(0xB1, 0xB4, 5),
                s_range(0x99, 0x9E, 5),
                s_byte(0xA4, 1),
                s_byte(0x9F, 2),
                s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]),
                s_match(0),
            ]
        );
        assert_eq!(
            build(r"[a-zā˜ƒ]").states(),
            &[
                s_byte(0x83, 3),
                s_byte(0x98, 0),
                s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]),
                s_match(0),
            ]
        );
    }

    // Greedy `a?` tries the 'a' branch first; lazy `a??` tries skipping
    // first. The two compilations differ only in alternate order.
    #[test]
    fn compile_repetition() {
        assert_eq!(
            build(r"a?").states(),
            &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),]
        );
        assert_eq!(
            build(r"a??").states(),
            &[s_bin_union(2, 1), s_byte(b'a', 2), s_match(0),]
        );
    }

    // With captures disabled, groups add no states of their own.
    #[test]
    fn compile_group() {
        assert_eq!(
            build(r"ab+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)]
        );
        assert_eq!(
            build(r"(ab)+").states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)]
        );
    }

    // Alternations of single bytes collapse into range/sparse states;
    // otherwise they compile to binary unions.
    #[test]
    fn compile_alternation() {
        assert_eq!(
            build(r"a|b").states(),
            &[s_range(b'a', b'b', 1), s_match(0)]
        );
        assert_eq!(
            build(r"ab|cd").states(),
            &[
                s_byte(b'b', 3),
                s_byte(b'd', 3),
                s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]),
                s_match(0)
            ],
        );
        assert_eq!(
            build(r"|b").states(),
            &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)]
        );
        assert_eq!(
            build(r"a|").states(),
            &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)]
        );
    }

    // This tests the use of a non-binary union, i.e., a state with more than
    // 2 unconditional epsilon transitions. The only place they tend to appear
    // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union'
    // and 'sparse' tend to cover all other cases of alternation.
    #[test]
    fn compile_non_binary_union() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .reverse(true)
                    .shrink(false)
                    .unanchored_prefix(false),
            )
            .build(r"[\u1000\u2000\u3000]")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_union(&[3, 6, 9]),
                s_byte(0xE1, 10),
                s_byte(0x80, 1),
                s_byte(0x80, 2),
                s_byte(0xE2, 10),
                s_byte(0x80, 4),
                s_byte(0x80, 5),
                s_byte(0xE3, 10),
                s_byte(0x80, 7),
                s_byte(0x80, 8),
                s_match(0),
            ]
        );
    }

    // Multi-pattern builds get a single shared start state that unions the
    // start states of each pattern.
    #[test]
    fn compile_many_start_pattern() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build_many(&["a", "b"])
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_byte(b'a', 1),
                s_match(0),
                s_byte(b'b', 3),
                s_match(1),
                s_bin_union(0, 2),
            ]
        );
        assert_eq!(nfa.start_anchored().as_usize(), 4);
        assert_eq!(nfa.start_unanchored().as_usize(), 4);
        // Test that the start states for each individual pattern are correct.
        assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0));
        assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2));
    }

    // This tests that our compiler can handle an empty character class. At the
    // time of writing, the regex parser forbids it, so the only way to test it
    // is to provide a hand written HIR.
    #[test]
    fn empty_class_bytes() {
        use regex_syntax::hir::{Class, ClassBytes, Hir};

        let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // Like empty_class_bytes, but for a Unicode class.
    #[test]
    fn empty_class_unicode() {
        use regex_syntax::hir::{Class, ClassUnicode, Hir};

        let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![])));
        let config = NFA::config()
            .which_captures(WhichCaptures::None)
            .unanchored_prefix(false);
        let nfa =
            NFA::compiler().configure(config).build_from_hir(&hir).unwrap();
        assert_eq!(nfa.states(), &[s_fail(), s_match(0)]);
    }

    // With `WhichCaptures::All`, both the implicit group 0 and the explicit
    // group get start/end capture states.
    #[test]
    fn compile_captures_all() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::All),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_cap(3, 0, 1, 2),
                s_byte(b'b', 4),
                s_cap(5, 0, 1, 3),
                s_byte(b'c', 6),
                s_cap(7, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(2, ginfo.all_group_len());
    }

    // With `WhichCaptures::Implicit`, only group 0 gets capture states.
    #[test]
    fn compile_captures_implicit() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::Implicit),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[
                s_cap(1, 0, 0, 0),
                s_byte(b'a', 2),
                s_byte(b'b', 3),
                s_byte(b'c', 4),
                s_cap(5, 0, 0, 1),
                s_match(0)
            ]
        );
        let ginfo = nfa.group_info();
        assert_eq!(1, ginfo.all_group_len());
    }

    // With `WhichCaptures::None`, no capture states are added at all.
    #[test]
    fn compile_captures_none() {
        let nfa = NFA::compiler()
            .configure(
                NFA::config()
                    .unanchored_prefix(false)
                    .which_captures(WhichCaptures::None),
            )
            .build("a(b)c")
            .unwrap();
        assert_eq!(
            nfa.states(),
            &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)]
        );
        let ginfo = nfa.group_info();
        assert_eq!(0, ginfo.all_group_len());
    }
}
2266