dense.rs - Codebrowser

1	/*!
2	Types and routines specific to dense DFAs.
3
4	This module is the home of [`dense::DFA`](DFA).
5
6	This module also contains a [`dense::Builder`](Builder) and a
7	[`dense::Config`](Config) for building and configuring a dense DFA.
8	*/
9
10	#[cfg(feature = "dfa-build")]
11	use core::cmp;
12	use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
13
14	#[cfg(feature = "dfa-build")]
15	use alloc::{
16	collections::{BTreeMap, BTreeSet},
17	vec,
18	vec::Vec,
19	};
20
21	#[cfg(feature = "dfa-build")]
22	use crate::{
23	dfa::{
24	accel::Accel, determinize, minimize::Minimizer, remapper::Remapper,
25	sparse,
26	},
27	nfa::thompson,
28	util::{look::LookMatcher, search::MatchKind},
29	};
30	use crate::{
31	dfa::{
32	accel::Accels,
33	automaton::{fmt_state_indicator, Automaton, StartError},
34	special::Special,
35	start::StartKind,
36	DEAD,
37	},
38	util::{
39	alphabet::{self, ByteClasses, ByteSet},
40	int::{Pointer, Usize},
41	prefilter::Prefilter,
42	primitives::{PatternID, StateID},
43	search::Anchored,
44	start::{self, Start, StartByteMap},
45	wire::{self, DeserializeError, Endian, SerializeError},
46	},
47	};
48
/// The label that is pre-pended to a serialized DFA.
const LABEL: &str = "rust-regex-automata-dfa-dense";

/// The format version of dense regexes. This version gets incremented when a
/// change occurs. A change may not necessarily be a breaking change, but the
/// version does permit good error messages in the case where a breaking change
/// is made.
const VERSION: u32 = 2;
57
/// Configuration knobs for compiling a dense DFA.
///
/// [`DFA::config`] is a convenience alias for [`Config::new`]; going through
/// it usually means the `Config` type never has to be imported directly.
///
/// This is a plain data object that is normally handed to
/// [`dense::Builder::configure`](self::Builder::configure).
///
/// With the default configuration, a search never returns a "quit" error.
/// A search can still fail, however, when an [`Anchored::Pattern`] mode is
/// requested via [`Input`](crate::Input) while
/// [`Config::starts_for_each_pattern`] is not enabled (and it is not enabled
/// by default).
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug, Default)]
pub struct Config {
    // Like the other configuration types in this crate, every knob is
    // wrapped in an `Option`. `None` means "not set", which is distinct from
    // "explicitly set to the default value". That distinction is what lets
    // several configurations be combined without defaults clobbering values
    // the caller chose on purpose (see the 'overwrite' method).
    //
    // Each field is documented on its corresponding setter method.
    accelerate: Option<bool>,
    pre: Option<Option<Prefilter>>,
    minimize: Option<bool>,
    match_kind: Option<MatchKind>,
    start_kind: Option<StartKind>,
    starts_for_each_pattern: Option<bool>,
    byte_classes: Option<bool>,
    unicode_word_boundary: Option<bool>,
    quitset: Option<ByteSet>,
    specialize_start_states: Option<bool>,
    dfa_size_limit: Option<Option<usize>>,
    determinize_size_limit: Option<Option<usize>>,
}
95
96	#[cfg(feature = "dfa-build")]
97	impl Config {
98	/// Return a new default dense DFA compiler configuration.
99	pub fn new() -> Config {
100	Config::default()
101	}
102
103	/// Enable state acceleration.
104	///
105	/// When enabled, DFA construction will analyze each state to determine
106	/// whether it is eligible for simple acceleration. Acceleration typically
107	/// occurs when most of a state's transitions loop back to itself, leaving
108	/// only a select few bytes that will exit the state. When this occurs,
109	/// other routines like `memchr` can be used to look for those bytes which
110	/// may be much faster than traversing the DFA.
111	///
112	/// Callers may elect to disable this if consistent performance is more
113	/// desirable than variable performance. Namely, acceleration can sometimes
114	/// make searching slower than it otherwise would be if the transitions
115	/// that leave accelerated states are traversed frequently.
116	///
117	/// See [`Automaton::accelerator`] for an example.
118	///
119	/// This is enabled by default.
120	pub fn accelerate(mut self, yes: bool) -> Config {
121	self.accelerate = Some(yes);
122	self
123	}
124
125	/// Set a prefilter to be used whenever a start state is entered.
126	///
127	/// A [`Prefilter`] in this context is meant to accelerate searches by
128	/// looking for literal prefixes that every match for the corresponding
129	/// pattern (or patterns) must start with. Once a prefilter produces a
130	/// match, the underlying search routine continues on to try and confirm
131	/// the match.
132	///
133	/// Be warned that setting a prefilter does not guarantee that the search
134	/// will be faster. While it's usually a good bet, if the prefilter
135	/// produces a lot of false positive candidates (i.e., positions matched
136	/// by the prefilter but not by the regex), then the overall result can
137	/// be slower than if you had just executed the regex engine without any
138	/// prefilters.
139	///
140	/// Note that unless [`Config::specialize_start_states`] has been
141	/// explicitly set, then setting this will also enable (when `pre` is
142	/// `Some`) or disable (when `pre` is `None`) start state specialization.
143	/// This occurs because without start state specialization, a prefilter
144	/// is likely to be less effective. And without a prefilter, start state
145	/// specialization is usually pointless.
146	///
147	/// WARNING:* Note that prefilters are not preserved as part of*
148	/// serialization. Serializing a DFA will drop its prefilter.
149	///
150	/// By default no prefilter is set.
151	///
152	/// # Example
153	///
154	/// ```
155	/// use regex_automata::{
156	/// dfa::{dense::DFA, Automaton},
157	/// util::prefilter::Prefilter,
158	/// Input, HalfMatch, MatchKind,
159	/// };
160	///
161	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
162	/// let re = DFA::builder()
163	/// .configure(DFA::config().prefilter(pre))
164	/// .build(r"(foo\|bar)[a-z]+")?;
165	/// let input = Input::new("foo1 barfox bar");
166	/// assert_eq!(
167	/// Some(HalfMatch::must(`0`, `11`)),
168	/// re.try_search_fwd(&input)?,
169	/// );
170	///
171	/// # Ok::<(), Box<dyn std::error::Error>>(())
172	/// ```
173	///
174	/// Be warned though that an incorrect prefilter can lead to incorrect
175	/// results!
176	///
177	/// ```
178	/// use regex_automata::{
179	/// dfa::{dense::DFA, Automaton},
180	/// util::prefilter::Prefilter,
181	/// Input, HalfMatch, MatchKind,
182	/// };
183	///
184	/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
185	/// let re = DFA::builder()
186	/// .configure(DFA::config().prefilter(pre))
187	/// .build(r"(foo\|bar)[a-z]+")?;
188	/// let input = Input::new("foo1 barfox bar");
189	/// assert_eq!(
190	/// // No match reported even though there clearly is one!
191	/// None,
192	/// re.try_search_fwd(&input)?,
193	/// );
194	///
195	/// # Ok::<(), Box<dyn std::error::Error>>(())
196	/// ```
197	pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
198	self.pre = Some(pre);
199	if self.specialize_start_states.is_none() {
200	self.specialize_start_states =
201	Some(self.get_prefilter().is_some());
202	}
203	self
204	}
205
206	/// Minimize the DFA.
207	///
208	/// When enabled, the DFA built will be minimized such that it is as small
209	/// as possible.
210	///
211	/// Whether one enables minimization or not depends on the types of costs
212	/// you're willing to pay and how much you care about its benefits. In
213	/// particular, minimization has worst case `O(nklogn)` time and `O(kn)`*
214	/// space, where `n` is the number of DFA states and `k` is the alphabet
215	/// size. In practice, minimization can be quite costly in terms of both
216	/// space and time, so it should only be done if you're willing to wait
217	/// longer to produce a DFA. In general, you might want a minimal DFA in
218	/// the following circumstances:
219	///
220	/// 1. You would like to optimize for the size of the automaton. This can
221	/// manifest in one of two ways. Firstly, if you're converting the
222	/// DFA into Rust code (or a table embedded in the code), then a minimal
223	/// DFA will translate into a corresponding reduction in code size, and
224	/// thus, also the final compiled binary size. Secondly, if you are
225	/// building many DFAs and putting them on the heap, you'll be able to
226	/// fit more if they are smaller. Note though that building a minimal
227	/// DFA itself requires additional space; you only realize the space
228	/// savings once the minimal DFA is constructed (at which point, the
229	/// space used for minimization is freed).
230	/// 2. You've observed that a smaller DFA results in faster match
231	/// performance. Naively, this isn't guaranteed since there is no
232	/// inherent difference between matching with a bigger-than-minimal
233	/// DFA and a minimal DFA. However, a smaller DFA may make use of your
234	/// CPU's cache more efficiently.
235	/// 3. You are trying to establish an equivalence between regular
236	/// languages. The standard method for this is to build a minimal DFA
237	/// for each language and then compare them. If the DFAs are equivalent
238	/// (up to state renaming), then the languages are equivalent.
239	///
240	/// Typically, minimization only makes sense as an offline process. That
241	/// is, one might minimize a DFA before serializing it to persistent
242	/// storage. In practical terms, minimization can take around an order of
243	/// magnitude more time than compiling the initial DFA via determinization.
244	///
245	/// This option is disabled by default.
246	pub fn minimize(mut self, yes: bool) -> Config {
247	self.minimize = Some(yes);
248	self
249	}
250
251	/// Set the desired match semantics.
252	///
253	/// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
254	/// match semantics of Perl-like regex engines. That is, when multiple
255	/// patterns would match at the same leftmost position, the pattern that
256	/// appears first in the concrete syntax is chosen.
257	///
258	/// Currently, the only other kind of match semantics supported is
259	/// [`MatchKind::All`]. This corresponds to classical DFA construction
260	/// where all possible matches are added to the DFA.
261	///
262	/// Typically, `All` is used when one wants to execute an overlapping
263	/// search and `LeftmostFirst` otherwise. In particular, it rarely makes
264	/// sense to use `All` with the various "leftmost" find routines, since the
265	/// leftmost routines depend on the `LeftmostFirst` automata construction
266	/// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA
267	/// as a way to terminate the search and report a match. `LeftmostFirst`
268	/// also supports non-greedy matches using this strategy where as `All`
269	/// does not.
270	///
271	/// # Example: overlapping search
272	///
273	/// This example shows the typical use of `MatchKind::All`, which is to
274	/// report overlapping matches.
275	///
276	/// ```
277	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
278	/// use regex_automata::{
279	/// dfa::{Automaton, OverlappingState, dense},
280	/// HalfMatch, Input, MatchKind,
281	/// };
282	///
283	/// let dfa = dense::Builder::new()
284	/// .configure(dense::Config::new().match_kind(MatchKind::All))
285	/// .build_many(&[r"\w+$", r"\S+$"])?;
286	/// let input = Input::new("@foo");
287	/// let mut state = OverlappingState::start();
288	///
289	/// let expected = Some(HalfMatch::must(`1`, `4`));
290	/// dfa.try_search_overlapping_fwd(&input, &mut state)?;
291	/// assert_eq!(expected, state.get_match());
292	///
293	/// // The first pattern also matches at the same position, so re-running
294	/// // the search will yield another match. Notice also that the first
295	/// // pattern is returned after the second. This is because the second
296	/// // pattern begins its match before the first, is therefore an earlier
297	/// // match and is thus reported first.
298	/// let expected = Some(HalfMatch::must(`0`, `4`));
299	/// dfa.try_search_overlapping_fwd(&input, &mut state)?;
300	/// assert_eq!(expected, state.get_match());
301	///
302	/// # Ok::<(), Box<dyn std::error::Error>>(())
303	/// ```
304	///
305	/// # Example: reverse automaton to find start of match
306	///
307	/// Another example for using `MatchKind::All` is for constructing a
308	/// reverse automaton to find the start of a match. `All` semantics are
309	/// used for this in order to find the longest possible match, which
310	/// corresponds to the leftmost starting position.
311	///
312	/// Note that if you need the starting position then
313	/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for
314	/// you, so it's usually not necessary to do this yourself.
315	///
316	/// ```
317	/// use regex_automata::{
318	/// dfa::{dense, Automaton, StartKind},
319	/// nfa::thompson::NFA,
320	/// Anchored, HalfMatch, Input, MatchKind,
321	/// };
322	///
323	/// let haystack = "123foobar456".as_bytes();
324	/// let pattern = r"[a-z]+r";
325	///
326	/// let dfa_fwd = dense::DFA::new(pattern)?;
327	/// let dfa_rev = dense::Builder::new()
328	/// .thompson(NFA::config().reverse(`true`))
329	/// .configure(dense::Config::new()
330	/// // This isn't strictly necessary since both anchored and
331	/// // unanchored searches are supported by default. But since
332	/// // finding the start-of-match only requires anchored searches,
333	/// // we can get rid of the unanchored configuration and possibly
334	/// // slim down our DFA considerably.
335	/// .start_kind(StartKind::Anchored)
336	/// .match_kind(MatchKind::All)
337	/// )
338	/// .build(pattern)?;
339	/// let expected_fwd = HalfMatch::must(`0`, `9`);
340	/// let expected_rev = HalfMatch::must(`0`, `3`);
341	/// let got_fwd = dfa_fwd.try_search_fwd(&Input::new(haystack))?.unwrap();
342	/// // Here we don't specify the pattern to search for since there's only
343	/// // one pattern and we're doing a leftmost search. But if this were an
344	/// // overlapping search, you'd need to specify the pattern that matched
345	/// // in the forward direction. (Otherwise, you might wind up finding the
346	/// // starting position of a match of some other pattern.) That in turn
347	/// // requires building the reverse automaton with starts_for_each_pattern
348	/// // enabled. Indeed, this is what Regex does internally.
349	/// let input = Input::new(haystack)
350	/// .range(..got_fwd.offset())
351	/// .anchored(Anchored::Yes);
352	/// let got_rev = dfa_rev.try_search_rev(&input)?.unwrap();
353	/// assert_eq!(expected_fwd, got_fwd);
354	/// assert_eq!(expected_rev, got_rev);
355	///
356	/// # Ok::<(), Box<dyn std::error::Error>>(())
357	/// ```
358	pub fn match_kind(mut self, kind: MatchKind) -> Config {
359	self.match_kind = Some(kind);
360	self
361	}
362
363	/// The type of starting state configuration to use for a DFA.
364	///
365	/// By default, the starting state configuration is [`StartKind::Both`].
366	///
367	/// # Example
368	///
369	/// ```
370	/// use regex_automata::{
371	/// dfa::{dense::DFA, Automaton, StartKind},
372	/// Anchored, HalfMatch, Input,
373	/// };
374	///
375	/// let haystack = "quux foo123";
376	/// let expected = HalfMatch::must(`0`, `11`);
377	///
378	/// // By default, DFAs support both anchored and unanchored searches.
379	/// let dfa = DFA::new(r"[0-9]+")?;
380	/// let input = Input::new(haystack);
381	/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
382	///
383	/// // But if we only need anchored searches, then we can build a DFA
384	/// // that only supports anchored searches. This leads to a smaller DFA
385	/// // (potentially significantly smaller in some cases), but a DFA that
386	/// // will panic if you try to use it with an unanchored search.
387	/// let dfa = DFA::builder()
388	/// .configure(DFA::config().start_kind(StartKind::Anchored))
389	/// .build(r"[0-9]+")?;
390	/// let input = Input::new(haystack)
391	/// .range(`8`..)
392	/// .anchored(Anchored::Yes);
393	/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
394	///
395	/// # Ok::<(), Box<dyn std::error::Error>>(())
396	/// ```
397	pub fn start_kind(mut self, kind: StartKind) -> Config {
398	self.start_kind = Some(kind);
399	self
400	}
401
402	/// Whether to compile a separate start state for each pattern in the
403	/// automaton.
404	///
405	/// When enabled, a separate anchored* start state is added for each*
406	/// pattern in the DFA. When this start state is used, then the DFA will
407	/// only search for matches for the pattern specified, even if there are
408	/// other patterns in the DFA.
409	///
410	/// The main downside of this option is that it can potentially increase
411	/// the size of the DFA and/or increase the time it takes to build the DFA.
412	///
413	/// There are a few reasons one might want to enable this (it's disabled
414	/// by default):
415	///
416	/// 1. When looking for the start of an overlapping match (using a
417	/// reverse DFA), doing it correctly requires starting the reverse search
418	/// using the starting state of the pattern that matched in the forward
419	/// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex),
420	/// it will automatically enable this option when building the reverse DFA
421	/// internally.
422	/// 2. When you want to use a DFA with multiple patterns to both search
423	/// for matches of any pattern or to search for anchored matches of one
424	/// particular pattern while using the same DFA. (Otherwise, you would need
425	/// to compile a new DFA for each pattern.)
426	/// 3. Since the start states added for each pattern are anchored, if you
427	/// compile an unanchored DFA with one pattern while also enabling this
428	/// option, then you can use the same DFA to perform anchored or unanchored
429	/// searches. The latter you get with the standard search APIs. The former
430	/// you get from the various `_at` search methods that allow you specify a
431	/// pattern ID to search for.
432	///
433	/// By default this is disabled.
434	///
435	/// # Example
436	///
437	/// This example shows how to use this option to permit the same DFA to
438	/// run both anchored and unanchored searches for a single pattern.
439	///
440	/// ```
441	/// use regex_automata::{
442	/// dfa::{dense, Automaton},
443	/// Anchored, HalfMatch, PatternID, Input,
444	/// };
445	///
446	/// let dfa = dense::Builder::new()
447	/// .configure(dense::Config::new().starts_for_each_pattern(`true`))
448	/// .build(r"foo[0-9]+")?;
449	/// let haystack = "quux foo123";
450	///
451	/// // Here's a normal unanchored search. Notice that we use 'None' for the
452	/// // pattern ID. Since the DFA was built as an unanchored machine, it
453	/// // use its default unanchored starting state.
454	/// let expected = HalfMatch::must(`0`, `11`);
455	/// let input = Input::new(haystack);
456	/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
457	/// // But now if we explicitly specify the pattern to search ('0' being
458	/// // the only pattern in the DFA), then it will use the starting state
459	/// // for that specific pattern which is always anchored. Since the
460	/// // pattern doesn't have a match at the beginning of the haystack, we
461	/// // find nothing.
462	/// let input = Input::new(haystack)
463	/// .anchored(Anchored::Pattern(PatternID::must(`0`)));
464	/// assert_eq!(None, dfa.try_search_fwd(&input)?);
465	/// // And finally, an anchored search is not the same as putting a '^' at
466	/// // beginning of the pattern. An anchored search can only match at the
467	/// // beginning of the search, which we can change:
468	/// let input = Input::new(haystack)
469	/// .anchored(Anchored::Pattern(PatternID::must(`0`)))
470	/// .range(`5`..);
471	/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
472	///
473	/// # Ok::<(), Box<dyn std::error::Error>>(())
474	/// ```
475	pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
476	self.starts_for_each_pattern = Some(yes);
477	self
478	}
479
480	/// Whether to attempt to shrink the size of the DFA's alphabet or not.
481	///
482	/// This option is enabled by default and should never be disabled unless
483	/// one is debugging a generated DFA.
484	///
485	/// When enabled, the DFA will use a map from all possible bytes to their
486	/// corresponding equivalence class. Each equivalence class represents a
487	/// set of bytes that does not discriminate between a match and a non-match
488	/// in the DFA. For example, the pattern `[ab]+` has at least two
489	/// equivalence classes: a set containing `a` and `b` and a set containing
490	/// every byte except for `a` and `b`. `a` and `b` are in the same
491	/// equivalence class because they never discriminate between a match and a
492	/// non-match.
493	///
494	/// The advantage of this map is that the size of the transition table
495	/// can be reduced drastically from `#states 256 * sizeof(StateID)` to*
496	/// `#states k * sizeof(StateID)` where `k` is the number of equivalence*
497	/// classes (rounded up to the nearest power of 2). As a result, total
498	/// space usage can decrease substantially. Moreover, since a smaller
499	/// alphabet is used, DFA compilation becomes faster as well.
500	///
501	/// WARNING:* This is only useful for debugging DFAs. Disabling this*
502	/// does not yield any speed advantages. Namely, even when this is
503	/// disabled, a byte class map is still used while searching. The only
504	/// difference is that every byte will be forced into its own distinct
505	/// equivalence class. This is useful for debugging the actual generated
506	/// transitions because it lets one see the transitions defined on actual
507	/// bytes instead of the equivalence classes.
508	pub fn byte_classes(mut self, yes: bool) -> Config {
509	self.byte_classes = Some(yes);
510	self
511	}
512
513	/// Heuristically enable Unicode word boundaries.
514	///
515	/// When set, this will attempt to implement Unicode word boundaries as if
516	/// they were ASCII word boundaries. This only works when the search input
517	/// is ASCII only. If a non-ASCII byte is observed while searching, then a
518	/// [`MatchError::quit`](crate::MatchError::quit) error is returned.
519	///
520	/// A possible alternative to enabling this option is to simply use an
521	/// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
522	/// option is if you absolutely need Unicode support. This option lets one
523	/// use a fast search implementation (a DFA) for some potentially very
524	/// common cases, while providing the option to fall back to some other
525	/// regex engine to handle the general case when an error is returned.
526	///
527	/// If the pattern provided has no Unicode word boundary in it, then this
528	/// option has no effect. (That is, quitting on a non-ASCII byte only
529	/// occurs when this option is enabled _and_ a Unicode word boundary is
530	/// present in the pattern.)
531	///
532	/// This is almost equivalent to setting all non-ASCII bytes to be quit
533	/// bytes. The only difference is that this will cause non-ASCII bytes to
534	/// be quit bytes _only_ when a Unicode word boundary is present in the
535	/// pattern.
536	///
537	/// When enabling this option, callers _must_ be prepared to handle
538	/// a [`MatchError`](crate::MatchError) error during search.
539	/// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
540	/// to using the `try_` suite of methods. Alternatively, if
541	/// callers can guarantee that their input is ASCII only, then a
542	/// [`MatchError::quit`](crate::MatchError::quit) error will never be
543	/// returned while searching.
544	///
545	/// This is disabled by default.
546	///
547	/// # Example
548	///
549	/// This example shows how to heuristically enable Unicode word boundaries
550	/// in a pattern. It also shows what happens when a search comes across a
551	/// non-ASCII byte.
552	///
553	/// ```
554	/// use regex_automata::{
555	/// dfa::{Automaton, dense},
556	/// HalfMatch, Input, MatchError,
557	/// };
558	///
559	/// let dfa = dense::Builder::new()
560	/// .configure(dense::Config::new().unicode_word_boundary(`true`))
561	/// .build(r"\b[0-9]+\b")?;
562	///
563	/// // The match occurs before the search ever observes the snowman
564	/// // character, so no error occurs.
565	/// let haystack = "foo 123 ☃".as_bytes();
566	/// let expected = Some(HalfMatch::must(`0`, `7`));
567	/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
568	/// assert_eq!(expected, got);
569	///
570	/// // Notice that this search fails, even though the snowman character
571	/// // occurs after the ending match offset. This is because search
572	/// // routines read one byte past the end of the search to account for
573	/// // look-around, and indeed, this is required here to determine whether
574	/// // the trailing \b matches.
575	/// let haystack = "foo 123 ☃".as_bytes();
576	/// let expected = MatchError::quit(`0xE2`, `8`);
577	/// let got = dfa.try_search_fwd(&Input::new(haystack));
578	/// assert_eq!(Err(expected), got);
579	///
580	/// // Another example is executing a search where the span of the haystack
581	/// // we specify is all ASCII, but there is non-ASCII just before it. This
582	/// // correctly also reports an error.
583	/// let input = Input::new("β123").range(`2`..);
584	/// let expected = MatchError::quit(`0xB2`, `1`);
585	/// let got = dfa.try_search_fwd(&input);
586	/// assert_eq!(Err(expected), got);
587	///
588	/// // And similarly for the trailing word boundary.
589	/// let input = Input::new("123β").range(..`3`);
590	/// let expected = MatchError::quit(`0xCE`, `3`);
591	/// let got = dfa.try_search_fwd(&input);
592	/// assert_eq!(Err(expected), got);
593	///
594	/// # Ok::<(), Box<dyn std::error::Error>>(())
595	/// ```
596	pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
597	// We have a separate option for this instead of just setting the
598	// appropriate quit bytes here because we don't want to set quit bytes
599	// for every regex. We only want to set them when the regex contains a
600	// Unicode word boundary.
601	self.unicode_word_boundary = Some(yes);
602	self
603	}
604
605	/// Add a "quit" byte to the DFA.
606	///
607	/// When a quit byte is seen during search time, then search will return
608	/// a [`MatchError::quit`](crate::MatchError::quit) error indicating the
609	/// offset at which the search stopped.
610	///
611	/// A quit byte will always overrule any other aspects of a regex. For
612	/// example, if the `x` byte is added as a quit byte and the regex `\w` is
613	/// used, then observing `x` will cause the search to quit immediately
614	/// despite the fact that `x` is in the `\w` class.
615	///
616	/// This mechanism is primarily useful for heuristically enabling certain
617	/// features like Unicode word boundaries in a DFA. Namely, if the input
618	/// to search is ASCII, then a Unicode word boundary can be implemented
619	/// via an ASCII word boundary with no change in semantics. Thus, a DFA
620	/// can attempt to match a Unicode word boundary but give up as soon as it
621	/// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
622	/// to be quit bytes, then Unicode word boundaries will be permitted when
623	/// building DFAs. Of course, callers should enable
624	/// [`Config::unicode_word_boundary`] if they want this behavior instead.
625	/// (The advantage being that non-ASCII quit bytes will only be added if a
626	/// Unicode word boundary is in the pattern.)
627	///
628	/// When enabling this option, callers _must_ be prepared to handle a
629	/// [`MatchError`](crate::MatchError) error during search. When using a
630	/// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the
631	/// `try_` suite of methods.
632	///
633	/// By default, there are no quit bytes set.
634	///
635	/// # Panics
636	///
637	/// This panics if heuristic Unicode word boundaries are enabled and any
638	/// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
639	/// Unicode word boundaries requires setting every non-ASCII byte to a quit
640	/// byte. So if the caller attempts to undo any of that, then this will
641	/// panic.
642	///
643	/// # Example
644	///
645	/// This example shows how to cause a search to terminate if it sees a
646	/// `\n` byte. This could be useful if, for example, you wanted to prevent
647	/// a user supplied pattern from matching across a line boundary.
648	///
649	/// ```
650	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
651	/// use regex_automata::{dfa::{Automaton, dense}, Input, MatchError};
652	///
653	/// let dfa = dense::Builder::new()
654	/// .configure(dense::Config::new().quit(b'`\n`', `true`))
655	/// .build(r"foo\p{any}+bar")?;
656	///
657	/// let haystack = "foo`\n`bar".as_bytes();
658	/// // Normally this would produce a match, since \p{any} contains '\n'.
659	/// // But since we instructed the automaton to enter a quit state if a
660	/// // '\n' is observed, this produces a match error instead.
661	/// let expected = MatchError::quit(b'`\n`', `3`);
662	/// let got = dfa.try_search_fwd(&Input::new(haystack)).unwrap_err();
663	/// assert_eq!(expected, got);
664	///
665	/// # Ok::<(), Box<dyn std::error::Error>>(())
666	/// ```
667	pub fn quit(mut self, byte: u8, yes: bool) -> Config {
668	if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
669	panic!(
670	"cannot set non-ASCII byte to be non-quit when \
671	Unicode word boundaries are enabled"
672	);
673	}
674	if self.quitset.is_none() {
675	self.quitset = Some(ByteSet::empty());
676	}
677	if yes {
678	self.quitset.as_mut().unwrap().add(byte);
679	} else {
680	self.quitset.as_mut().unwrap().remove(byte);
681	}
682	self
683	}
684
685	/// Enable specializing start states in the DFA.
686	///
687	/// When start states are specialized, an implementor of a search routine
688	/// using a lazy DFA can tell when the search has entered a starting state.
689	/// When start states aren't specialized, then it is impossible to know
690	/// whether the search has entered a start state.
691	///
692	/// Ideally, this option wouldn't need to exist and we could always
693	/// specialize start states. The problem is that start states can be quite
694	/// active. This in turn means that an efficient search routine is likely
695	/// to ping-pong between a heavily optimized hot loop that handles most
696	/// states and to a less optimized specialized handling of start states.
697	/// This causes branches to get heavily mispredicted and overall can
698	/// materially decrease throughput. Therefore, specializing start states
699	/// should only be enabled when it is needed.
700	///
701	/// Knowing whether a search is in a start state is typically useful when a
702	/// prefilter is active for the search. A prefilter is typically only run
703	/// when in a start state and a prefilter can greatly accelerate a search.
704	/// Therefore, the possible cost of specializing start states is worth it
705	/// in this case. Otherwise, if you have no prefilter, there is likely no
706	/// reason to specialize start states.
707	///
708	/// This is disabled by default, but note that it is automatically
709	/// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
710	/// `specialize_start_states` has already been set, [`Config::prefilter`]
711	/// will automatically enable or disable it based on whether a prefilter
712	/// is present or not, respectively. This is done because a prefilter's
713	/// effectiveness is rooted in being executed whenever the DFA is in a
714	/// start state, and that's only possible to do when they are specialized.
715	///
716	/// Note that it is plausibly reasonable to _disable_ this option
717	/// explicitly while _enabling_ a prefilter. In that case, a prefilter
718	/// will still be run at the beginning of a search, but never again. This
719	/// in theory could strike a good balance if you're in a situation where a
720	/// prefilter is likely to produce many false positive candidates.
721	///
722	/// # Example
723	///
724	/// This example shows how to enable start state specialization and then
725	/// shows how to check whether a state is a start state or not.
726	///
727	/// ```
728	/// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
729	///
730	/// let dfa = DFA::builder()
///     .configure(DFA::config().specialize_start_states(true))
732	/// .build(r"[a-z]+")?;
733	///
734	/// let haystack = "123 foobar 4567".as_bytes();
735	/// let sid = dfa.start_state_forward(&Input::new(haystack))?;
736	/// // The ID returned by 'start_state_forward' will always be tagged as
737	/// // a start state when start state specialization is enabled.
738	/// assert!(dfa.is_special_state(sid));
739	/// assert!(dfa.is_start_state(sid));
740	///
741	/// # Ok::<(), Box<dyn std::error::Error>>(())
742	/// ```
743	///
744	/// Compare the above with the default DFA configuration where start states
745	/// are _not_ specialized. In this case, the start state is not tagged at
746	/// all:
747	///
748	/// ```
749	/// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
750	///
751	/// let dfa = DFA::new(r"[a-z]+")?;
752	///
753	/// let haystack = "123 foobar 4567";
754	/// let sid = dfa.start_state_forward(&Input::new(haystack))?;
755	/// // Start states are not special in the default configuration!
756	/// assert!(!dfa.is_special_state(sid));
757	/// assert!(!dfa.is_start_state(sid));
758	///
759	/// # Ok::<(), Box<dyn std::error::Error>>(())
760	/// ```
761	pub fn specialize_start_states(mut self, yes: bool) -> Config {
762	self.specialize_start_states = Some(yes);
763	self
764	}
765
766	/// Set a size limit on the total heap used by a DFA.
767	///
768	/// This size limit is expressed in bytes and is applied during
769	/// determinization of an NFA into a DFA. If the DFA's heap usage, and only
770	/// the DFA, exceeds this configured limit, then determinization is stopped
771	/// and an error is returned.
772	///
773	/// This limit does not apply to auxiliary storage used during
774	/// determinization that isn't part of the generated DFA.
775	///
776	/// This limit is only applied during determinization. Currently, there is
777	/// no way to post-pone this check to after minimization if minimization
778	/// was enabled.
779	///
780	/// The total limit on heap used during determinization is the sum of the
781	/// DFA and determinization size limits.
782	///
783	/// The default is no limit.
784	///
785	/// # Example
786	///
787	/// This example shows a DFA that fails to build because of a configured
788	/// size limit. This particular example also serves as a cautionary tale
789	/// demonstrating just how big DFAs with large Unicode character classes
790	/// can get.
791	///
792	/// ```
793	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
794	/// use regex_automata::{dfa::{dense, Automaton}, Input};
795	///
796	/// // 6MB isn't enough!
797	/// dense::Builder::new()
///     .configure(dense::Config::new().dfa_size_limit(Some(6_000_000)))
799	/// .build(r"\w{20}")
800	/// .unwrap_err();
801	///
802	/// // ... but 7MB probably is!
803	/// // (Note that DFA sizes aren't necessarily stable between releases.)
804	/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().dfa_size_limit(Some(7_000_000)))
///     .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
808	/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
809	///
810	/// # Ok::<(), Box<dyn std::error::Error>>(())
811	/// ```
812	///
813	/// While one needs a little more than 6MB to represent `\w{20}`, it
814	/// turns out that you only need a little more than 6KB to represent
815	/// `(?-u:\w{20})`. So only use Unicode if you need it!
816	///
817	/// As with [`Config::determinize_size_limit`], the size of a DFA is
818	/// influenced by other factors, such as what start state configurations
819	/// to support. For example, if you only need unanchored searches and not
820	/// anchored searches, then configuring the DFA to only support unanchored
821	/// searches can reduce its size. By default, DFAs support both unanchored
822	/// and anchored searches.
823	///
824	/// ```
825	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
826	/// use regex_automata::{dfa::{dense, Automaton, StartKind}, Input};
827	///
828	/// // 3MB isn't enough!
829	/// dense::Builder::new()
830	/// .configure(dense::Config::new()
///         .dfa_size_limit(Some(3_000_000))
832	/// .start_kind(StartKind::Unanchored)
833	/// )
834	/// .build(r"\w{20}")
835	/// .unwrap_err();
836	///
837	/// // ... but 4MB probably is!
838	/// // (Note that DFA sizes aren't necessarily stable between releases.)
839	/// let dfa = dense::Builder::new()
840	/// .configure(dense::Config::new()
///         .dfa_size_limit(Some(4_000_000))
///         .start_kind(StartKind::Unanchored)
///     )
///     .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
846	/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
847	///
848	/// # Ok::<(), Box<dyn std::error::Error>>(())
849	/// ```
850	pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
851	self.dfa_size_limit = Some(bytes);
852	self
853	}
854
855	/// Set a size limit on the total heap used by determinization.
856	///
857	/// This size limit is expressed in bytes and is applied during
858	/// determinization of an NFA into a DFA. If the heap used for auxiliary
859	/// storage during determinization (memory that is not in the DFA but
860	/// necessary for building the DFA) exceeds this configured limit, then
861	/// determinization is stopped and an error is returned.
862	///
863	/// This limit does not apply to heap used by the DFA itself.
864	///
865	/// The total limit on heap used during determinization is the sum of the
866	/// DFA and determinization size limits.
867	///
868	/// The default is no limit.
869	///
870	/// # Example
871	///
872	/// This example shows a DFA that fails to build because of a
873	/// configured size limit on the amount of heap space used by
874	/// determinization. This particular example complements the example for
875	/// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
876	/// potentially make DFAs themselves big, but it also results in more
877	/// auxiliary storage during determinization. (Although, auxiliary storage
878	/// is still not as much as the DFA itself.)
879	///
880	/// ```
881	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
882	/// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
883	/// use regex_automata::{dfa::{dense, Automaton}, Input};
884	///
885	/// // 700KB isn't enough!
886	/// dense::Builder::new()
887	/// .configure(dense::Config::new()
///         .determinize_size_limit(Some(700_000))
889	/// )
890	/// .build(r"\w{20}")
891	/// .unwrap_err();
892	///
893	/// // ... but 800KB probably is!
894	/// // (Note that auxiliary storage sizes aren't necessarily stable between
895	/// // releases.)
896	/// let dfa = dense::Builder::new()
897	/// .configure(dense::Config::new()
///         .determinize_size_limit(Some(800_000))
///     )
///     .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
902	/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
903	///
904	/// # Ok::<(), Box<dyn std::error::Error>>(())
905	/// ```
906	///
907	/// Note that some parts of the configuration on a DFA can have a
908	/// big impact on how big the DFA is, and thus, how much memory is
909	/// used. For example, the default setting for [`Config::start_kind`] is
910	/// [`StartKind::Both`]. But if you only need an anchored search, for
911	/// example, then it can be much cheaper to build a DFA that only supports
912	/// anchored searches. (Running an unanchored search with it would panic.)
913	///
914	/// ```
915	/// # if cfg!(miri) { return Ok(()); } // miri takes too long
916	/// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
917	/// use regex_automata::{
918	/// dfa::{dense, Automaton, StartKind},
919	/// Anchored, Input,
920	/// };
921	///
922	/// // 200KB isn't enough!
923	/// dense::Builder::new()
924	/// .configure(dense::Config::new()
///         .determinize_size_limit(Some(200_000))
926	/// .start_kind(StartKind::Anchored)
927	/// )
928	/// .build(r"\w{20}")
929	/// .unwrap_err();
930	///
931	/// // ... but 300KB probably is!
932	/// // (Note that auxiliary storage sizes aren't necessarily stable between
933	/// // releases.)
934	/// let dfa = dense::Builder::new()
935	/// .configure(dense::Config::new()
///         .determinize_size_limit(Some(300_000))
///         .start_kind(StartKind::Anchored)
///     )
///     .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
941	/// let input = Input::new(&haystack).anchored(Anchored::Yes);
942	/// assert!(dfa.try_search_fwd(&input)?.is_some());
943	///
944	/// # Ok::<(), Box<dyn std::error::Error>>(())
945	/// ```
946	pub fn determinize_size_limit(mut self, bytes: Option<usize>) -> Config {
947	self.determinize_size_limit = Some(bytes);
948	self
949	}
950
951	/// Returns whether this configuration has enabled simple state
952	/// acceleration.
953	pub fn get_accelerate(&self) -> bool {
954	self.accelerate.unwrap_or(`true`)
955	}
956
957	/// Returns the prefilter attached to this configuration, if any.
958	pub fn get_prefilter(&self) -> Option<&Prefilter> {
959	self.pre.as_ref().unwrap_or(&None).as_ref()
960	}
961
962	/// Returns whether this configuration has enabled the expensive process
963	/// of minimizing a DFA.
964	pub fn get_minimize(&self) -> bool {
965	self.minimize.unwrap_or(`false`)
966	}
967
968	/// Returns the match semantics set in this configuration.
969	pub fn get_match_kind(&self) -> MatchKind {
970	self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
971	}
972
973	/// Returns the starting state configuration for a DFA.
974	pub fn get_starts(&self) -> StartKind {
975	self.start_kind.unwrap_or(StartKind::Both)
976	}
977
978	/// Returns whether this configuration has enabled anchored starting states
979	/// for every pattern in the DFA.
980	pub fn get_starts_for_each_pattern(&self) -> bool {
981	self.starts_for_each_pattern.unwrap_or(`false`)
982	}
983
984	/// Returns whether this configuration has enabled byte classes or not.
985	/// This is typically a debugging oriented option, as disabling it confers
986	/// no speed benefit.
987	pub fn get_byte_classes(&self) -> bool {
988	self.byte_classes.unwrap_or(`true`)
989	}
990
991	/// Returns whether this configuration has enabled heuristic Unicode word
992	/// boundary support. When enabled, it is possible for a search to return
993	/// an error.
994	pub fn get_unicode_word_boundary(&self) -> bool {
995	self.unicode_word_boundary.unwrap_or(`false`)
996	}
997
998	/// Returns whether this configuration will instruct the DFA to enter a
999	/// quit state whenever the given byte is seen during a search. When at
1000	/// least one byte has this enabled, it is possible for a search to return
1001	/// an error.
1002	pub fn get_quit(&self, byte: u8) -> bool {
1003	self.quitset.map_or(`false`, \|q\| q.contains(byte))
1004	}
1005
1006	/// Returns whether this configuration will instruct the DFA to
1007	/// "specialize" start states. When enabled, the DFA will mark start states
1008	/// as "special" so that search routines using the DFA can detect when
1009	/// it's in a start state and do some kind of optimization (like run a
1010	/// prefilter).
1011	pub fn get_specialize_start_states(&self) -> bool {
1012	self.specialize_start_states.unwrap_or(`false`)
1013	}
1014
1015	/// Returns the DFA size limit of this configuration if one was set.
1016	/// The size limit is total number of bytes on the heap that a DFA is
1017	/// permitted to use. If the DFA exceeds this limit during construction,
1018	/// then construction is stopped and an error is returned.
1019	pub fn get_dfa_size_limit(&self) -> Option<usize> {
1020	self.dfa_size_limit.unwrap_or(None)
1021	}
1022
1023	/// Returns the determinization size limit of this configuration if one
1024	/// was set. The size limit is total number of bytes on the heap that
1025	/// determinization is permitted to use. If determinization exceeds this
1026	/// limit during construction, then construction is stopped and an error is
1027	/// returned.
1028	///
1029	/// This is different from the DFA size limit in that this only applies to
1030	/// the auxiliary storage used during determinization. Once determinization
1031	/// is complete, this memory is freed.
1032	///
1033	/// The limit on the total heap memory used is the sum of the DFA and
1034	/// determinization size limits.
1035	pub fn get_determinize_size_limit(&self) -> Option<usize> {
1036	self.determinize_size_limit.unwrap_or(None)
1037	}
1038
1039	/// Overwrite the default configuration such that the options in `o` are
1040	/// always used. If an option in `o` is not set, then the corresponding
1041	/// option in `self` is used. If it's not set in `self` either, then it
1042	/// remains not set.
1043	pub(crate) fn overwrite(&self, o: Config) -> Config {
1044	Config {
1045	accelerate: o.accelerate.or(self.accelerate),
1046	pre: o.pre.or_else(\|\| self.pre.clone()),
1047	minimize: o.minimize.or(self.minimize),
1048	match_kind: o.match_kind.or(self.match_kind),
1049	start_kind: o.start_kind.or(self.start_kind),
1050	starts_for_each_pattern: o
1051	.starts_for_each_pattern
1052	.or(self.starts_for_each_pattern),
1053	byte_classes: o.byte_classes.or(self.byte_classes),
1054	unicode_word_boundary: o
1055	.unicode_word_boundary
1056	.or(self.unicode_word_boundary),
1057	quitset: o.quitset.or(self.quitset),
1058	specialize_start_states: o
1059	.specialize_start_states
1060	.or(self.specialize_start_states),
1061	dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
1062	determinize_size_limit: o
1063	.determinize_size_limit
1064	.or(self.determinize_size_limit),
1065	}
1066	}
1067	}
1068
1069	/// A builder for constructing a deterministic finite automaton from regular
1070	/// expressions.
1071	///
1072	/// This builder provides two main things:
1073	///
1074	/// 1. It provides a few different `build` routines for actually constructing
1075	/// a DFA from different kinds of inputs. The most convenient is
1076	/// [`Builder::build`], which builds a DFA directly from a pattern string. The
1077	/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
1078	/// from an NFA.
1079	/// 2. The builder permits configuring a number of things.
1080	/// [`Builder::configure`] is used with [`Config`] to configure aspects of
1081	/// the DFA and the construction process itself. [`Builder::syntax`] and
1082	/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
1083	/// construction, respectively. The syntax and thompson configurations only
1084	/// apply when building from a pattern string.
1085	///
/// This builder always constructs a *single* DFA. As such, this builder
1087	/// can only be used to construct regexes that either detect the presence
1088	/// of a match or find the end location of a match. A single DFA cannot
1089	/// produce both the start and end of a match. For that information, use a
1090	/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
1091	/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
1092	/// use a DFA directly is if the end location of a match is enough for your use
1093	/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
1094	/// second reverse DFA is needed to find the start of a match.
1095	///
1096	/// Note that if one wants to build a sparse DFA, you must first build a dense
1097	/// DFA and convert that to a sparse DFA. There is no way to build a sparse
1098	/// DFA without first building a dense DFA.
1099	///
1100	/// # Example
1101	///
1102	/// This example shows how to build a minimized DFA that completely disables
1103	/// Unicode. That is:
1104	///
/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
///   and `\b` are ASCII-only while `.` matches any byte except for `\n`
///   (instead of any UTF-8 encoding of a Unicode scalar value except for
///   `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
/// * The pattern itself is permitted to match invalid UTF-8. For example,
///   things like `[^a]` that match any byte except for `a` are permitted.
1111	///
1112	/// ```
1113	/// use regex_automata::{
1114	/// dfa::{Automaton, dense},
1115	/// util::syntax,
1116	/// HalfMatch, Input,
1117	/// };
1118	///
1119	/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().minimize(false))
///     .syntax(syntax::Config::new().unicode(false).utf8(false))
///     .build(r"foo[^b]ar.*")?;
///
/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
/// let expected = Some(HalfMatch::must(0, 10));
1126	/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
1127	/// assert_eq!(expected, got);
1128	///
1129	/// # Ok::<(), Box<dyn std::error::Error>>(())
1130	/// ```
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug)]
pub struct Builder {
    /// The dense DFA configuration. [`Builder::configure`] merges new
    /// options into it via `Config::overwrite`.
    config: Config,
    /// The Thompson NFA compiler used to turn pattern strings into an NFA.
    /// It only exists when the `syntax` feature is enabled, and is adjusted
    /// through [`Builder::syntax`] and [`Builder::thompson`].
    #[cfg(feature = "syntax")]
    thompson: thompson::Compiler,
}
1138
#[cfg(feature = "dfa-build")]
impl Builder {
    /// Returns a dense DFA builder that starts from the default
    /// configuration.
    pub fn new() -> Builder {
        let config = Config::default();
        Builder {
            config,
            #[cfg(feature = "syntax")]
            thompson: thompson::Compiler::new(),
        }
    }

    /// Compile a single pattern string into a DFA.
    ///
    /// An error is returned if the pattern could not be parsed or compiled.
    #[cfg(feature = "syntax")]
    pub fn build(&self, pattern: &str) -> Result<OwnedDFA, BuildError> {
        let patterns = [pattern];
        self.build_many(&patterns)
    }

    /// Compile several pattern strings into one DFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    #[cfg(feature = "syntax")]
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<OwnedDFA, BuildError> {
        // DFAs do not support captures, so they are always forcefully
        // disabled here, no matter how the Thompson compiler was configured.
        let no_captures = thompson::Config::new()
            .which_captures(thompson::WhichCaptures::None);
        // Work on a copy so the builder's own compiler stays untouched.
        let mut compiler = self.thompson.clone();
        compiler.configure(no_captures);
        let nfa = compiler.build_many(patterns).map_err(BuildError::nfa)?;
        self.build_from_nfa(&nfa)
    }

    /// Determinize the given NFA into a dense DFA.
    ///
    /// # Example
    ///
    /// This example shows how to build a DFA if you already have an NFA in
    /// hand.
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, dense},
    ///     nfa::thompson::NFA,
    ///     HalfMatch, Input,
    /// };
    ///
    /// let haystack = "foo123bar".as_bytes();
    ///
    /// // This shows how to set non-default options for building an NFA.
    /// let nfa = NFA::compiler()
    ///     .configure(NFA::config().shrink(true))
    ///     .build(r"[0-9]+")?;
    /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
    /// let expected = Some(HalfMatch::must(0, 6));
    /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_from_nfa(
        &self,
        nfa: &thompson::NFA,
    ) -> Result<OwnedDFA, BuildError> {
        let config = &self.config;

        // Start from the configured quit bytes. If the Unicode word boundary
        // heuristic is enabled and the NFA actually contains a Unicode word
        // boundary, every non-ASCII byte becomes a quit byte as well.
        let mut quitset = config.quitset.unwrap_or(ByteSet::empty());
        let needs_non_ascii_quit = config.get_unicode_word_boundary()
            && nfa.look_set_any().contains_word_unicode();
        if needs_non_ascii_quit {
            for byte in 0x80..=0xFF {
                quitset.add(byte);
            }
        }

        let classes = if config.get_byte_classes() {
            let mut set = nfa.byte_class_set().clone();
            // It is important to distinguish any "quit" bytes from all other
            // bytes. Otherwise, a non-quit byte may end up in the same
            // class as a quit byte, and thus cause the DFA to stop when it
            // shouldn't.
            //
            // Test case:
            //
            //   regex-cli find match dense --unicode-word-boundary \
            //     -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
            if !quitset.is_empty() {
                set.add_set(&quitset);
            }
            set.byte_classes()
        } else {
            // DFAs will always use the equivalence class map, but disabling
            // byte classes is useful for debugging. Namely, this causes all
            // transitions to be defined over their actual bytes instead of an
            // opaque equivalence class identifier. The former is much easier
            // to grok as a human.
            ByteClasses::singletons()
        };

        let mut dfa = DFA::initial(
            classes,
            nfa.pattern_len(),
            config.get_starts(),
            nfa.look_matcher(),
            config.get_starts_for_each_pattern(),
            config.get_prefilter().cloned(),
            quitset,
            Flags::from_nfa(&nfa),
        )?;
        determinize::Config::new()
            .match_kind(config.get_match_kind())
            .quit(quitset)
            .dfa_size_limit(config.get_dfa_size_limit())
            .determinize_size_limit(config.get_determinize_size_limit())
            .run(nfa, &mut dfa)?;

        // Optional post-processing passes, in this order.
        if config.get_minimize() {
            dfa.minimize();
        }
        if config.get_accelerate() {
            dfa.accelerate();
        }
        // The state shuffling done before this point always assumes that start
        // states should be marked as "special," even though it isn't the
        // default configuration. State shuffling is complex enough as it is,
        // so it's simpler to just "fix" our special state ID ranges to not
        // include starting states after-the-fact.
        if !config.get_specialize_start_states() {
            dfa.special.set_no_special_start_states();
        }
        // Look for and set the universal starting states.
        dfa.set_universal_starts();
        Ok(dfa)
    }

    /// Merge the given dense DFA configuration options into this builder.
    ///
    /// Options set in `config` take precedence over options previously
    /// applied to this builder.
    pub fn configure(&mut self, config: Config) -> &mut Builder {
        let merged = self.config.overwrite(config);
        self.config = merged;
        self
    }

    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This controls things like case insensitivity, Unicode and multi line
    /// mode.
    ///
    /// These settings only apply when constructing a DFA directly from a
    /// pattern.
    #[cfg(feature = "syntax")]
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Builder {
        // The syntax configuration lives on the Thompson compiler.
        self.thompson.syntax(config);
        self
    }

    /// Set the Thompson NFA configuration for this builder using
    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
    ///
    /// This controls things like whether the DFA should match the regex in
    /// reverse or if additional time should be spent shrinking the size of
    /// the NFA.
    ///
    /// These settings only apply when constructing a DFA directly from a
    /// pattern.
    #[cfg(feature = "syntax")]
    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
        self.thompson.configure(config);
        self
    }
}
1318
#[cfg(feature = "dfa-build")]
impl Default for Builder {
    /// Equivalent to [`Builder::new`].
    fn default() -> Self {
        Self::new()
    }
}
1325
/// A convenience alias for an owned DFA, i.e., a DFA backed by a `Vec<u32>`.
///
/// We use this particular instantiation a lot in this crate, so it's worth
/// giving it a name. This instantiation is commonly used for mutable APIs on
/// the DFA while building it. The main reason for making DFAs generic is
/// `no_std` support, and more generally, making it possible to load a DFA
/// from an arbitrary slice of bytes.
#[cfg(feature = "alloc")]
pub(crate) type OwnedDFA = DFA<alloc::vec::Vec<u32>>;
1333
1334	/// A dense table-based deterministic finite automaton (DFA).
1335	///
1336	/// All dense DFAs have one or more start states, zero or more match states
1337	/// and a transition table that maps the current state and the current byte
1338	/// of input to the next state. A DFA can use this information to implement
1339	/// fast searching. In particular, the use of a dense DFA generally makes the
1340	/// trade off that match speed is the most valuable characteristic, even if
/// building the DFA may take significant time *and* space. (More concretely,
1342	/// building a DFA takes time and space that is exponential in the size of the
1343	/// pattern in the worst case.) As such, the processing of every byte of input
1344	/// is done with a small constant number of operations that does not vary with
1345	/// the pattern, its size or the size of the alphabet. If your needs don't line
1346	/// up with this trade off, then a dense DFA may not be an adequate solution to
1347	/// your problem.
1348	///
1349	/// In contrast, a [`sparse::DFA`] makes the opposite
1350	/// trade off: it uses less space but will execute a variable number of
1351	/// instructions per byte at match time, which makes it slower for matching.
1352	/// (Note that space usage is still exponential in the size of the pattern in
1353	/// the worst case.)
1354	///
1355	/// A DFA can be built using the default configuration via the
1356	/// [`DFA::new`] constructor. Otherwise, one can
1357	/// configure various aspects via [`dense::Builder`](Builder).
1358	///
1359	/// A single DFA fundamentally supports the following operations:
1360	///
1361	/// 1. Detection of a match.
1362	/// 2. Location of the end of a match.
1363	/// 3. In the case of a DFA with multiple patterns, which pattern matched is
1364	/// reported as well.
1365	///
1366	/// A notable absence from the above list of capabilities is the location of
/// the *start* of a match. In order to provide both the start and end of
/// a match, *two* DFAs are required. This functionality is provided by a
1369	/// [`Regex`](crate::dfa::regex::Regex).
1370	///
1371	/// # Type parameters
1372	///
1373	/// A `DFA` has one type parameter, `T`, which is used to represent state IDs,
1374	/// pattern IDs and accelerators. `T` is typically a `Vec<u32>` or a `&[u32]`.
1375	///
1376	/// # The `Automaton` trait
1377	///
1378	/// This type implements the [`Automaton`] trait, which means it can be used
1379	/// for searching. For example:
1380	///
1381	/// ```
1382	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
1383	///
1384	/// let dfa = DFA::new("foo[0-9]+")?;
/// let expected = HalfMatch::must(0, 8);
1386	/// assert_eq!(Some(expected), dfa.try_search_fwd(&Input::new("foo12345"))?);
1387	/// # Ok::<(), Box<dyn std::error::Error>>(())
1388	/// ```
#[derive(Clone)]
pub struct DFA<T> {
    /// The transition table for this DFA. This includes the transitions
    /// themselves, along with the stride, number of states and the equivalence
    /// class mapping.
    tt: TransitionTable<T>,
    /// The set of starting state identifiers for this DFA. The starting state
    /// IDs act as pointers into the transition table. The specific starting
    /// state chosen for each search is dependent on the context at which the
    /// search begins.
    st: StartTable<T>,
    /// The set of match states and the patterns that match for each
    /// corresponding match state.
    ///
    /// This structure is technically only needed because of support for
    /// multi-regexes. Namely, multi-regexes require answering not just whether
    /// a match exists, but _which_ patterns match. So we need to store the
    /// matching pattern IDs for each match state. We do this even when there
    /// is only one pattern for the sake of simplicity. In practice, this uses
    /// up very little space for the case of one pattern.
    ms: MatchStates<T>,
    /// Information about which states are "special." Special states are states
    /// that are dead, quit, matching, starting or accelerated. For more info,
    /// see the docs for `Special`.
    ///
    /// Note that `Builder::build_from_nfa` removes start states from the
    /// special ranges after construction unless start state specialization
    /// was enabled in the configuration.
    special: Special,
    /// The accelerators for this DFA.
    ///
    /// If a state is accelerated, then there exist only a small number of
    /// bytes that can cause the DFA to leave the state. This permits searching
    /// to use optimized routines to find those specific bytes instead of using
    /// the transition table.
    ///
    /// All accelerated states exist in a contiguous range in the DFA's
    /// transition table. See dfa/special.rs for more details on how states are
    /// arranged.
    accels: Accels<T>,
    /// Any prefilter attached to this DFA.
    ///
    /// When built via `Builder::build_from_nfa`, this is a clone of the
    /// prefilter found in the builder's configuration, if any.
    ///
    /// Note that currently prefilters are not serialized. When deserializing
    /// a DFA from bytes, this is always set to `None`.
    pre: Option<Prefilter>,
    /// The set of "quit" bytes for this DFA.
    ///
    /// This is only used when computing the start state for a particular
    /// position in a haystack. Namely, in the case where there is a quit
    /// byte immediately before the start of the search, this set needs to be
    /// explicitly consulted. In all other cases, quit bytes are detected by
    /// the DFA itself, by transitioning all quit bytes to a special "quit
    /// state."
    ///
    /// When built via `Builder::build_from_nfa`, this holds the configured
    /// quit bytes, plus every byte in `0x80..=0xFF` when the Unicode word
    /// boundary heuristic is enabled and the NFA contains a Unicode word
    /// boundary.
    quitset: ByteSet,
    /// Various flags describing the behavior of this DFA. When built via
    /// `Builder::build_from_nfa`, these come from `Flags::from_nfa`.
    flags: Flags,
}
1442
#[cfg(feature = "dfa-build")]
impl OwnedDFA {
    /// Build a DFA for a single regular expression using the default
    /// configuration.
    ///
    /// For anything other than the defaults, go through
    /// [`dense::Builder`](Builder) instead.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
    ///
    /// let dfa = dense::DFA::new("foo[0-9]+bar")?;
    /// let expected = Some(HalfMatch::must(0, 11));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<OwnedDFA, BuildError> {
        let builder = Builder::new();
        builder.build(pattern)
    }

    /// Build a multi-DFA for several regular expressions using the default
    /// configuration.
    ///
    /// For anything other than the defaults, go through
    /// [`dense::Builder`](Builder) instead.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
    ///
    /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let expected = Some(HalfMatch::must(1, 3));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<OwnedDFA, BuildError> {
        let builder = Builder::new();
        builder.build_many(patterns)
    }
}
1489
#[cfg(feature = "dfa-build")]
impl OwnedDFA {
    /// Build a DFA that reports a match for every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
    ///
    /// let dfa = dense::DFA::always_match()?;
    ///
    /// let expected = Some(HalfMatch::must(0, 0));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<OwnedDFA, BuildError> {
        let always = thompson::NFA::always_match();
        let builder = Builder::new();
        builder.build_from_nfa(&always)
    }

    /// Build a DFA that never reports a match, whatever the input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense}, Input};
    ///
    /// let dfa = dense::DFA::never_match()?;
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<OwnedDFA, BuildError> {
        let never = thompson::NFA::never_match();
        let builder = Builder::new();
        builder.build_from_nfa(&never)
    }

    /// Create an initial DFA from the given equivalence classes, pattern
    /// length, start configuration, prefilter, quit set and flags. An
    /// initial DFA can be further mutated via determinization.
    ///
    /// `starts_for_each_pattern` decides whether the start table is sized
    /// for per-pattern anchored starting states.
    fn initial(
        classes: ByteClasses,
        pattern_len: usize,
        starts: StartKind,
        lookm: &LookMatcher,
        starts_for_each_pattern: bool,
        pre: Option<Prefilter>,
        quitset: ByteSet,
        flags: Flags,
    ) -> Result<OwnedDFA, BuildError> {
        // The start table only learns the pattern count when per-pattern
        // start states were requested.
        let start_pattern_len = match starts_for_each_pattern {
            true => Some(pattern_len),
            false => None,
        };
        // Build the tables in the same order as the struct fields so that a
        // start table failure surfaces at the same point as before.
        let tt = TransitionTable::minimal(classes);
        let st = StartTable::dead(starts, lookm, start_pattern_len)?;
        let ms = MatchStates::empty(pattern_len);
        Ok(DFA {
            tt,
            st,
            ms,
            special: Special::new(),
            accels: Accels::empty(),
            pre,
            quitset,
            flags,
        })
    }
}
1555
#[cfg(feature = "dfa-build")]
impl DFA<&[u32]> {
    /// Return a new default dense DFA compiler configuration.
    ///
    /// This is a convenience routine to avoid needing to import the [`Config`]
    /// type when customizing the construction of a dense DFA.
    pub fn config() -> Config {
        Config::new()
    }

    /// Create a new dense DFA builder with the default configuration.
    ///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases.
    pub fn builder() -> Builder {
        Builder::new()
    }
}
1574
1575	impl<T: AsRef<[u32]>> DFA<T> {
1576	/// Cheaply return a borrowed version of this dense DFA. Specifically,
1577	/// the DFA returned always uses `&[u32]` for its transition table.
1578	pub fn as_ref(&self) -> DFA<&'_ [u32]> {
1579	DFA {
1580	tt: self.tt.as_ref(),
1581	st: self.st.as_ref(),
1582	ms: self.ms.as_ref(),
1583	special: self.special,
1584	accels: self.accels(),
1585	pre: self.pre.clone(),
1586	quitset: self.quitset,
1587	flags: self.flags,
1588	}
1589	}
1590
1591	/// Return an owned version of this sparse DFA. Specifically, the DFA
1592	/// returned always uses `Vec<u32>` for its transition table.
1593	///
1594	/// Effectively, this returns a dense DFA whose transition table lives on
1595	/// the heap.
1596	#[cfg(feature = "alloc")]
1597	pub fn to_owned(&self) -> OwnedDFA {
1598	DFA {
1599	tt: self.tt.to_owned(),
1600	st: self.st.to_owned(),
1601	ms: self.ms.to_owned(),
1602	special: self.special,
1603	accels: self.accels().to_owned(),
1604	pre: self.pre.clone(),
1605	quitset: self.quitset,
1606	flags: self.flags,
1607	}
1608	}
1609
1610	/// Returns the starting state configuration for this DFA.
1611	///
1612	/// The default is [`StartKind::Both`], which means the DFA supports both
1613	/// unanchored and anchored searches. However, this can generally lead to
1614	/// bigger DFAs. Therefore, a DFA might be compiled with support for just
1615	/// unanchored or anchored searches. In that case, running a search with
1616	/// an unsupported configuration will panic.
1617	pub fn start_kind(&self) -> StartKind {
1618	self.st.kind
1619	}
1620
1621	/// Returns the start byte map used for computing the `Start` configuration
1622	/// at the beginning of a search.
1623	pub(crate) fn start_map(&self) -> &StartByteMap {
1624	&self.st.start_map
1625	}
1626
1627	/// Returns true only if this DFA has starting states for each pattern.
1628	///
1629	/// When a DFA has starting states for each pattern, then a search with the
1630	/// DFA can be configured to only look for anchored matches of a specific
1631	/// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can
1632	/// accept a non-None `pattern_id` if and only if this method returns true.
1633	/// Otherwise, calling `try_search_fwd` will panic.
1634	///
1635	/// Note that if the DFA has no patterns, this always returns false.
1636	pub fn starts_for_each_pattern(&self) -> bool {
1637	self.st.pattern_len.is_some()
1638	}
1639
1640	/// Returns the equivalence classes that make up the alphabet for this DFA.
1641	///
1642	/// Unless [`Config::byte_classes`] was disabled, it is possible that
1643	/// multiple distinct bytes are grouped into the same equivalence class
1644	/// if it is impossible for them to discriminate between a match and a
1645	/// non-match. This has the effect of reducing the overall alphabet size
1646	/// and in turn potentially substantially reducing the size of the DFA's
1647	/// transition table.
1648	///
1649	/// The downside of using equivalence classes like this is that every state
1650	/// transition will automatically use this map to convert an arbitrary
1651	/// byte to its corresponding equivalence class. In practice this has a
1652	/// negligible impact on performance.
1653	pub fn byte_classes(&self) -> &ByteClasses {
1654	&self.tt.classes
1655	}
1656
1657	/// Returns the total number of elements in the alphabet for this DFA.
1658	///
1659	/// That is, this returns the total number of transitions that each state
1660	/// in this DFA must have. Typically, a normal byte oriented DFA would
1661	/// always have an alphabet size of 256, corresponding to the number of
1662	/// unique values in a single byte. However, this implementation has two
1663	/// peculiarities that impact the alphabet length:
1664	///
1665	/// Every state has a special "EOI" transition that is only followed*
1666	/// after the end of some haystack is reached. This EOI transition is
1667	/// necessary to account for one byte of look-ahead when implementing
1668	/// things like `\b` and `$`.
1669	/// Bytes are grouped into equivalence classes such that no two bytes in*
1670	/// the same class can distinguish a match from a non-match. For example,
1671	/// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the
1672	/// same equivalence class. This leads to a massive space savings.
1673	///
1674	/// Note though that the alphabet length does _not_ necessarily equal the
1675	/// total stride space taken up by a single DFA state in the transition
1676	/// table. Namely, for performance reasons, the stride is always the
1677	/// smallest power of two that is greater than or equal to the alphabet
1678	/// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
1679	/// often more useful. The alphabet length is typically useful only for
1680	/// informational purposes.
1681	pub fn alphabet_len(&self) -> usize {
1682	self.tt.alphabet_len()
1683	}
1684
1685	/// Returns the total stride for every state in this DFA, expressed as the
1686	/// exponent of a power of 2. The stride is the amount of space each state
1687	/// takes up in the transition table, expressed as a number of transitions.
1688	/// (Unused transitions map to dead states.)
1689	///
1690	/// The stride of a DFA is always equivalent to the smallest power of 2
1691	/// that is greater than or equal to the DFA's alphabet length. This
1692	/// definition uses extra space, but permits faster translation between
1693	/// premultiplied state identifiers and contiguous indices (by using shifts
1694	/// instead of relying on integer division).
1695	///
1696	/// For example, if the DFA's stride is 16 transitions, then its `stride2`
1697	/// is `4` since `2^4 = 16`.
1698	///
1699	/// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
1700	/// while the maximum `stride2` value is `9` (corresponding to a stride of
1701	/// `512`). The maximum is not `8` since the maximum alphabet size is `257`
1702	/// when accounting for the special EOI transition. However, an alphabet
1703	/// length of that size is exceptionally rare since the alphabet is shrunk
1704	/// into equivalence classes.
1705	pub fn stride2(&self) -> usize {
1706	self.tt.stride2
1707	}
1708
1709	/// Returns the total stride for every state in this DFA. This corresponds
1710	/// to the total number of transitions used by each state in this DFA's
1711	/// transition table.
1712	///
1713	/// Please see [`DFA::stride2`] for more information. In particular, this
1714	/// returns the stride as the number of transitions, where as `stride2`
1715	/// returns it as the exponent of a power of 2.
1716	pub fn stride(&self) -> usize {
1717	self.tt.stride()
1718	}
1719
1720	/// Returns the memory usage, in bytes, of this DFA.
1721	///
1722	/// The memory usage is computed based on the number of bytes used to
1723	/// represent this DFA.
1724	///
1725	/// This does not* include the stack size used up by this DFA. To*
1726	/// compute that, use `std::mem::size_of::<dense::DFA>()`.
1727	pub fn memory_usage(&self) -> usize {
1728	self.tt.memory_usage()
1729	+ self.st.memory_usage()
1730	+ self.ms.memory_usage()
1731	+ self.accels.memory_usage()
1732	}
1733	}
1734
1735	/// Routines for converting a dense DFA to other representations, such as
1736	/// sparse DFAs or raw bytes suitable for persistent storage.
1737	impl<T: AsRef<[u32]>> DFA<T> {
1738	/// Convert this dense DFA to a sparse DFA.
1739	///
1740	/// If a `StateID` is too small to represent all states in the sparse
1741	/// DFA, then this returns an error. In most cases, if a dense DFA is
1742	/// constructable with `StateID` then a sparse DFA will be as well.
1743	/// However, it is not guaranteed.
1744	///
1745	/// # Example
1746	///
1747	/// ```
1748	/// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
1749	///
1750	/// let dense = dense::DFA::new("foo[0-9]+")?;
1751	/// let sparse = dense.to_sparse()?;
1752	///
1753	/// let expected = Some(HalfMatch::must(`0`, `8`));
1754	/// assert_eq!(expected, sparse.try_search_fwd(&Input::new("foo12345"))?);
1755	/// # Ok::<(), Box<dyn std::error::Error>>(())
1756	/// ```
1757	#[cfg(feature = "dfa-build")]
1758	pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, BuildError> {
1759	sparse::DFA::from_dense(self)
1760	}
1761
1762	/// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
1763	/// format. Upon success, the `Vec<u8>` and the initial padding length are
1764	/// returned.
1765	///
1766	/// The written bytes are guaranteed to be deserialized correctly and
1767	/// without errors in a semver compatible release of this crate by a
1768	/// `DFA`'s deserialization APIs (assuming all other criteria for the
1769	/// deserialization APIs has been satisfied):
1770	///
1771	/// [`DFA::from_bytes`]*
1772	/// [`DFA::from_bytes_unchecked`]*
1773	///
1774	/// The padding returned is non-zero if the returned `Vec<u8>` starts at
1775	/// an address that does not have the same alignment as `u32`. The padding
1776	/// corresponds to the number of leading bytes written to the returned
1777	/// `Vec<u8>`.
1778	///
1779	/// # Example
1780	///
1781	/// This example shows how to serialize and deserialize a DFA:
1782	///
1783	/// ```
1784	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
1785	///
1786	/// // Compile our original DFA.
1787	/// let original_dfa = DFA::new("foo[0-9]+")?;
1788	///
1789	/// // N.B. We use native endianness here to make the example work, but
1790	/// // using to_bytes_little_endian would work on a little endian target.
1791	/// let (buf, _) = original_dfa.to_bytes_native_endian();
1792	/// // Even if buf has initial padding, DFA::from_bytes will automatically
1793	/// // ignore it.
1794	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
1795	///
1796	/// let expected = Some(HalfMatch::must(`0`, `8`));
1797	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
1798	/// # Ok::<(), Box<dyn std::error::Error>>(())
1799	/// ```
1800	#[cfg(feature = "dfa-build")]
1801	pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) {
1802	self.to_bytes::<wire::LE>()
1803	}
1804
1805	/// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
1806	/// format. Upon success, the `Vec<u8>` and the initial padding length are
1807	/// returned.
1808	///
1809	/// The written bytes are guaranteed to be deserialized correctly and
1810	/// without errors in a semver compatible release of this crate by a
1811	/// `DFA`'s deserialization APIs (assuming all other criteria for the
1812	/// deserialization APIs has been satisfied):
1813	///
1814	/// [`DFA::from_bytes`]*
1815	/// [`DFA::from_bytes_unchecked`]*
1816	///
1817	/// The padding returned is non-zero if the returned `Vec<u8>` starts at
1818	/// an address that does not have the same alignment as `u32`. The padding
1819	/// corresponds to the number of leading bytes written to the returned
1820	/// `Vec<u8>`.
1821	///
1822	/// # Example
1823	///
1824	/// This example shows how to serialize and deserialize a DFA:
1825	///
1826	/// ```
1827	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
1828	///
1829	/// // Compile our original DFA.
1830	/// let original_dfa = DFA::new("foo[0-9]+")?;
1831	///
1832	/// // N.B. We use native endianness here to make the example work, but
1833	/// // using to_bytes_big_endian would work on a big endian target.
1834	/// let (buf, _) = original_dfa.to_bytes_native_endian();
1835	/// // Even if buf has initial padding, DFA::from_bytes will automatically
1836	/// // ignore it.
1837	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
1838	///
1839	/// let expected = Some(HalfMatch::must(`0`, `8`));
1840	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
1841	/// # Ok::<(), Box<dyn std::error::Error>>(())
1842	/// ```
1843	#[cfg(feature = "dfa-build")]
1844	pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) {
1845	self.to_bytes::<wire::BE>()
1846	}
1847
1848	/// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
1849	/// format. Upon success, the `Vec<u8>` and the initial padding length are
1850	/// returned.
1851	///
1852	/// The written bytes are guaranteed to be deserialized correctly and
1853	/// without errors in a semver compatible release of this crate by a
1854	/// `DFA`'s deserialization APIs (assuming all other criteria for the
1855	/// deserialization APIs has been satisfied):
1856	///
1857	/// [`DFA::from_bytes`]*
1858	/// [`DFA::from_bytes_unchecked`]*
1859	///
1860	/// The padding returned is non-zero if the returned `Vec<u8>` starts at
1861	/// an address that does not have the same alignment as `u32`. The padding
1862	/// corresponds to the number of leading bytes written to the returned
1863	/// `Vec<u8>`.
1864	///
1865	/// Generally speaking, native endian format should only be used when
1866	/// you know that the target you're compiling the DFA for matches the
1867	/// endianness of the target on which you're compiling DFA. For example,
1868	/// if serialization and deserialization happen in the same process or on
1869	/// the same machine. Otherwise, when serializing a DFA for use in a
1870	/// portable environment, you'll almost certainly want to serialize _both_
1871	/// a little endian and a big endian version and then load the correct one
1872	/// based on the target's configuration.
1873	///
1874	/// # Example
1875	///
1876	/// This example shows how to serialize and deserialize a DFA:
1877	///
1878	/// ```
1879	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
1880	///
1881	/// // Compile our original DFA.
1882	/// let original_dfa = DFA::new("foo[0-9]+")?;
1883	///
1884	/// let (buf, _) = original_dfa.to_bytes_native_endian();
1885	/// // Even if buf has initial padding, DFA::from_bytes will automatically
1886	/// // ignore it.
1887	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
1888	///
1889	/// let expected = Some(HalfMatch::must(`0`, `8`));
1890	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
1891	/// # Ok::<(), Box<dyn std::error::Error>>(())
1892	/// ```
1893	#[cfg(feature = "dfa-build")]
1894	pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) {
1895	self.to_bytes::<wire::NE>()
1896	}
1897
1898	/// The implementation of the public `to_bytes` serialization methods,
1899	/// which is generic over endianness.
1900	#[cfg(feature = "dfa-build")]
1901	fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) {
1902	let len = self.write_to_len();
1903	let (mut buf, padding) = wire::alloc_aligned_buffer::<u32>(len);
1904	// This should always succeed since the only possible serialization
1905	// error is providing a buffer that's too small, but we've ensured that
1906	// `buf` is big enough here.
1907	self.as_ref().write_to::<E>(&mut buf[padding..]).unwrap();
1908	(buf, padding)
1909	}
1910
1911	/// Serialize this DFA as raw bytes to the given slice, in little endian
1912	/// format. Upon success, the total number of bytes written to `dst` is
1913	/// returned.
1914	///
1915	/// The written bytes are guaranteed to be deserialized correctly and
1916	/// without errors in a semver compatible release of this crate by a
1917	/// `DFA`'s deserialization APIs (assuming all other criteria for the
1918	/// deserialization APIs has been satisfied):
1919	///
1920	/// [`DFA::from_bytes`]*
1921	/// [`DFA::from_bytes_unchecked`]*
1922	///
1923	/// Note that unlike the various `to_byte_` routines, this does not write*
1924	/// any padding. Callers are responsible for handling alignment correctly.
1925	///
1926	/// # Errors
1927	///
1928	/// This returns an error if the given destination slice is not big enough
1929	/// to contain the full serialized DFA. If an error occurs, then nothing
1930	/// is written to `dst`.
1931	///
1932	/// # Example
1933	///
1934	/// This example shows how to serialize and deserialize a DFA without
1935	/// dynamic memory allocation.
1936	///
1937	/// ```
1938	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
1939	///
1940	/// // Compile our original DFA.
1941	/// let original_dfa = DFA::new("foo[0-9]+")?;
1942	///
1943	/// // Create a 4KB buffer on the stack to store our serialized DFA. We
1944	/// // need to use a special type to force the alignment of our [u8; N]
1945	/// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
1946	/// // the DFA may fail because of an alignment mismatch.
1947	/// #[repr(C)]
1948	/// struct Aligned<B: ?Sized> {
1949	/// _align: [u32; `0`],
1950	/// bytes: B,
1951	/// }
1952	/// let mut buf = Aligned { _align: [], bytes: [`0u8`; `4` * (`1`<<`10`)] };
1953	/// // N.B. We use native endianness here to make the example work, but
1954	/// // using write_to_little_endian would work on a little endian target.
1955	/// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
1956	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
1957	///
1958	/// let expected = Some(HalfMatch::must(`0`, `8`));
1959	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
1960	/// # Ok::<(), Box<dyn std::error::Error>>(())
1961	/// ```
1962	pub fn write_to_little_endian(
1963	&self,
1964	dst: &mut [u8],
1965	) -> Result<usize, SerializeError> {
1966	self.as_ref().write_to::<wire::LE>(dst)
1967	}
1968
1969	/// Serialize this DFA as raw bytes to the given slice, in big endian
1970	/// format. Upon success, the total number of bytes written to `dst` is
1971	/// returned.
1972	///
1973	/// The written bytes are guaranteed to be deserialized correctly and
1974	/// without errors in a semver compatible release of this crate by a
1975	/// `DFA`'s deserialization APIs (assuming all other criteria for the
1976	/// deserialization APIs has been satisfied):
1977	///
1978	/// [`DFA::from_bytes`]*
1979	/// [`DFA::from_bytes_unchecked`]*
1980	///
1981	/// Note that unlike the various `to_byte_` routines, this does not write*
1982	/// any padding. Callers are responsible for handling alignment correctly.
1983	///
1984	/// # Errors
1985	///
1986	/// This returns an error if the given destination slice is not big enough
1987	/// to contain the full serialized DFA. If an error occurs, then nothing
1988	/// is written to `dst`.
1989	///
1990	/// # Example
1991	///
1992	/// This example shows how to serialize and deserialize a DFA without
1993	/// dynamic memory allocation.
1994	///
1995	/// ```
1996	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
1997	///
1998	/// // Compile our original DFA.
1999	/// let original_dfa = DFA::new("foo[0-9]+")?;
2000	///
2001	/// // Create a 4KB buffer on the stack to store our serialized DFA. We
2002	/// // need to use a special type to force the alignment of our [u8; N]
2003	/// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
2004	/// // the DFA may fail because of an alignment mismatch.
2005	/// #[repr(C)]
2006	/// struct Aligned<B: ?Sized> {
2007	/// _align: [u32; `0`],
2008	/// bytes: B,
2009	/// }
2010	/// let mut buf = Aligned { _align: [], bytes: [`0u8`; `4` * (`1`<<`10`)] };
2011	/// // N.B. We use native endianness here to make the example work, but
2012	/// // using write_to_big_endian would work on a big endian target.
2013	/// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
2014	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
2015	///
2016	/// let expected = Some(HalfMatch::must(`0`, `8`));
2017	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
2018	/// # Ok::<(), Box<dyn std::error::Error>>(())
2019	/// ```
2020	pub fn write_to_big_endian(
2021	&self,
2022	dst: &mut [u8],
2023	) -> Result<usize, SerializeError> {
2024	self.as_ref().write_to::<wire::BE>(dst)
2025	}
2026
2027	/// Serialize this DFA as raw bytes to the given slice, in native endian
2028	/// format. Upon success, the total number of bytes written to `dst` is
2029	/// returned.
2030	///
2031	/// The written bytes are guaranteed to be deserialized correctly and
2032	/// without errors in a semver compatible release of this crate by a
2033	/// `DFA`'s deserialization APIs (assuming all other criteria for the
2034	/// deserialization APIs has been satisfied):
2035	///
2036	/// [`DFA::from_bytes`]*
2037	/// [`DFA::from_bytes_unchecked`]*
2038	///
2039	/// Generally speaking, native endian format should only be used when
2040	/// you know that the target you're compiling the DFA for matches the
2041	/// endianness of the target on which you're compiling DFA. For example,
2042	/// if serialization and deserialization happen in the same process or on
2043	/// the same machine. Otherwise, when serializing a DFA for use in a
2044	/// portable environment, you'll almost certainly want to serialize _both_
2045	/// a little endian and a big endian version and then load the correct one
2046	/// based on the target's configuration.
2047	///
2048	/// Note that unlike the various `to_byte_` routines, this does not write*
2049	/// any padding. Callers are responsible for handling alignment correctly.
2050	///
2051	/// # Errors
2052	///
2053	/// This returns an error if the given destination slice is not big enough
2054	/// to contain the full serialized DFA. If an error occurs, then nothing
2055	/// is written to `dst`.
2056	///
2057	/// # Example
2058	///
2059	/// This example shows how to serialize and deserialize a DFA without
2060	/// dynamic memory allocation.
2061	///
2062	/// ```
2063	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
2064	///
2065	/// // Compile our original DFA.
2066	/// let original_dfa = DFA::new("foo[0-9]+")?;
2067	///
2068	/// // Create a 4KB buffer on the stack to store our serialized DFA. We
2069	/// // need to use a special type to force the alignment of our [u8; N]
2070	/// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
2071	/// // the DFA may fail because of an alignment mismatch.
2072	/// #[repr(C)]
2073	/// struct Aligned<B: ?Sized> {
2074	/// _align: [u32; `0`],
2075	/// bytes: B,
2076	/// }
2077	/// let mut buf = Aligned { _align: [], bytes: [`0u8`; `4` * (`1`<<`10`)] };
2078	/// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
2079	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
2080	///
2081	/// let expected = Some(HalfMatch::must(`0`, `8`));
2082	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
2083	/// # Ok::<(), Box<dyn std::error::Error>>(())
2084	/// ```
2085	pub fn write_to_native_endian(
2086	&self,
2087	dst: &mut [u8],
2088	) -> Result<usize, SerializeError> {
2089	self.as_ref().write_to::<wire::NE>(dst)
2090	}
2091
2092	/// Return the total number of bytes required to serialize this DFA.
2093	///
2094	/// This is useful for determining the size of the buffer required to pass
2095	/// to one of the serialization routines:
2096	///
2097	/// [`DFA::write_to_little_endian`]*
2098	/// [`DFA::write_to_big_endian`]*
2099	/// [`DFA::write_to_native_endian`]*
2100	///
2101	/// Passing a buffer smaller than the size returned by this method will
2102	/// result in a serialization error. Serialization routines are guaranteed
2103	/// to succeed when the buffer is big enough.
2104	///
2105	/// # Example
2106	///
2107	/// This example shows how to dynamically allocate enough room to serialize
2108	/// a DFA.
2109	///
2110	/// ```
2111	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
2112	///
2113	/// let original_dfa = DFA::new("foo[0-9]+")?;
2114	///
2115	/// let mut buf = vec![`0`; original_dfa.write_to_len()];
2116	/// // This is guaranteed to succeed, because the only serialization error
2117	/// // that can occur is when the provided buffer is too small. But
2118	/// // write_to_len guarantees a correct size.
2119	/// let written = original_dfa.write_to_native_endian(&mut buf).unwrap();
2120	/// // But this is not guaranteed to succeed! In particular,
2121	/// // deserialization requires proper alignment for &[u32], but our buffer
2122	/// // was allocated as a &[u8] whose required alignment is smaller than
2123	/// // &[u32]. However, it's likely to work in practice because of how most
2124	/// // allocators work. So if you write code like this, make sure to either
2125	/// // handle the error correctly and/or run it under Miri since Miri will
2126	/// // likely provoke the error by returning Vec<u8> buffers with alignment
2127	/// // less than &[u32].
2128	/// let dfa: DFA<&[u32]> = match DFA::from_bytes(&buf[..written]) {
2129	/// // As mentioned above, it is legal for an error to be returned
2130	/// // here. It is quite difficult to get a Vec<u8> with a guaranteed
2131	/// // alignment equivalent to Vec<u32>.
2132	/// Err(_) => return Ok(()),
2133	/// Ok((dfa, _)) => dfa,
2134	/// };
2135	///
2136	/// let expected = Some(HalfMatch::must(`0`, `8`));
2137	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
2138	/// # Ok::<(), Box<dyn std::error::Error>>(())
2139	/// ```
2140	///
2141	/// Note that this example isn't actually guaranteed to work! In
2142	/// particular, if `buf` is not aligned to a 4-byte boundary, then the
2143	/// `DFA::from_bytes` call will fail. If you need this to work, then you
2144	/// either need to deal with adding some initial padding yourself, or use
2145	/// one of the `to_bytes` methods, which will do it for you.
2146	pub fn write_to_len(&self) -> usize {
2147	wire::write_label_len(LABEL)
2148	+ wire::write_endianness_check_len()
2149	+ wire::write_version_len()
2150	+ size_of::<u32>() // unused, intended for future flexibility
2151	+ self.flags.write_to_len()
2152	+ self.tt.write_to_len()
2153	+ self.st.write_to_len()
2154	+ self.ms.write_to_len()
2155	+ self.special.write_to_len()
2156	+ self.accels.write_to_len()
2157	+ self.quitset.write_to_len()
2158	}
2159	}
2160
2161	impl<'a> DFA<&'a [u32]> {
2162	/// Safely deserialize a DFA with a specific state identifier
2163	/// representation. Upon success, this returns both the deserialized DFA
2164	/// and the number of bytes read from the given slice. Namely, the contents
2165	/// of the slice beyond the DFA are not read.
2166	///
2167	/// Deserializing a DFA using this routine will never allocate heap memory.
2168	/// For safety purposes, the DFA's transition table will be verified such
2169	/// that every transition points to a valid state. If this verification is
2170	/// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
2171	/// will always execute in constant time.
2172	///
2173	/// The bytes given must be generated by one of the serialization APIs
2174	/// of a `DFA` using a semver compatible release of this crate. Those
2175	/// include:
2176	///
2177	/// [`DFA::to_bytes_little_endian`]*
2178	/// [`DFA::to_bytes_big_endian`]*
2179	/// [`DFA::to_bytes_native_endian`]*
2180	/// [`DFA::write_to_little_endian`]*
2181	/// [`DFA::write_to_big_endian`]*
2182	/// [`DFA::write_to_native_endian`]*
2183	///
2184	/// The `to_bytes` methods allocate and return a `Vec<u8>` for you, along
2185	/// with handling alignment correctly. The `write_to` methods do not
2186	/// allocate and write to an existing slice (which may be on the stack).
2187	/// Since deserialization always uses the native endianness of the target
2188	/// platform, the serialization API you use should match the endianness of
2189	/// the target platform. (It's often a good idea to generate serialized
2190	/// DFAs for both forms of endianness and then load the correct one based
2191	/// on endianness.)
2192	///
2193	/// # Errors
2194	///
2195	/// Generally speaking, it's easier to state the conditions in which an
2196	/// error is _not_ returned. All of the following must be true:
2197	///
2198	/// The bytes given must be produced by one of the serialization APIs*
2199	/// on this DFA, as mentioned above.
2200	/// The endianness of the target platform matches the endianness used to*
2201	/// serialized the provided DFA.
2202	/// The slice given must have the same alignment as `u32`.*
2203	///
2204	/// If any of the above are not true, then an error will be returned.
2205	///
2206	/// # Panics
2207	///
2208	/// This routine will never panic for any input.
2209	///
2210	/// # Example
2211	///
2212	/// This example shows how to serialize a DFA to raw bytes, deserialize it
2213	/// and then use it for searching.
2214	///
2215	/// ```
2216	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
2217	///
2218	/// let initial = DFA::new("foo[0-9]+")?;
2219	/// let (bytes, _) = initial.to_bytes_native_endian();
2220	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0;
2221	///
2222	/// let expected = Some(HalfMatch::must(`0`, `8`));
2223	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
2224	/// # Ok::<(), Box<dyn std::error::Error>>(())
2225	/// ```
2226	///
2227	/// # Example: dealing with alignment and padding
2228	///
2229	/// In the above example, we used the `to_bytes_native_endian` method to
2230	/// serialize a DFA, but we ignored part of its return value corresponding
2231	/// to padding added to the beginning of the serialized DFA. This is OK
2232	/// because deserialization will skip this initial padding. What matters
2233	/// is that the address immediately following the padding has an alignment
2234	/// that matches `u32`. That is, the following is an equivalent but
2235	/// alternative way to write the above example:
2236	///
2237	/// ```
2238	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
2239	///
2240	/// let initial = DFA::new("foo[0-9]+")?;
2241	/// // Serialization returns the number of leading padding bytes added to
2242	/// // the returned Vec<u8>.
2243	/// let (bytes, pad) = initial.to_bytes_native_endian();
2244	/// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0;
2245	///
/// let expected = Some(HalfMatch::must(0, 8));
2247	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
2248	/// # Ok::<(), Box<dyn std::error::Error>>(())
2249	/// ```
2250	///
2251	/// This padding is necessary because Rust's standard library does
2252	/// not expose any safe and robust way of creating a `Vec<u8>` with a
2253	/// guaranteed alignment other than 1. Now, in practice, the underlying
2254	/// allocator is likely to provide a `Vec<u8>` that meets our alignment
2255	/// requirements, which means `pad` is zero in practice most of the time.
2256	///
2257	/// The purpose of exposing the padding like this is flexibility for the
2258	/// caller. For example, if one wants to embed a serialized DFA into a
2259	/// compiled program, then it's important to guarantee that it starts at a
2260	/// `u32`-aligned address. The simplest way to do this is to discard the
2261	/// padding bytes and set it up so that the serialized DFA itself begins at
2262	/// a properly aligned address. We can show this in two parts. The first
2263	/// part is serializing the DFA to a file:
2264	///
2265	/// ```no_run
2266	/// use regex_automata::dfa::dense::DFA;
2267	///
2268	/// let dfa = DFA::new("foo[0-9]+")?;
2269	///
2270	/// let (bytes, pad) = dfa.to_bytes_big_endian();
/// // Write the contents of the DFA *without* the initial padding.
2272	/// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?;
2273	///
2274	/// // Do it again, but this time for little endian.
2275	/// let (bytes, pad) = dfa.to_bytes_little_endian();
2276	/// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?;
2277	/// # Ok::<(), Box<dyn std::error::Error>>(())
2278	/// ```
2279	///
2280	/// And now the second part is embedding the DFA into the compiled program
2281	/// and deserializing it at runtime on first use. We use conditional
2282	/// compilation to choose the correct endianness.
2283	///
2284	/// ```no_run
2285	/// use regex_automata::{
2286	/// dfa::{Automaton, dense::DFA},
2287	/// util::{lazy::Lazy, wire::AlignAs},
2288	/// HalfMatch, Input,
2289	/// };
2290	///
/// // This crate provides its own "lazy" type, kind of like
/// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
/// // no-std environments and lets us write this using completely
/// // safe code.
/// static RE: Lazy<DFA<&'static [u32]>> = Lazy::new(|| {
2296	/// # const _: &str = stringify! {
/// // This assignment is made possible (implicitly) via the
/// // CoerceUnsized trait. This is what guarantees that our
/// // bytes are stored in memory on a 4 byte boundary. You
/// // *must* do this or something equivalent for correct
/// // deserialization.
2302	/// static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
2303	/// _align: [],
2304	/// #[cfg(target_endian = "big")]
2305	/// bytes: *include_bytes!("foo.bigendian.dfa"),
2306	/// #[cfg(target_endian = "little")]
2307	/// bytes: *include_bytes!("foo.littleendian.dfa"),
2308	/// };
2309	/// # };
2310	/// # static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
2311	/// # _align: [],
2312	/// # bytes: [],
2313	/// # };
2314	///
2315	/// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
2316	/// .expect("serialized DFA should be valid");
2317	/// dfa
2318	/// });
2319	///
/// let expected = Ok(Some(HalfMatch::must(0, 8)));
2321	/// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
2322	/// ```
2323	///
2324	/// An alternative to [`util::lazy::Lazy`](crate::util::lazy::Lazy)
2325	/// is [`lazy_static`](https://crates.io/crates/lazy_static) or
2326	/// [`once_cell`](https://crates.io/crates/once_cell), which provide
2327	/// stronger guarantees (like the initialization function only being
2328	/// executed once). And `once_cell` in particular provides a more
2329	/// expressive API. But a `Lazy` value from this crate is likely just fine
2330	/// in most circumstances.
2331	///
2332	/// Note that regardless of which initialization method you use, you
2333	/// will still need to use the [`AlignAs`](crate::util::wire::AlignAs)
2334	/// trick above to force correct alignment, but this is safe to do and
2335	/// `from_bytes` will return an error if you get it wrong.
2336	pub fn from_bytes(
2337	slice: &'a [u8],
2338	) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
2339	// SAFETY: This is safe because we validate the transition table, start
2340	// table, match states and accelerators below. If any validation fails,
2341	// then we return an error.
2342	let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
2343	dfa.tt.validate(&dfa.special)?;
2344	dfa.st.validate(&dfa.tt)?;
2345	dfa.ms.validate(&dfa)?;
2346	dfa.accels.validate()?;
2347	// N.B. dfa.special doesn't have a way to do unchecked deserialization,
2348	// so it has already been validated.
2349	for state in dfa.states() {
2350	// If the state is an accel state, then it must have a non-empty
2351	// accelerator.
2352	if dfa.is_accel_state(state.id()) {
2353	let index = dfa.accelerator_index(state.id());
2354	if index >= dfa.accels.len() {
2355	return Err(DeserializeError::generic(
2356	"found DFA state with invalid accelerator index",
2357	));
2358	}
2359	let needles = dfa.accels.needles(index);
2360	if !(`1` <= needles.len() && needles.len() <= `3`) {
2361	return Err(DeserializeError::generic(
2362	"accelerator needles has invalid length",
2363	));
2364	}
2365	}
2366	}
2367	Ok((dfa, nread))
2368	}
2369
2370	/// Deserialize a DFA with a specific state identifier representation in
2371	/// constant time by omitting the verification of the validity of the
2372	/// transition table and other data inside the DFA.
2373	///
2374	/// This is just like [`DFA::from_bytes`], except it can potentially return
2375	/// a DFA that exhibits undefined behavior if its transition table contains
2376	/// invalid state identifiers.
2377	///
2378	/// This routine is useful if you need to deserialize a DFA cheaply
2379	/// and cannot afford the transition table validation performed by
2380	/// `from_bytes`.
2381	///
2382	/// # Example
2383	///
2384	/// ```
2385	/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
2386	///
2387	/// let initial = DFA::new("foo[0-9]+")?;
2388	/// let (bytes, _) = initial.to_bytes_native_endian();
2389	/// // SAFETY: This is guaranteed to be safe since the bytes given come
2390	/// // directly from a compatible serialization routine.
2391	/// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
2392	///
2393	/// let expected = Some(HalfMatch::must(`0`, `8`));
2394	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
2395	/// # Ok::<(), Box<dyn std::error::Error>>(())
2396	/// ```
2397	pub unsafe fn from_bytes_unchecked(
2398	slice: &'a [u8],
2399	) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
2400	let mut nr = `0`;
2401
2402	nr += wire::skip_initial_padding(slice);
2403	wire::check_alignment::<StateID>(&slice[nr..])?;
2404	nr += wire::read_label(&slice[nr..], LABEL)?;
2405	nr += wire::read_endianness_check(&slice[nr..])?;
2406	nr += wire::read_version(&slice[nr..], VERSION)?;
2407
2408	let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
2409	nr += size_of::<u32>();
2410
2411	let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
2412	nr += nread;
2413
2414	let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?;
2415	nr += nread;
2416
2417	let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
2418	nr += nread;
2419
2420	let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?;
2421	nr += nread;
2422
2423	let (special, nread) = Special::from_bytes(&slice[nr..])?;
2424	nr += nread;
2425	special.validate_state_len(tt.len(), tt.stride2)?;
2426
2427	let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?;
2428	nr += nread;
2429
2430	let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
2431	nr += nread;
2432
2433	// Prefilters don't support serialization, so they're always absent.
2434	let pre = None;
2435	Ok((DFA { tt, st, ms, special, accels, pre, quitset, flags }, nr))
2436	}
2437
2438	/// The implementation of the public `write_to` serialization methods,
2439	/// which is generic over endianness.
2440	///
2441	/// This is defined only for &[u32] to reduce binary size/compilation time.
2442	fn write_to<E: Endian>(
2443	&self,
2444	mut dst: &mut [u8],
2445	) -> Result<usize, SerializeError> {
2446	let nwrite = self.write_to_len();
2447	if dst.len() < nwrite {
2448	return Err(SerializeError::buffer_too_small("dense DFA"));
2449	}
2450	dst = &mut dst[..nwrite];
2451
2452	let mut nw = `0`;
2453	nw += wire::write_label(LABEL, &mut dst[nw..])?;
2454	nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
2455	nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
2456	nw += {
2457	// Currently unused, intended for future flexibility
2458	E::write_u32(`0`, &mut dst[nw..]);
2459	size_of::<u32>()
2460	};
2461	nw += self.flags.write_to::<E>(&mut dst[nw..])?;
2462	nw += self.tt.write_to::<E>(&mut dst[nw..])?;
2463	nw += self.st.write_to::<E>(&mut dst[nw..])?;
2464	nw += self.ms.write_to::<E>(&mut dst[nw..])?;
2465	nw += self.special.write_to::<E>(&mut dst[nw..])?;
2466	nw += self.accels.write_to::<E>(&mut dst[nw..])?;
2467	nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
2468	Ok(nw)
2469	}
2470	}
2471
2472	// The following methods implement mutable routines on the internal
2473	// representation of a DFA. As such, we must fix the first type parameter to a
2474	// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We
2475	// can get away with this because these methods are internal to the crate and
2476	// are exclusively used during construction of the DFA.
2477	#[cfg(feature = "dfa-build")]
2478	impl OwnedDFA {
2479	/// Add a start state of this DFA.
2480	pub(crate) fn set_start_state(
2481	&mut self,
2482	anchored: Anchored,
2483	start: Start,
2484	id: StateID,
2485	) {
2486	assert!(self.tt.is_valid(id), "invalid start state");
2487	self.st.set_start(anchored, start, id);
2488	}
2489
2490	/// Set the given transition to this DFA. Both the `from` and `to` states
2491	/// must already exist.
2492	pub(crate) fn set_transition(
2493	&mut self,
2494	from: StateID,
2495	byte: alphabet::Unit,
2496	to: StateID,
2497	) {
2498	self.tt.set(from, byte, to);
2499	}
2500
2501	/// An an empty state (a state where all transitions lead to a dead state)
2502	/// and return its identifier. The identifier returned is guaranteed to
2503	/// not point to any other existing state.
2504	///
2505	/// If adding a state would exceed `StateID::LIMIT`, then this returns an
2506	/// error.
2507	pub(crate) fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
2508	self.tt.add_empty_state()
2509	}
2510
2511	/// Swap the two states given in the transition table.
2512	///
2513	/// This routine does not do anything to check the correctness of this
2514	/// swap. Callers must ensure that other states pointing to id1 and id2 are
2515	/// updated appropriately.
2516	pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
2517	self.tt.swap(id1, id2);
2518	}
2519
2520	/// Remap all of the state identifiers in this DFA according to the map
2521	/// function given. This includes all transitions and all starting state
2522	/// identifiers.
2523	pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
2524	// We could loop over each state ID and call 'remap_state' here, but
2525	// this is more direct: just map every transition directly. This
2526	// technically might do a little extra work since the alphabet length
2527	// is likely less than the stride, but if that is indeed an issue we
2528	// should benchmark it and fix it.
2529	for sid in self.tt.table_mut().iter_mut() {
2530	sid = map(sid);
2531	}
2532	for sid in self.st.table_mut().iter_mut() {
2533	sid = map(sid);
2534	}
2535	}
2536
2537	/// Remap the transitions for the state given according to the function
2538	/// given. This applies the given map function to every transition in the
2539	/// given state and changes the transition in place to the result of the
2540	/// map function for that transition.
2541	pub(crate) fn remap_state(
2542	&mut self,
2543	id: StateID,
2544	map: impl Fn(StateID) -> StateID,
2545	) {
2546	self.tt.remap(id, map);
2547	}
2548
2549	/// Truncate the states in this DFA to the given length.
2550	///
2551	/// This routine does not do anything to check the correctness of this
2552	/// truncation. Callers must ensure that other states pointing to truncated
2553	/// states are updated appropriately.
2554	pub(crate) fn truncate_states(&mut self, len: usize) {
2555	self.tt.truncate(len);
2556	}
2557
/// Minimize this DFA in place using Hopcroft's algorithm.
///
/// The work is delegated entirely to a `Minimizer` constructed over this
/// DFA.
pub(crate) fn minimize(&mut self) {
    Minimizer::new(self).run();
}
2562
2563	/// Updates the match state pattern ID map to use the one provided.
2564	///
2565	/// This is useful when it's convenient to manipulate matching states
2566	/// (and their corresponding pattern IDs) as a map. In particular, the
2567	/// representation used by a DFA for this map is not amenable to mutation,
2568	/// so if things need to be changed (like when shuffling states), it's
2569	/// often easier to work with the map form.
2570	pub(crate) fn set_pattern_map(
2571	&mut self,
2572	map: &BTreeMap<StateID, Vec<PatternID>>,
2573	) -> Result<(), BuildError> {
2574	self.ms = self.ms.new_with_map(map)?;
2575	Ok(())
2576	}
2577
2578	/// Find states that have a small number of non-loop transitions and mark
2579	/// them as candidates for acceleration during search.
2580	pub(crate) fn accelerate(&mut self) {
2581	// dead and quit states can never be accelerated.
2582	if self.state_len() <= `2` {
2583	return;
2584	}
2585
2586	// Go through every state and record their accelerator, if possible.
2587	let mut accels = BTreeMap::new();
2588	// Count the number of accelerated match, start and non-match/start
2589	// states.
2590	let (mut cmatch, mut cstart, mut cnormal) = (`0`, `0`, `0`);
2591	for state in self.states() {
2592	if let Some(accel) = state.accelerate(self.byte_classes()) {
2593	debug!(
2594	"accelerating full DFA state {}: {:?}",
2595	state.id().as_usize(),
2596	accel,
2597	);
2598	accels.insert(state.id(), accel);
2599	if self.is_match_state(state.id()) {
2600	cmatch += `1`;
2601	} else if self.is_start_state(state.id()) {
2602	cstart += `1`;
2603	} else {
2604	assert!(!self.is_dead_state(state.id()));
2605	assert!(!self.is_quit_state(state.id()));
2606	cnormal += `1`;
2607	}
2608	}
2609	}
2610	// If no states were able to be accelerated, then we're done.
2611	if accels.is_empty() {
2612	return;
2613	}
2614	let original_accels_len = accels.len();
2615
2616	// A remapper keeps track of state ID changes. Once we're done
2617	// shuffling, the remapper is used to rewrite all transitions in the
2618	// DFA based on the new positions of states.
2619	let mut remapper = Remapper::new(self);
2620
2621	// As we swap states, if they are match states, we need to swap their
2622	// pattern ID lists too (for multi-regexes). We do this by converting
2623	// the lists to an easily swappable map, and then convert back to
2624	// MatchStates once we're done.
2625	let mut new_matches = self.ms.to_map(self);
2626
2627	// There is at least one state that gets accelerated, so these are
2628	// guaranteed to get set to sensible values below.
2629	self.special.min_accel = StateID::MAX;
2630	self.special.max_accel = StateID::ZERO;
2631	let update_special_accel =
2632	\|special: &mut Special, accel_id: StateID\| {
2633	special.min_accel = cmp::min(special.min_accel, accel_id);
2634	special.max_accel = cmp::max(special.max_accel, accel_id);
2635	};
2636
2637	// Start by shuffling match states. Any match states that are
2638	// accelerated get moved to the end of the match state range.
2639	if cmatch > `0` && self.special.matches() {
2640	// N.B. special.{min,max}_match do not need updating, since the
2641	// range/number of match states does not change. Only the ordering
2642	// of match states may change.
2643	let mut next_id = self.special.max_match;
2644	let mut cur_id = next_id;
2645	while cur_id >= self.special.min_match {
2646	if let Some(accel) = accels.remove(&cur_id) {
2647	accels.insert(next_id, accel);
2648	update_special_accel(&mut self.special, next_id);
2649
2650	// No need to do any actual swapping for equivalent IDs.
2651	if cur_id != next_id {
2652	remapper.swap(self, cur_id, next_id);
2653
2654	// Swap pattern IDs for match states.
2655	let cur_pids = new_matches.remove(&cur_id).unwrap();
2656	let next_pids = new_matches.remove(&next_id).unwrap();
2657	new_matches.insert(cur_id, next_pids);
2658	new_matches.insert(next_id, cur_pids);
2659	}
2660	next_id = self.tt.prev_state_id(next_id);
2661	}
2662	cur_id = self.tt.prev_state_id(cur_id);
2663	}
2664	}
2665
2666	// This is where it gets tricky. Without acceleration, start states
2667	// normally come right after match states. But we want accelerated
2668	// states to be a single contiguous range (to make it very fast
2669	// to determine whether a state is* accelerated), while also keeping*
2670	// match and starting states as contiguous ranges for the same reason.
2671	// So what we do here is shuffle states such that it looks like this:
2672	//
2673	// DQMMMMAAAAASSSSSSNNNNNNN
2674	// \| \|
2675	// \|---------\|
2676	// accelerated states
2677	//
2678	// Where:
2679	// D - dead state
2680	// Q - quit state
2681	// M - match state (may be accelerated)
2682	// A - normal state that is accelerated
2683	// S - start state (may be accelerated)
2684	// N - normal state that is NOT accelerated
2685	//
2686	// We implement this by shuffling states, which is done by a sequence
2687	// of pairwise swaps. We start by looking at all normal states to be
2688	// accelerated. When we find one, we swap it with the earliest starting
2689	// state, and then swap that with the earliest normal state. This
2690	// preserves the contiguous property.
2691	//
2692	// Once we're done looking for accelerated normal states, now we look
2693	// for accelerated starting states by moving them to the beginning
2694	// of the starting state range (just like we moved accelerated match
2695	// states to the end of the matching state range).
2696	//
2697	// For a more detailed/different perspective on this, see the docs
2698	// in dfa/special.rs.
2699	if cnormal > `0` {
2700	// our next available starting and normal states for swapping.
2701	let mut next_start_id = self.special.min_start;
2702	let mut cur_id = self.to_state_id(self.state_len() - `1`);
2703	// This is guaranteed to exist since cnormal > 0.
2704	let mut next_norm_id =
2705	self.tt.next_state_id(self.special.max_start);
2706	while cur_id >= next_norm_id {
2707	if let Some(accel) = accels.remove(&cur_id) {
2708	remapper.swap(self, next_start_id, cur_id);
2709	remapper.swap(self, next_norm_id, cur_id);
2710	// Keep our accelerator map updated with new IDs if the
2711	// states we swapped were also accelerated.
2712	if let Some(accel2) = accels.remove(&next_norm_id) {
2713	accels.insert(cur_id, accel2);
2714	}
2715	if let Some(accel2) = accels.remove(&next_start_id) {
2716	accels.insert(next_norm_id, accel2);
2717	}
2718	accels.insert(next_start_id, accel);
2719	update_special_accel(&mut self.special, next_start_id);
2720	// Our start range shifts one to the right now.
2721	self.special.min_start =
2722	self.tt.next_state_id(self.special.min_start);
2723	self.special.max_start =
2724	self.tt.next_state_id(self.special.max_start);
2725	next_start_id = self.tt.next_state_id(next_start_id);
2726	next_norm_id = self.tt.next_state_id(next_norm_id);
2727	}
2728	// This is pretty tricky, but if our 'next_norm_id' state also
2729	// happened to be accelerated, then the result is that it is
2730	// now in the position of cur_id, so we need to consider it
2731	// again. This loop is still guaranteed to terminate though,
2732	// because when accels contains cur_id, we're guaranteed to
2733	// increment next_norm_id even if cur_id remains unchanged.
2734	if !accels.contains_key(&cur_id) {
2735	cur_id = self.tt.prev_state_id(cur_id);
2736	}
2737	}
2738	}
2739	// Just like we did for match states, but we want to move accelerated
2740	// start states to the beginning of the range instead of the end.
2741	if cstart > `0` {
2742	// N.B. special.{min,max}_start do not need updating, since the
2743	// range/number of start states does not change at this point. Only
2744	// the ordering of start states may change.
2745	let mut next_id = self.special.min_start;
2746	let mut cur_id = next_id;
2747	while cur_id <= self.special.max_start {
2748	if let Some(accel) = accels.remove(&cur_id) {
2749	remapper.swap(self, cur_id, next_id);
2750	accels.insert(next_id, accel);
2751	update_special_accel(&mut self.special, next_id);
2752	next_id = self.tt.next_state_id(next_id);
2753	}
2754	cur_id = self.tt.next_state_id(cur_id);
2755	}
2756	}
2757
2758	// Remap all transitions in our DFA and assert some things.
2759	remapper.remap(self);
2760	// This unwrap is OK because acceleration never changes the number of
2761	// match states or patterns in those match states. Since acceleration
2762	// runs after the pattern map has been set at least once, we know that
2763	// our match states cannot error.
2764	self.set_pattern_map(&new_matches).unwrap();
2765	self.special.set_max();
2766	self.special.validate().expect("special state ranges should validate");
2767	self.special
2768	.validate_state_len(self.state_len(), self.stride2())
2769	.expect(
2770	"special state ranges should be consistent with state length",
2771	);
2772	assert_eq!(
2773	self.special.accel_len(self.stride()),
2774	// We record the number of accelerated states initially detected
2775	// since the accels map is itself mutated in the process above.
2776	// If mutated incorrectly, its size may change, and thus can't be
2777	// trusted as a source of truth of how many accelerated states we
2778	// expected there to be.
2779	original_accels_len,
2780	"mismatch with expected number of accelerated states",
2781	);
2782
2783	// And finally record our accelerators. We kept our accels map updated
2784	// as we shuffled states above, so the accelerators should now
2785	// correspond to a contiguous range in the state ID space. (Which we
2786	// assert.)
2787	let mut prev: Option<StateID> = None;
2788	for (id, accel) in accels {
2789	assert!(prev.map_or(`true`, \|p\| self.tt.next_state_id(p) == id));
2790	prev = Some(id);
2791	self.accels.add(accel);
2792	}
2793	}
2794
2795	/// Shuffle the states in this DFA so that starting states, match
2796	/// states and accelerated states are all contiguous.
2797	///
2798	/// See dfa/special.rs for more details.
2799	pub(crate) fn shuffle(
2800	&mut self,
2801	mut matches: BTreeMap<StateID, Vec<PatternID>>,
2802	) -> Result<(), BuildError> {
2803	// The determinizer always adds a quit state and it is always second.
2804	self.special.quit_id = self.to_state_id(`1`);
2805	// If all we have are the dead and quit states, then we're done and
2806	// the DFA will never produce a match.
2807	if self.state_len() <= `2` {
2808	self.special.set_max();
2809	return Ok(());
2810	}
2811
2812	// Collect all our non-DEAD start states into a convenient set and
2813	// confirm there is no overlap with match states. In the classicl DFA
2814	// construction, start states can be match states. But because of
2815	// look-around, we delay all matches by a byte, which prevents start
2816	// states from being match states.
2817	let mut is_start: BTreeSet<StateID> = BTreeSet::new();
2818	for (start_id, _, _) in self.starts() {
2819	// If a starting configuration points to a DEAD state, then we
2820	// don't want to shuffle it. The DEAD state is always the first
2821	// state with ID=0. So we can just leave it be.
2822	if start_id == DEAD {
2823	continue;
2824	}
2825	assert!(
2826	!matches.contains_key(&start_id),
2827	"{:?} is both a start and a match state, which is not allowed",
2828	start_id,
2829	);
2830	is_start.insert(start_id);
2831	}
2832
2833	// We implement shuffling by a sequence of pairwise swaps of states.
2834	// Since we have a number of things referencing states via their
2835	// IDs and swapping them changes their IDs, we need to record every
2836	// swap we make so that we can remap IDs. The remapper handles this
2837	// book-keeping for us.
2838	let mut remapper = Remapper::new(self);
2839
2840	// Shuffle matching states.
2841	if matches.is_empty() {
2842	self.special.min_match = DEAD;
2843	self.special.max_match = DEAD;
2844	} else {
2845	// The determinizer guarantees that the first two states are the
2846	// dead and quit states, respectively. We want our match states to
2847	// come right after quit.
2848	let mut next_id = self.to_state_id(`2`);
2849	let mut new_matches = BTreeMap::new();
2850	self.special.min_match = next_id;
2851	for (id, pids) in matches {
2852	remapper.swap(self, next_id, id);
2853	new_matches.insert(next_id, pids);
2854	// If we swapped a start state, then update our set.
2855	if is_start.contains(&next_id) {
2856	is_start.remove(&next_id);
2857	is_start.insert(id);
2858	}
2859	next_id = self.tt.next_state_id(next_id);
2860	}
2861	matches = new_matches;
2862	self.special.max_match = cmp::max(
2863	self.special.min_match,
2864	self.tt.prev_state_id(next_id),
2865	);
2866	}
2867
2868	// Shuffle starting states.
2869	{
2870	let mut next_id = self.to_state_id(`2`);
2871	if self.special.matches() {
2872	next_id = self.tt.next_state_id(self.special.max_match);
2873	}
2874	self.special.min_start = next_id;
2875	for id in is_start {
2876	remapper.swap(self, next_id, id);
2877	next_id = self.tt.next_state_id(next_id);
2878	}
2879	self.special.max_start = cmp::max(
2880	self.special.min_start,
2881	self.tt.prev_state_id(next_id),
2882	);
2883	}
2884
2885	// Finally remap all transitions in our DFA.
2886	remapper.remap(self);
2887	self.set_pattern_map(&matches)?;
2888	self.special.set_max();
2889	self.special.validate().expect("special state ranges should validate");
2890	self.special
2891	.validate_state_len(self.state_len(), self.stride2())
2892	.expect(
2893	"special state ranges should be consistent with state length",
2894	);
2895	Ok(())
2896	}
2897
2898	/// Checks whether there are universal start states (both anchored and
2899	/// unanchored), and if so, sets the relevant fields to the start state
2900	/// IDs.
2901	///
2902	/// Universal start states occur precisely when the all patterns in the
2903	/// DFA have no look-around assertions in their prefix.
2904	fn set_universal_starts(&mut self) {
2905	assert_eq!(`6`, Start::len(), "expected 6 start configurations");
2906
2907	let start_id = \|dfa: &mut OwnedDFA,
2908	anchored: Anchored,
2909	start: Start\| {
2910	// This OK because we only call 'start' under conditions
2911	// in which we know it will succeed.
2912	dfa.st.start(anchored, start).expect("valid Input configuration")
2913	};
2914	if self.start_kind().has_unanchored() {
2915	let anchor = Anchored::No;
2916	let sid = start_id(self, anchor, Start::NonWordByte);
2917	if sid == start_id(self, anchor, Start::WordByte)
2918	&& sid == start_id(self, anchor, Start::Text)
2919	&& sid == start_id(self, anchor, Start::LineLF)
2920	&& sid == start_id(self, anchor, Start::LineCR)
2921	&& sid == start_id(self, anchor, Start::CustomLineTerminator)
2922	{
2923	self.st.universal_start_unanchored = Some(sid);
2924	}
2925	}
2926	if self.start_kind().has_anchored() {
2927	let anchor = Anchored::Yes;
2928	let sid = start_id(self, anchor, Start::NonWordByte);
2929	if sid == start_id(self, anchor, Start::WordByte)
2930	&& sid == start_id(self, anchor, Start::Text)
2931	&& sid == start_id(self, anchor, Start::LineLF)
2932	&& sid == start_id(self, anchor, Start::LineCR)
2933	&& sid == start_id(self, anchor, Start::CustomLineTerminator)
2934	{
2935	self.st.universal_start_anchored = Some(sid);
2936	}
2937	}
2938	}
2939	}
2940
2941	// A variety of generic internal methods for accessing DFA internals.
2942	impl<T: AsRef<[u32]>> DFA<T> {
2943	/// Return the info about special states.
2944	pub(crate) fn special(&self) -> &Special {
2945	&self.special
2946	}
2947
/// Mutably borrow the information this DFA keeps about its special
/// states.
#[cfg(feature = "dfa-build")]
pub(crate) fn special_mut(&mut self) -> &mut Special {
    let Self { special, .. } = self;
    special
}
2953
2954	/// Returns the quit set (may be empty) used by this DFA.
2955	pub(crate) fn quitset(&self) -> &ByteSet {
2956	&self.quitset
2957	}
2958
2959	/// Returns the flags for this DFA.
2960	pub(crate) fn flags(&self) -> &Flags {
2961	&self.flags
2962	}
2963
2964	/// Returns an iterator over all states in this DFA.
2965	///
2966	/// This iterator yields a tuple for each state. The first element of the
2967	/// tuple corresponds to a state's identifier, and the second element
2968	/// corresponds to the state itself (comprised of its transitions).
2969	pub(crate) fn states(&self) -> StateIter<'_, T> {
2970	self.tt.states()
2971	}
2972
2973	/// Return the total number of states in this DFA. Every DFA has at least
2974	/// 1 state, even the empty DFA.
2975	pub(crate) fn state_len(&self) -> usize {
2976	self.tt.len()
2977	}
2978
/// Return the slice of pattern IDs for the given match state.
///
/// This panics if the given state is not a match state.
#[cfg(feature = "dfa-build")]
pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] {
    assert!(self.is_match_state(id));
    let match_index = self.match_state_index(id);
    self.ms.pattern_id_slice(match_index)
}
2987
2988	/// Return the total number of pattern IDs for the given match state.
2989	///
2990	/// If the given state is not a match state, then this panics.
2991	pub(crate) fn match_pattern_len(&self, id: StateID) -> usize {
2992	assert!(self.is_match_state(id));
2993	self.ms.pattern_len(self.match_state_index(id))
2994	}
2995
2996	/// Returns the total number of patterns matched by this DFA.
2997	pub(crate) fn pattern_len(&self) -> usize {
2998	self.ms.pattern_len
2999	}
3000
/// Build a map from each match state ID to the list of pattern IDs that
/// match in that state.
#[cfg(feature = "dfa-build")]
pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> {
    let match_states = &self.ms;
    match_states.to_map(self)
}
3007
/// Returns the ID of the quit state for this DFA.
///
/// This is the identifier of the state at index 1 of the transition
/// table.
#[cfg(feature = "dfa-build")]
pub(crate) fn quit_id(&self) -> StateID {
    self.to_state_id(1)
}
3013
3014	/// Convert the given state identifier to the state's index. The state's
3015	/// index corresponds to the position in which it appears in the transition
3016	/// table. When a DFA is NOT premultiplied, then a state's identifier is
3017	/// also its index. When a DFA is premultiplied, then a state's identifier
3018	/// is equal to `index alphabet_len`. This routine reverses that.*
3019	pub(crate) fn to_index(&self, id: StateID) -> usize {
3020	self.tt.to_index(id)
3021	}
3022
/// Translate the index of a state (in the range 0..self.state_len()) into
/// an actual state identifier.
///
/// This comes in handy when a `Vec<T>` is used as an efficient map keyed
/// by state to some other information (such as a remapped state ID).
#[cfg(feature = "dfa-build")]
pub(crate) fn to_state_id(&self, index: usize) -> StateID {
    let table = &self.tt;
    table.to_state_id(index)
}
3032
3033	/// Return the table of state IDs for this DFA's start states.
3034	pub(crate) fn starts(&self) -> StartStateIter<'_> {
3035	self.st.iter()
3036	}
3037
3038	/// Returns the index of the match state for the given ID. If the
3039	/// given ID does not correspond to a match state, then this may
3040	/// panic or produce an incorrect result.
3041	#[cfg_attr(feature = "perf-inline", inline(always))]
3042	fn match_state_index(&self, id: StateID) -> usize {
3043	debug_assert!(self.is_match_state(id));
3044	// This is one of the places where we rely on the fact that match
3045	// states are contiguous in the transition table. Namely, that the
3046	// first match state ID always corresponds to dfa.special.min_match.
3047	// From there, since we know the stride, we can compute the overall
3048	// index of any match state given the match state's ID.
3049	let min = self.special().min_match.as_usize();
3050	// CORRECTNESS: We're allowed to produce an incorrect result or panic,
3051	// so both the subtraction and the unchecked StateID construction is
3052	// OK.
3053	self.to_index(StateID::new_unchecked(id.as_usize() - min))
3054	}
3055
3056	/// Returns the index of the accelerator state for the given ID. If the
3057	/// given ID does not correspond to an accelerator state, then this may
3058	/// panic or produce an incorrect result.
3059	fn accelerator_index(&self, id: StateID) -> usize {
3060	let min = self.special().min_accel.as_usize();
3061	// CORRECTNESS: We're allowed to produce an incorrect result or panic,
3062	// so both the subtraction and the unchecked StateID construction is
3063	// OK.
3064	self.to_index(StateID::new_unchecked(id.as_usize() - min))
3065	}
3066
3067	/// Return the accelerators for this DFA.
3068	fn accels(&self) -> Accels<&[u32]> {
3069	self.accels.as_ref()
3070	}
3071
3072	/// Return this DFA's transition table as a slice.
3073	fn trans(&self) -> &[StateID] {
3074	self.tt.table()
3075	}
3076	}
3077
3078	impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
3079	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
3080	writeln!(f, "dense::DFA(")?;
3081	for state in self.states() {
3082	fmt_state_indicator(f, self, state.id())?;
3083	let id = if f.alternate() {
3084	state.id().as_usize()
3085	} else {
3086	self.to_index(state.id())
3087	};
3088	write!(f, "{:06?}: ", id)?;
3089	state.fmt(f)?;
3090	write!(f, "`\n`")?;
3091	}
3092	writeln!(f, "")?;
3093	for (i, (start_id, anchored, sty)) in self.starts().enumerate() {
3094	let id = if f.alternate() {
3095	start_id.as_usize()
3096	} else {
3097	self.to_index(start_id)
3098	};
3099	if i % self.st.stride == `0` {
3100	match anchored {
3101	Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
3102	Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
3103	Anchored::Pattern(pid) => {
3104	writeln!(f, "START_GROUP(pattern: {:?})", pid)?
3105	}
3106	}
3107	}
3108	writeln!(f, " {:?} => {:06?}", sty, id)?;
3109	}
3110	if self.pattern_len() > `1` {
3111	writeln!(f, "")?;
3112	for i in `0`..self.ms.len() {
3113	let id = self.ms.match_state_id(self, i);
3114	let id = if f.alternate() {
3115	id.as_usize()
3116	} else {
3117	self.to_index(id)
3118	};
3119	write!(f, "MATCH({:06?}): ", id)?;
3120	for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate()
3121	{
3122	if i > `0` {
3123	write!(f, ", ")?;
3124	}
3125	write!(f, "{:?}", pid)?;
3126	}
3127	writeln!(f, "")?;
3128	}
3129	}
3130	writeln!(f, "state length: {:?}", self.state_len())?;
3131	writeln!(f, "pattern length: {:?}", self.pattern_len())?;
3132	writeln!(f, "flags: {:?}", self.flags)?;
3133	writeln!(f, ")")?;
3134	Ok(())
3135	}
3136	}
3137
3138	// SAFETY: We assert that our implementation of each method is correct.
3139	unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
3140	#[cfg_attr(feature = "perf-inline", inline(always))]
3141	fn is_special_state(&self, id: StateID) -> bool {
3142	self.special.is_special_state(id)
3143	}
3144
3145	#[cfg_attr(feature = "perf-inline", inline(always))]
3146	fn is_dead_state(&self, id: StateID) -> bool {
3147	self.special.is_dead_state(id)
3148	}
3149
3150	#[cfg_attr(feature = "perf-inline", inline(always))]
3151	fn is_quit_state(&self, id: StateID) -> bool {
3152	self.special.is_quit_state(id)
3153	}
3154
3155	#[cfg_attr(feature = "perf-inline", inline(always))]
3156	fn is_match_state(&self, id: StateID) -> bool {
3157	self.special.is_match_state(id)
3158	}
3159
3160	#[cfg_attr(feature = "perf-inline", inline(always))]
3161	fn is_start_state(&self, id: StateID) -> bool {
3162	self.special.is_start_state(id)
3163	}
3164
3165	#[cfg_attr(feature = "perf-inline", inline(always))]
3166	fn is_accel_state(&self, id: StateID) -> bool {
3167	self.special.is_accel_state(id)
3168	}
3169
3170	#[cfg_attr(feature = "perf-inline", inline(always))]
3171	fn next_state(&self, current: StateID, input: u8) -> StateID {
3172	let input = self.byte_classes().get(input);
3173	let o = current.as_usize() + usize::from(input);
3174	self.trans()[o]
3175	}
3176
3177	#[cfg_attr(feature = "perf-inline", inline(always))]
3178	unsafe fn next_state_unchecked(
3179	&self,
3180	current: StateID,
3181	byte: u8,
3182	) -> StateID {
3183	// We don't (or shouldn't) need an unchecked variant for the byte
3184	// class mapping, since bound checks should be omitted automatically
3185	// by virtue of its representation. If this ends up not being true as
3186	// confirmed by codegen, please file an issue. ---AG
3187	let class = self.byte_classes().get(byte);
3188	let o = current.as_usize() + usize::from(class);
3189	let next = *self.trans().get_unchecked(o);
3190	next
3191	}
3192
3193	#[cfg_attr(feature = "perf-inline", inline(always))]
3194	fn next_eoi_state(&self, current: StateID) -> StateID {
3195	let eoi = self.byte_classes().eoi().as_usize();
3196	let o = current.as_usize() + eoi;
3197	self.trans()[o]
3198	}
3199
3200	#[cfg_attr(feature = "perf-inline", inline(always))]
3201	fn pattern_len(&self) -> usize {
3202	self.ms.pattern_len
3203	}
3204
3205	#[cfg_attr(feature = "perf-inline", inline(always))]
3206	fn match_len(&self, id: StateID) -> usize {
3207	self.match_pattern_len(id)
3208	}
3209
3210	#[cfg_attr(feature = "perf-inline", inline(always))]
3211	fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
3212	// This is an optimization for the very common case of a DFA with a
3213	// single pattern. This conditional avoids a somewhat more costly path
3214	// that finds the pattern ID from the state machine, which requires
3215	// a bit of slicing/pointer-chasing. This optimization tends to only
3216	// matter when matches are frequent.
3217	if self.ms.pattern_len == `1` {
3218	return PatternID::ZERO;
3219	}
3220	let state_index = self.match_state_index(id);
3221	self.ms.pattern_id(state_index, match_index)
3222	}
3223
3224	#[cfg_attr(feature = "perf-inline", inline(always))]
3225	fn has_empty(&self) -> bool {
3226	self.flags.has_empty
3227	}
3228
3229	#[cfg_attr(feature = "perf-inline", inline(always))]
3230	fn is_utf8(&self) -> bool {
3231	self.flags.is_utf8
3232	}
3233
3234	#[cfg_attr(feature = "perf-inline", inline(always))]
3235	fn is_always_start_anchored(&self) -> bool {
3236	self.flags.is_always_start_anchored
3237	}
3238
3239	#[cfg_attr(feature = "perf-inline", inline(always))]
3240	fn start_state(
3241	&self,
3242	config: &start::Config,
3243	) -> Result<StateID, StartError> {
3244	let anchored = config.get_anchored();
3245	let start = match config.get_look_behind() {
3246	None => Start::Text,
3247	Some(byte) => {
3248	if !self.quitset.is_empty() && self.quitset.contains(byte) {
3249	return Err(StartError::quit(byte));
3250	}
3251	self.st.start_map.get(byte)
3252	}
3253	};
3254	self.st.start(anchored, start)
3255	}
3256
3257	#[cfg_attr(feature = "perf-inline", inline(always))]
3258	fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
3259	match mode {
3260	Anchored::No => self.st.universal_start_unanchored,
3261	Anchored::Yes => self.st.universal_start_anchored,
3262	Anchored::Pattern(_) => None,
3263	}
3264	}
3265
3266	#[cfg_attr(feature = "perf-inline", inline(always))]
3267	fn accelerator(&self, id: StateID) -> &[u8] {
3268	if !self.is_accel_state(id) {
3269	return &[];
3270	}
3271	self.accels.needles(self.accelerator_index(id))
3272	}
3273
3274	#[cfg_attr(feature = "perf-inline", inline(always))]
3275	fn get_prefilter(&self) -> Option<&Prefilter> {
3276	self.pre.as_ref()
3277	}
3278	}
3279
3280	/// The transition table portion of a dense DFA.
3281	///
3282	/// The transition table is the core part of the DFA in that it describes how
3283	/// to move from one state to another based on the input sequence observed.
3284	#[derive(Clone)]
3285	pub(crate) struct TransitionTable<T> {
3286	/// A contiguous region of memory representing the transition table in
3287	/// row-major order. The representation is dense. That is, every state
3288	/// has precisely the same number of transitions. The maximum number of
3289	/// transitions per state is 257 (256 for each possible byte value, plus 1
3290	/// for the special EOI transition). If a DFA has been instructed to use
3291	/// byte classes (the default), then the number of transitions is usually
3292	/// substantially fewer.
3293	///
3294	/// In practice, T is either `Vec<u32>` or `&[u32]`.
3295	table: T,
3296	/// A set of equivalence classes, where a single equivalence class
3297	/// represents a set of bytes that never discriminate between a match
3298	/// and a non-match in the DFA. Each equivalence class corresponds to a
3299	/// single character in this DFA's alphabet, where the maximum number of
3300	/// characters is 257 (each possible value of a byte plus the special
3301	/// EOI transition). Consequently, the number of equivalence classes
3302	/// corresponds to the number of transitions for each DFA state. Note
3303	/// though that the space* used by each DFA state in the transition table*
3304	/// may be larger. The total space used by each DFA state is known as the
3305	/// stride.
3306	///
3307	/// The only time the number of equivalence classes is fewer than 257 is if
3308	/// the DFA's kind uses byte classes (which is the default). Equivalence
3309	/// classes should generally only be disabled when debugging, so that
3310	/// the transitions themselves aren't obscured. Disabling them has no
3311	/// other benefit, since the equivalence class map is always used while
3312	/// searching. In the vast majority of cases, the number of equivalence
3313	/// classes is substantially smaller than 257, particularly when large
3314	/// Unicode classes aren't used.
3315	classes: ByteClasses,
3316	/// The stride of each DFA state, expressed as a power-of-two exponent.
3317	///
3318	/// The stride of a DFA corresponds to the total amount of space used by
3319	/// each DFA state in the transition table. This may be bigger than the
3320	/// size of a DFA's alphabet, since the stride is always the smallest
3321	/// power of two greater than or equal to the alphabet size.
3322	///
3323	/// While this wastes space, this avoids the need for integer division
3324	/// to convert between premultiplied state IDs and their corresponding
3325	/// indices. Instead, we can use simple bit-shifts.
3326	///
3327	/// See the docs for the `stride2` method for more details.
3328	///
3329	/// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
3330	/// while the maximum `stride2` value is `9` (corresponding to a stride of
3331	/// `512`). The maximum is not `8` since the maximum alphabet size is `257`
3332	/// when accounting for the special EOI transition. However, an alphabet
3333	/// length of that size is exceptionally rare since the alphabet is shrunk
3334	/// into equivalence classes.
3335	stride2: usize,
3336	}
3337
3338	impl<'a> TransitionTable<&'a [u32]> {
3339	/// Deserialize a transition table starting at the beginning of `slice`.
3340	/// Upon success, return the total number of bytes read along with the
3341	/// transition table.
3342	///
3343	/// If there was a problem deserializing any part of the transition table,
3344	/// then this returns an error. Notably, if the given slice does not have
3345	/// the same alignment as `StateID`, then this will return an error (among
3346	/// other possible errors).
3347	///
3348	/// This is guaranteed to execute in constant time.
3349	///
3350	/// # Safety
3351	///
3352	/// This routine is not safe because it does not check the validity of the
3353	/// transition table itself. In particular, the transition table can be
3354	/// quite large, so checking its validity can be somewhat expensive. An
3355	/// invalid transition table is not safe because other code may rely on the
3356	/// transition table being correct (such as explicit bounds check elision).
3357	/// Therefore, an invalid transition table can lead to undefined behavior.
3358	///
3359	/// Callers that use this function must either pass on the safety invariant
3360	/// or guarantee that the bytes given contain a valid transition table.
3361	/// This guarantee is upheld by the bytes written by `write_to`.
3362	unsafe fn from_bytes_unchecked(
3363	mut slice: &'a [u8],
3364	) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
3365	let slice_start = slice.as_ptr().as_usize();
3366
3367	let (state_len, nr) =
3368	wire::try_read_u32_as_usize(slice, "state length")?;
3369	slice = &slice[nr..];
3370
3371	let (stride2, nr) = wire::try_read_u32_as_usize(slice, "stride2")?;
3372	slice = &slice[nr..];
3373
3374	let (classes, nr) = ByteClasses::from_bytes(slice)?;
3375	slice = &slice[nr..];
3376
3377	// The alphabet length (determined by the byte class map) cannot be
3378	// bigger than the stride (total space used by each DFA state).
3379	if stride2 > `9` {
3380	return Err(DeserializeError::generic(
3381	"dense DFA has invalid stride2 (too big)",
3382	));
3383	}
3384	// It also cannot be zero, since even a DFA that never matches anything
3385	// has a non-zero number of states with at least two equivalence
3386	// classes: one for all 256 byte values and another for the EOI
3387	// sentinel.
3388	if stride2 < `1` {
3389	return Err(DeserializeError::generic(
3390	"dense DFA has invalid stride2 (too small)",
3391	));
3392	}
3393	// This is OK since 1 <= stride2 <= 9.
3394	let stride =
3395	`1usize`.checked_shl(u32::try_from(stride2).unwrap()).unwrap();
3396	if classes.alphabet_len() > stride {
3397	return Err(DeserializeError::generic(
3398	"alphabet size cannot be bigger than transition table stride",
3399	));
3400	}
3401
3402	let trans_len =
3403	wire::shl(state_len, stride2, "dense table transition length")?;
3404	let table_bytes_len = wire::mul(
3405	trans_len,
3406	StateID::SIZE,
3407	"dense table state byte length",
3408	)?;
3409	wire::check_slice_len(slice, table_bytes_len, "transition table")?;
3410	wire::check_alignment::<StateID>(slice)?;
3411	let table_bytes = &slice[..table_bytes_len];
3412	slice = &slice[table_bytes_len..];
3413	// SAFETY: Since StateID is always representable as a u32, all we need
3414	// to do is ensure that we have the proper length and alignment. We've
3415	// checked both above, so the cast below is safe.
3416	//
3417	// N.B. This is the only not-safe code in this function.
3418	let table = core::slice::from_raw_parts(
3419	table_bytes.as_ptr().cast::<u32>(),
3420	trans_len,
3421	);
3422	let tt = TransitionTable { table, classes, stride2 };
3423	Ok((tt, slice.as_ptr().as_usize() - slice_start))
3424	}
3425	}
3426
#[cfg(feature = "dfa-build")]
impl TransitionTable<Vec<u32>> {
    /// Create a minimal transition table with just two states: a dead state
    /// and a quit state. The alphabet length and stride of the transition
    /// table is determined by the given set of equivalence classes.
    fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> {
        let mut tt = TransitionTable {
            table: vec![],
            classes,
            stride2: classes.stride2(),
        };
        // Two states, regardless of alphabet size, can always fit into u32.
        tt.add_empty_state().unwrap(); // dead state
        tt.add_empty_state().unwrap(); // quit state
        tt
    }

    /// Set a transition in this table. Both the `from` and `to` states must
    /// already exist, otherwise this panics. `unit` should correspond to the
    /// transition out of `from` to set to `to`.
    fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) {
        assert!(self.is_valid(from), "invalid 'from' state");
        assert!(self.is_valid(to), "invalid 'to' state");
        // State IDs are premultiplied, so `from` is already the offset of
        // the state's first transition.
        self.table[from.as_usize() + self.classes.get_by_unit(unit)] =
            to.as_u32();
    }

    /// Add an empty state (a state where all transitions lead to a dead state)
    /// and return its identifier. The identifier returned is guaranteed to
    /// not point to any other existing state.
    ///
    /// If adding a state would exhaust the state identifier space, then this
    /// returns an error.
    fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
        // Normally, to get a fresh state identifier, we would just
        // take the index of the next state added to the transition
        // table. However, we actually perform an optimization here
        // that premultiplies state IDs by the stride, such that they
        // point immediately at the beginning of their transitions in
        // the transition table. This avoids an extra multiplication
        // instruction for state lookup at search time.
        //
        // Premultiplied identifiers means that instead of your matching
        // loop looking something like this:
        //
        //   state = dfa.start
        //   for byte in haystack:
        //       next = dfa.transitions[state * stride + byte]
        //       if dfa.is_match(next):
        //           return true
        //   return false
        //
        // it can instead look like this:
        //
        //   state = dfa.start
        //   for byte in haystack:
        //       next = dfa.transitions[state + byte]
        //       if dfa.is_match(next):
        //           return true
        //   return false
        //
        // In other words, we save a multiplication instruction in the
        // critical path. This turns out to be a decent performance win.
        // The cost of using premultiplied state ids is that they can
        // require a bigger state id representation. (And they also make
        // the code a bit more complex, especially during minimization and
        // when reshuffling states, as one needs to convert back and forth
        // between state IDs and state indices.)
        //
        // To do this, we simply take the index of the state into the
        // entire transition table, rather than the index of the state
        // itself. e.g., If the stride is 64, then the ID of the 3rd state
        // is 192, not 2.
        let next = self.table.len();
        let id =
            StateID::new(next).map_err(|_| BuildError::too_many_states())?;
        // Every transition of the new state is zero, which is the ID of the
        // dead state.
        self.table.extend(iter::repeat(0).take(self.stride()));
        Ok(id)
    }

    /// Swap the two states given in this transition table.
    ///
    /// This routine does not do anything to check the correctness of this
    /// swap. Callers must ensure that other states pointing to id1 and id2 are
    /// updated appropriately.
    ///
    /// Both id1 and id2 must point to valid states, otherwise this panics.
    fn swap(&mut self, id1: StateID, id2: StateID) {
        assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1);
        assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2);
        // We only need to swap the parts of the state that are used. So if the
        // stride is 64, but the alphabet length is only 33, then we save a lot
        // of work.
        for b in 0..self.alphabet_len() {
            self.table.swap(id1.as_usize() + b, id2.as_usize() + b);
        }
    }

    /// Remap the transitions for the state given according to the function
    /// given. This applies the given map function to every transition in the
    /// given state and changes the transition in place to the result of the
    /// map function for that transition.
    fn remap(&mut self, id: StateID, map: impl Fn(StateID) -> StateID) {
        for byte in 0..self.alphabet_len() {
            let i = id.as_usize() + byte;
            let next = self.table()[i];
            self.table_mut()[i] = map(next);
        }
    }

    /// Truncate the states in this transition table to the given length.
    ///
    /// This routine does not do anything to check the correctness of this
    /// truncation. Callers must ensure that other states pointing to truncated
    /// states are updated appropriately.
    fn truncate(&mut self, len: usize) {
        // `len` is a number of states, so scale it by the stride to get the
        // number of transition table entries to keep.
        self.table.truncate(len << self.stride2);
    }
}
3546
3547	impl<T: AsRef<[u32]>> TransitionTable<T> {
3548	/// Writes a serialized form of this transition table to the buffer given.
3549	/// If the buffer is too small, then an error is returned. To determine
3550	/// how big the buffer must be, use `write_to_len`.
3551	fn write_to<E: Endian>(
3552	&self,
3553	mut dst: &mut [u8],
3554	) -> Result<usize, SerializeError> {
3555	let nwrite = self.write_to_len();
3556	if dst.len() < nwrite {
3557	return Err(SerializeError::buffer_too_small("transition table"));
3558	}
3559	dst = &mut dst[..nwrite];
3560
3561	// write state length
3562	// Unwrap is OK since number of states is guaranteed to fit in a u32.
3563	E::write_u32(u32::try_from(self.len()).unwrap(), dst);
3564	dst = &mut dst[size_of::<u32>()..];
3565
3566	// write state stride (as power of 2)
3567	// Unwrap is OK since stride2 is guaranteed to be <= 9.
3568	E::write_u32(u32::try_from(self.stride2).unwrap(), dst);
3569	dst = &mut dst[size_of::<u32>()..];
3570
3571	// write byte class map
3572	let n = self.classes.write_to(dst)?;
3573	dst = &mut dst[n..];
3574
3575	// write actual transitions
3576	for &sid in self.table() {
3577	let n = wire::write_state_id::<E>(sid, &mut dst);
3578	dst = &mut dst[n..];
3579	}
3580	Ok(nwrite)
3581	}
3582
3583	/// Returns the number of bytes the serialized form of this transition
3584	/// table will use.
3585	fn write_to_len(&self) -> usize {
3586	size_of::<u32>() // state length
3587	+ size_of::<u32>() // stride2
3588	+ self.classes.write_to_len()
3589	+ (self.table().len() * StateID::SIZE)
3590	}
3591
3592	/// Validates that every state ID in this transition table is valid.
3593	///
3594	/// That is, every state ID can be used to correctly index a state in this
3595	/// table.
3596	fn validate(&self, sp: &Special) -> Result<(), DeserializeError> {
3597	for state in self.states() {
3598	// We check that the ID itself is well formed. That is, if it's
3599	// a special state then it must actually be a quit, dead, accel,
3600	// match or start state.
3601	if sp.is_special_state(state.id()) {
3602	let is_actually_special = sp.is_dead_state(state.id())
3603	\|\| sp.is_quit_state(state.id())
3604	\|\| sp.is_match_state(state.id())
3605	\|\| sp.is_start_state(state.id())
3606	\|\| sp.is_accel_state(state.id());
3607	if !is_actually_special {
3608	// This is kind of a cryptic error message...
3609	return Err(DeserializeError::generic(
3610	"found dense state tagged as special but \
3611	wasn't actually special",
3612	));
3613	}
3614	}
3615	for (_, to) in state.transitions() {
3616	if !self.is_valid(to) {
3617	return Err(DeserializeError::generic(
3618	"found invalid state ID in transition table",
3619	));
3620	}
3621	}
3622	}
3623	Ok(())
3624	}
3625
3626	/// Converts this transition table to a borrowed value.
3627	fn as_ref(&self) -> TransitionTable<&'_ [u32]> {
3628	TransitionTable {
3629	table: self.table.as_ref(),
3630	classes: self.classes.clone(),
3631	stride2: self.stride2,
3632	}
3633	}
3634
3635	/// Converts this transition table to an owned value.
3636	#[cfg(feature = "alloc")]
3637	fn to_owned(&self) -> TransitionTable<alloc::vec::Vec<u32>> {
3638	TransitionTable {
3639	table: self.table.as_ref().to_vec(),
3640	classes: self.classes.clone(),
3641	stride2: self.stride2,
3642	}
3643	}
3644
3645	/// Return the state for the given ID. If the given ID is not valid, then
3646	/// this panics.
3647	fn state(&self, id: StateID) -> State<'_> {
3648	assert!(self.is_valid(id));
3649
3650	let i = id.as_usize();
3651	State {
3652	id,
3653	stride2: self.stride2,
3654	transitions: &self.table()[i..i + self.alphabet_len()],
3655	}
3656	}
3657
3658	/// Returns an iterator over all states in this transition table.
3659	///
3660	/// This iterator yields a tuple for each state. The first element of the
3661	/// tuple corresponds to a state's identifier, and the second element
3662	/// corresponds to the state itself (comprised of its transitions).
3663	fn states(&self) -> StateIter<'_, T> {
3664	StateIter {
3665	tt: self,
3666	it: self.table().chunks(self.stride()).enumerate(),
3667	}
3668	}
3669
3670	/// Convert a state identifier to an index to a state (in the range
3671	/// 0..self.len()).
3672	///
3673	/// This is useful when using a `Vec<T>` as an efficient map keyed by state
3674	/// to some other information (such as a remapped state ID).
3675	///
3676	/// If the given ID is not valid, then this may panic or produce an
3677	/// incorrect index.
3678	fn to_index(&self, id: StateID) -> usize {
3679	id.as_usize() >> self.stride2
3680	}
3681
3682	/// Convert an index to a state (in the range 0..self.len()) to an actual
3683	/// state identifier.
3684	///
3685	/// This is useful when using a `Vec<T>` as an efficient map keyed by state
3686	/// to some other information (such as a remapped state ID).
3687	///
3688	/// If the given index is not in the specified range, then this may panic
3689	/// or produce an incorrect state ID.
3690	fn to_state_id(&self, index: usize) -> StateID {
3691	// CORRECTNESS: If the given index is not valid, then it is not
3692	// required for this to panic or return a valid state ID.
3693	StateID::new_unchecked(index << self.stride2)
3694	}
3695
3696	/// Returns the state ID for the state immediately following the one given.
3697	///
3698	/// This does not check whether the state ID returned is invalid. In fact,
3699	/// if the state ID given is the last state in this DFA, then the state ID
3700	/// returned is guaranteed to be invalid.
3701	#[cfg(feature = "dfa-build")]
3702	fn next_state_id(&self, id: StateID) -> StateID {
3703	self.to_state_id(self.to_index(id).checked_add(`1`).unwrap())
3704	}
3705
3706	/// Returns the state ID for the state immediately preceding the one given.
3707	///
3708	/// If the dead ID given (which is zero), then this panics.
3709	#[cfg(feature = "dfa-build")]
3710	fn prev_state_id(&self, id: StateID) -> StateID {
3711	self.to_state_id(self.to_index(id).checked_sub(`1`).unwrap())
3712	}
3713
3714	/// Returns the table as a slice of state IDs.
3715	fn table(&self) -> &[StateID] {
3716	wire::u32s_to_state_ids(self.table.as_ref())
3717	}
3718
3719	/// Returns the total number of states in this transition table.
3720	///
3721	/// Note that a DFA always has at least two states: the dead and quit
3722	/// states. In particular, the dead state always has ID 0 and is
3723	/// correspondingly always the first state. The dead state is never a match
3724	/// state.
3725	fn len(&self) -> usize {
3726	self.table().len() >> self.stride2
3727	}
3728
3729	/// Returns the total stride for every state in this DFA. This corresponds
3730	/// to the total number of transitions used by each state in this DFA's
3731	/// transition table.
3732	fn stride(&self) -> usize {
3733	`1` << self.stride2
3734	}
3735
3736	/// Returns the total number of elements in the alphabet for this
3737	/// transition table. This is always less than or equal to `self.stride()`.
3738	/// It is only equal when the alphabet length is a power of 2. Otherwise,
3739	/// it is always strictly less.
3740	fn alphabet_len(&self) -> usize {
3741	self.classes.alphabet_len()
3742	}
3743
3744	/// Returns true if and only if the given state ID is valid for this
3745	/// transition table. Validity in this context means that the given ID can
3746	/// be used as a valid offset with `self.stride()` to index this transition
3747	/// table.
3748	fn is_valid(&self, id: StateID) -> bool {
3749	let id = id.as_usize();
3750	id < self.table().len() && id % self.stride() == `0`
3751	}
3752
3753	/// Return the memory usage, in bytes, of this transition table.
3754	///
3755	/// This does not include the size of a `TransitionTable` value itself.
3756	fn memory_usage(&self) -> usize {
3757	self.table().len() * StateID::SIZE
3758	}
3759	}
3760
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u32]>> TransitionTable<T> {
    /// Gives mutable access to the table, viewed as a slice of state IDs
    /// instead of the raw `u32` values it is stored as.
    fn table_mut(&mut self) -> &mut [StateID] {
        let raw: &mut [u32] = self.table.as_mut();
        wire::u32s_to_state_ids_mut(raw)
    }
}
3768
3769	/// The set of all possible starting states in a DFA.
3770	///
3771	/// The set of starting states corresponds to the possible choices one can make
3772	/// in terms of starting a DFA. That is, before following the first transition,
3773	/// you first need to select the state that you start in.
3774	///
3775	/// Normally, a DFA converted from an NFA that has a single starting state
3776	/// would itself just have one starting state. However, our support for look
3777	/// around generally requires more starting states. The correct starting state
3778	/// is chosen based on certain properties of the position at which we begin
3779	/// our search.
3780	///
3781	/// Before listing those properties, we first must define two terms:
3782	///
3783	/// * `haystack` - The bytes to execute the search. The search always starts
3784	///   at the beginning of `haystack` and ends before or at the end of
3785	///   `haystack`.
3786	/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack`
3787	///   must be contained within `context` such that `context` is at least as big
3788	///   as `haystack`.
3789	///
3790	/// This split is crucial for dealing with look-around. For example, consider
3791	/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This
3792	/// regex should _not_ match the haystack since `bar` does not appear at the
3793	/// beginning of the input. Similarly, the regex `\Bbar\B` should match the
3794	/// haystack because `bar` is not surrounded by word boundaries. But a search
3795	/// that does not take context into account would not permit `\B` to match
3796	/// since the beginning of any string matches a word boundary. Similarly, a
3797	/// search that does not take context into account when searching `^bar$` in
3798	/// the haystack `bar` would produce a match when it shouldn't.
3799	///
3800	/// Thus, it follows that the starting state is chosen based on the following
3801	/// criteria, derived from the position at which the search starts in the
3802	/// `context` (corresponding to the start of `haystack`):
3803	///
3804	/// 1. If the search starts at the beginning of `context`, then the `Text`
3805	/// start state is used. (Since `^` corresponds to
3806	/// `hir::Anchor::Start`.)
3807	/// 2. If the search starts at a position immediately following a line
3808	/// terminator, then the `Line` start state is used. (Since `(?m:^)`
3809	/// corresponds to `hir::Anchor::StartLF`.)
3810	/// 3. If the search starts at a position immediately following a byte
3811	/// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
3812	/// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
3813	/// 4. Otherwise, if the search starts at a position immediately following
3814	/// a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`),
3815	/// then the `NonWordByte` start state is used. (Since `(?-u:\B)`
3816	/// corresponds to a not-word-boundary.)
3817	///
3818	/// (N.B. Unicode word boundaries are not supported by the DFA because they
3819	/// require multi-byte look-around and this is difficult to support in a DFA.)
3820	///
3821	/// To further complicate things, we also support constructing individual
3822	/// anchored start states for each pattern in the DFA. (Which is required to
3823	/// implement overlapping regexes correctly, but is also generally useful.)
3824	/// Thus, when individual start states for each pattern are enabled, then the
3825	/// total number of start states represented is `4 + (4 * #patterns)`, where
3826	/// the 4 comes from each of the 4 possibilities above. The first 4 represents
3827	/// the starting states for the entire DFA, which support searching for
3828	/// multiple patterns simultaneously (possibly unanchored).
3829	///
3830	/// If individual start states are disabled, then this will only store 4
3831	/// start states. Typically, individual start states are only enabled when
3832	/// constructing the reverse DFA for regex matching. But they are also useful
3833	/// for building DFAs that can search for a specific pattern or even to support
3834	/// both anchored and unanchored searches with the same DFA.
3835	///
3836	/// Note though that while the start table always has either `4` or
3837	/// `4 + (4 * #patterns)` starting state ids, the total number of states
3838	/// might be considerably smaller. That is, many of the IDs may be duplicative.
3839	/// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no
3840	/// reason to generate a unique starting state for handling word boundaries.
3841	/// Similarly for start/end anchors.)
#[derive(Clone)]
pub(crate) struct StartTable<T> {
    /// The initial start state IDs.
    ///
    /// In practice, T is either `Vec<u32>` or `&[u32]`.
    ///
    /// The first `2 * stride` (currently always 8) entries always correspond
    /// to the start states for the entire DFA, with the first 4 entries being
    /// for unanchored searches and the second 4 entries being for anchored
    /// searches. To keep things simple, we always use 8 entries even if the
    /// `StartKind` is not both.
    ///
    /// After that, there are `stride * patterns` state IDs, where `patterns`
    /// may be zero in the case of a DFA with no patterns or in the case where
    /// the DFA was built without enabling starting states for each pattern.
    table: T,
    /// The starting state configuration supported. When 'both', both
    /// unanchored and anchored searches work. When 'unanchored', asking for
    /// an anchored start state is reported as an error by `start`. When
    /// 'anchored', asking for an unanchored start state is reported as an
    /// error by `start`.
    kind: StartKind,
    /// The start state configuration for every possible byte.
    start_map: StartByteMap,
    /// The number of starting state IDs per pattern.
    ///
    /// Both construction via `dead` and deserialization require this to be
    /// equal to `Start::len()`.
    stride: usize,
    /// The total number of patterns for which starting states are encoded.
    /// This is `None` for DFAs that were built without start states for each
    /// pattern. Thus, one cannot use this field to say how many patterns
    /// are in the DFA in all cases. It is specific to how many patterns are
    /// represented in this start table.
    ///
    /// In the serialized form, `None` is written as `u32::MAX`.
    pattern_len: Option<usize>,
    /// The universal starting state for unanchored searches. This is only
    /// present when the DFA supports unanchored searches and when all starting
    /// state IDs for an unanchored search are equivalent.
    ///
    /// In the serialized form, `None` is written as `u32::MAX`.
    universal_start_unanchored: Option<StateID>,
    /// The universal starting state for anchored searches. This is only
    /// present when the DFA supports anchored searches and when all starting
    /// state IDs for an anchored search are equivalent.
    ///
    /// In the serialized form, `None` is written as `u32::MAX`.
    universal_start_anchored: Option<StateID>,
}
3881
#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u32>> {
    /// Builds a start table of the right size in which every entry is the
    /// dead state.
    ///
    /// `pattern_len` should be `Some(number of patterns)` when the DFA is
    /// being built with start states for each pattern, and `None` otherwise.
    /// No universal start states are recorded in the returned table.
    ///
    /// # Errors
    ///
    /// Returns an error if the number of entries in the table overflows a
    /// `usize` or does not fit in an `isize`. In practice, this is unlikely
    /// to occur, since allocation would very likely have failed long before
    /// reaching this point.
    ///
    /// # Panics
    ///
    /// Panics if `pattern_len` is given and exceeds `PatternID::LIMIT`.
    fn dead(
        kind: StartKind,
        lookm: &LookMatcher,
        pattern_len: Option<usize>,
    ) -> Result<StartTable<Vec<u32>>, BuildError> {
        if let Some(len) = pattern_len {
            assert!(len <= PatternID::LIMIT);
        }
        let stride = Start::len();
        // OK because doubling the (tiny) stride is never going to overflow
        // anything.
        let whole_dfa_len = stride.checked_mul(2).unwrap();
        // Two strides for the DFA as a whole (unanchored and anchored), plus
        // one stride for each pattern. Every step is checked, and the final
        // length must also be representable as an isize.
        let table_len = stride
            .checked_mul(pattern_len.unwrap_or(0))
            .and_then(|per_pattern_len| {
                whole_dfa_len.checked_add(per_pattern_len)
            })
            .filter(|&len| isize::try_from(len).is_ok())
            .ok_or_else(|| BuildError::too_many_start_states())?;
        Ok(StartTable {
            table: vec![DEAD.as_u32(); table_len],
            kind,
            start_map: StartByteMap::new(lookm),
            stride,
            pattern_len,
            universal_start_unanchored: None,
            universal_start_anchored: None,
        })
    }
}
3930
3931	impl<'a> StartTable<&'a [u32]> {
3932	/// Deserialize a table of start state IDs starting at the beginning of
3933	/// `slice`. Upon success, return the total number of bytes read along with
3934	/// the table of starting state IDs.
3935	///
3936	/// If there was a problem deserializing any part of the starting IDs,
3937	/// then this returns an error. Notably, if the given slice does not have
3938	/// the same alignment as `StateID`, then this will return an error (among
3939	/// other possible errors).
3940	///
3941	/// This is guaranteed to execute in constant time.
3942	///
3943	/// # Safety
3944	///
3945	/// This routine is not safe because it does not check the validity of the
3946	/// starting state IDs themselves. In particular, the number of starting
3947	/// IDs can be of variable length, so it's possible that checking their
3948	/// validity cannot be done in constant time. An invalid starting state
3949	/// ID is not safe because other code may rely on the starting IDs being
3950	/// correct (such as explicit bounds check elision). Therefore, an invalid
3951	/// start ID can lead to undefined behavior.
3952	///
3953	/// Callers that use this function must either pass on the safety invariant
3954	/// or guarantee that the bytes given contain valid starting state IDs.
3955	/// This guarantee is upheld by the bytes written by `write_to`.
3956	unsafe fn from_bytes_unchecked(
3957	mut slice: &'a [u8],
3958	) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> {
3959	let slice_start = slice.as_ptr().as_usize();
3960
3961	let (kind, nr) = StartKind::from_bytes(slice)?;
3962	slice = &slice[nr..];
3963
3964	let (start_map, nr) = StartByteMap::from_bytes(slice)?;
3965	slice = &slice[nr..];
3966
3967	let (stride, nr) =
3968	wire::try_read_u32_as_usize(slice, "start table stride")?;
3969	slice = &slice[nr..];
3970	if stride != Start::len() {
3971	return Err(DeserializeError::generic(
3972	"invalid starting table stride",
3973	));
3974	}
3975
3976	let (maybe_pattern_len, nr) =
3977	wire::try_read_u32_as_usize(slice, "start table patterns")?;
3978	slice = &slice[nr..];
3979	let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
3980	None
3981	} else {
3982	Some(maybe_pattern_len)
3983	};
3984	if pattern_len.map_or(`false`, \|len\| len > PatternID::LIMIT) {
3985	return Err(DeserializeError::generic(
3986	"invalid number of patterns",
3987	));
3988	}
3989
3990	let (universal_unanchored, nr) =
3991	wire::try_read_u32(slice, "universal unanchored start")?;
3992	slice = &slice[nr..];
3993	let universal_start_unanchored = if universal_unanchored == u32::MAX {
3994	None
3995	} else {
3996	Some(StateID::try_from(universal_unanchored).map_err(\|e\| {
3997	DeserializeError::state_id_error(
3998	e,
3999	"universal unanchored start",
4000	)
4001	})?)
4002	};
4003
4004	let (universal_anchored, nr) =
4005	wire::try_read_u32(slice, "universal anchored start")?;
4006	slice = &slice[nr..];
4007	let universal_start_anchored = if universal_anchored == u32::MAX {
4008	None
4009	} else {
4010	Some(StateID::try_from(universal_anchored).map_err(\|e\| {
4011	DeserializeError::state_id_error(e, "universal anchored start")
4012	})?)
4013	};
4014
4015	let pattern_table_size = wire::mul(
4016	stride,
4017	pattern_len.unwrap_or(`0`),
4018	"invalid pattern length",
4019	)?;
4020	// Our start states always start with a two stride of start states for
4021	// the entire automaton. The first stride is for unanchored starting
4022	// states and the second stride is for anchored starting states. What
4023	// follows it are an optional set of start states for each pattern.
4024	let start_state_len = wire::add(
4025	wire::mul(`2`, stride, "start state stride too big")?,
4026	pattern_table_size,
4027	"invalid 'any' pattern starts size",
4028	)?;
4029	let table_bytes_len = wire::mul(
4030	start_state_len,
4031	StateID::SIZE,
4032	"pattern table bytes length",
4033	)?;
4034	wire::check_slice_len(slice, table_bytes_len, "start ID table")?;
4035	wire::check_alignment::<StateID>(slice)?;
4036	let table_bytes = &slice[..table_bytes_len];
4037	slice = &slice[table_bytes_len..];
4038	// SAFETY: Since StateID is always representable as a u32, all we need
4039	// to do is ensure that we have the proper length and alignment. We've
4040	// checked both above, so the cast below is safe.
4041	//
4042	// N.B. This is the only not-safe code in this function.
4043	let table = core::slice::from_raw_parts(
4044	table_bytes.as_ptr().cast::<u32>(),
4045	start_state_len,
4046	);
4047	let st = StartTable {
4048	table,
4049	kind,
4050	start_map,
4051	stride,
4052	pattern_len,
4053	universal_start_unanchored,
4054	universal_start_anchored,
4055	};
4056	Ok((st, slice.as_ptr().as_usize() - slice_start))
4057	}
4058	}
4059
4060	impl<T: AsRef<[u32]>> StartTable<T> {
4061	/// Writes a serialized form of this start table to the buffer given. If
4062	/// the buffer is too small, then an error is returned. To determine how
4063	/// big the buffer must be, use `write_to_len`.
4064	fn write_to<E: Endian>(
4065	&self,
4066	mut dst: &mut [u8],
4067	) -> Result<usize, SerializeError> {
4068	let nwrite = self.write_to_len();
4069	if dst.len() < nwrite {
4070	return Err(SerializeError::buffer_too_small(
4071	"starting table ids",
4072	));
4073	}
4074	dst = &mut dst[..nwrite];
4075
4076	// write start kind
4077	let nw = self.kind.write_to::<E>(dst)?;
4078	dst = &mut dst[nw..];
4079	// write start byte map
4080	let nw = self.start_map.write_to(dst)?;
4081	dst = &mut dst[nw..];
4082	// write stride
4083	// Unwrap is OK since the stride is always 4 (currently).
4084	E::write_u32(u32::try_from(self.stride).unwrap(), dst);
4085	dst = &mut dst[size_of::<u32>()..];
4086	// write pattern length
4087	// Unwrap is OK since number of patterns is guaranteed to fit in a u32.
4088	E::write_u32(
4089	u32::try_from(self.pattern_len.unwrap_or(`0xFFFF_FFFF`)).unwrap(),
4090	dst,
4091	);
4092	dst = &mut dst[size_of::<u32>()..];
4093	// write universal start unanchored state id, u32::MAX if absent
4094	E::write_u32(
4095	self.universal_start_unanchored
4096	.map_or(u32::MAX, \|sid\| sid.as_u32()),
4097	dst,
4098	);
4099	dst = &mut dst[size_of::<u32>()..];
4100	// write universal start anchored state id, u32::MAX if absent
4101	E::write_u32(
4102	self.universal_start_anchored.map_or(u32::MAX, \|sid\| sid.as_u32()),
4103	dst,
4104	);
4105	dst = &mut dst[size_of::<u32>()..];
4106	// write start IDs
4107	for &sid in self.table() {
4108	let n = wire::write_state_id::<E>(sid, &mut dst);
4109	dst = &mut dst[n..];
4110	}
4111	Ok(nwrite)
4112	}
4113
4114	/// Returns the number of bytes the serialized form of this start ID table
4115	/// will use.
4116	fn write_to_len(&self) -> usize {
4117	self.kind.write_to_len()
4118	+ self.start_map.write_to_len()
4119	+ size_of::<u32>() // stride
4120	+ size_of::<u32>() // # patterns
4121	+ size_of::<u32>() // universal unanchored start
4122	+ size_of::<u32>() // universal anchored start
4123	+ (self.table().len() * StateID::SIZE)
4124	}
4125
4126	/// Validates that every state ID in this start table is valid by checking
4127	/// it against the given transition table (which must be for the same DFA).
4128	///
4129	/// That is, every state ID can be used to correctly index a state.
4130	fn validate(
4131	&self,
4132	tt: &TransitionTable<T>,
4133	) -> Result<(), DeserializeError> {
4134	if !self.universal_start_unanchored.map_or(`true`, \|s\| tt.is_valid(s)) {
4135	return Err(DeserializeError::generic(
4136	"found invalid universal unanchored starting state ID",
4137	));
4138	}
4139	if !self.universal_start_anchored.map_or(`true`, \|s\| tt.is_valid(s)) {
4140	return Err(DeserializeError::generic(
4141	"found invalid universal anchored starting state ID",
4142	));
4143	}
4144	for &id in self.table() {
4145	if !tt.is_valid(id) {
4146	return Err(DeserializeError::generic(
4147	"found invalid starting state ID",
4148	));
4149	}
4150	}
4151	Ok(())
4152	}
4153
4154	/// Converts this start list to a borrowed value.
4155	fn as_ref(&self) -> StartTable<&'_ [u32]> {
4156	StartTable {
4157	table: self.table.as_ref(),
4158	kind: self.kind,
4159	start_map: self.start_map.clone(),
4160	stride: self.stride,
4161	pattern_len: self.pattern_len,
4162	universal_start_unanchored: self.universal_start_unanchored,
4163	universal_start_anchored: self.universal_start_anchored,
4164	}
4165	}
4166
4167	/// Converts this start list to an owned value.
4168	#[cfg(feature = "alloc")]
4169	fn to_owned(&self) -> StartTable<alloc::vec::Vec<u32>> {
4170	StartTable {
4171	table: self.table.as_ref().to_vec(),
4172	kind: self.kind,
4173	start_map: self.start_map.clone(),
4174	stride: self.stride,
4175	pattern_len: self.pattern_len,
4176	universal_start_unanchored: self.universal_start_unanchored,
4177	universal_start_anchored: self.universal_start_anchored,
4178	}
4179	}
4180
4181	/// Return the start state for the given input and starting configuration.
4182	/// This returns an error if the input configuration is not supported by
4183	/// this DFA. For example, requesting an unanchored search when the DFA was
4184	/// not built with unanchored starting states. Or asking for an anchored
4185	/// pattern search with an invalid pattern ID or on a DFA that was not
4186	/// built with start states for each pattern.
4187	#[cfg_attr(feature = "perf-inline", inline(always))]
4188	fn start(
4189	&self,
4190	anchored: Anchored,
4191	start: Start,
4192	) -> Result<StateID, StartError> {
4193	let start_index = start.as_usize();
4194	let index = match anchored {
4195	Anchored::No => {
4196	if !self.kind.has_unanchored() {
4197	return Err(StartError::unsupported_anchored(anchored));
4198	}
4199	start_index
4200	}
4201	Anchored::Yes => {
4202	if !self.kind.has_anchored() {
4203	return Err(StartError::unsupported_anchored(anchored));
4204	}
4205	self.stride + start_index
4206	}
4207	Anchored::Pattern(pid) => {
4208	let len = match self.pattern_len {
4209	None => {
4210	return Err(StartError::unsupported_anchored(anchored))
4211	}
4212	Some(len) => len,
4213	};
4214	if pid.as_usize() >= len {
4215	return Ok(DEAD);
4216	}
4217	(`2` * self.stride)
4218	+ (self.stride * pid.as_usize())
4219	+ start_index
4220	}
4221	};
4222	Ok(self.table()[index])
4223	}
4224
4225	/// Returns an iterator over all start state IDs in this table.
4226	///
4227	/// Each item is a triple of: start state ID, the start state type and the
4228	/// pattern ID (if any).
4229	fn iter(&self) -> StartStateIter<'_> {
4230	StartStateIter { st: self.as_ref(), i: `0` }
4231	}
4232
4233	/// Returns the table as a slice of state IDs.
4234	fn table(&self) -> &[StateID] {
4235	wire::u32s_to_state_ids(self.table.as_ref())
4236	}
4237
4238	/// Return the memory usage, in bytes, of this start list.
4239	///
4240	/// This does not include the size of a `StartList` value itself.
4241	fn memory_usage(&self) -> usize {
4242	self.table().len() * StateID::SIZE
4243	}
4244	}
4245
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u32]>> StartTable<T> {
    /// Records `id` as the start state for the given anchored mode and start
    /// configuration.
    ///
    /// # Panics
    ///
    /// When a pattern ID is given, this panics if start states for each
    /// pattern are not enabled or if the pattern ID is not less than the
    /// number of patterns in this table. It also panics if the computed
    /// position is outside of the table.
    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
        let start_index = start.as_usize();
        let slot = match anchored {
            Anchored::No => start_index,
            Anchored::Yes => self.stride + start_index,
            Anchored::Pattern(pid) => {
                let pid = pid.as_usize();
                let len = self
                    .pattern_len
                    .expect("start states for each pattern enabled");
                assert!(pid < len, "invalid pattern ID {:?}", pid);
                // Skip the two strides for the DFA as a whole, then the
                // strides belonging to the patterns before this one.
                let earlier_patterns = self.stride.checked_mul(pid).unwrap();
                let whole_dfa = self.stride.checked_mul(2).unwrap();
                earlier_patterns
                    .checked_add(whole_dfa)
                    .unwrap()
                    .checked_add(start_index)
                    .unwrap()
            }
        };
        self.table_mut()[slot] = id;
    }

    /// Returns a view of the underlying table as a mutable slice of state
    /// IDs.
    fn table_mut(&mut self) -> &mut [StateID] {
        let raw: &mut [u32] = self.table.as_mut();
        wire::u32s_to_state_ids_mut(raw)
    }
}
4279
/// An iterator over start state IDs.
///
/// This iterator yields a triple of start state ID, the anchored mode and the
/// start state type. If a pattern ID is relevant, then the anchored mode will
/// contain it. Start states with an anchored mode containing a pattern ID will
/// only occur when the DFA was compiled with start states for each pattern
/// (which is disabled by default).
pub(crate) struct StartStateIter<'a> {
    /// A borrowed view of the start table being iterated over.
    st: StartTable<&'a [u32]>,
    /// The position in the start table of the next entry to yield.
    i: usize,
}
4291
4292	impl<'a> Iterator for StartStateIter<'a> {
4293	type Item = (StateID, Anchored, Start);
4294
4295	fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
4296	let i = self.i;
4297	let table = self.st.table();
4298	if i >= table.len() {
4299	return None;
4300	}
4301	self.i += `1`;
4302
4303	// This unwrap is okay since the stride of the starting state table
4304	// must always match the number of start state types.
4305	let start_type = Start::from_usize(i % self.st.stride).unwrap();
4306	let anchored = if i < self.st.stride {
4307	Anchored::No
4308	} else if i < (`2` * self.st.stride) {
4309	Anchored::Yes
4310	} else {
4311	let pid = (i - (`2` * self.st.stride)) / self.st.stride;
4312	Anchored::Pattern(PatternID::new(pid).unwrap())
4313	};
4314	Some((table[i], anchored, start_type))
4315	}
4316	}
4317
/// This type represents the patterns that should be reported whenever a DFA
/// enters a match state. This structure exists to support DFAs that search for
/// matches for multiple regexes.
///
/// This structure relies on the fact that all match states in a DFA occur
/// contiguously in the DFA's transition table. (See dfa/special.rs for a more
/// detailed breakdown of the representation.) Namely, when a match occurs, we
/// know its state ID. Since we know the start and end of the contiguous region
/// of match states, we can use that to compute the position at which the match
/// state occurs. That in turn is used as an offset into this structure.
#[derive(Clone, Debug)]
struct MatchStates<T> {
    /// Slices is a flattened sequence of pairs, where each pair points to a
    /// sub-slice of pattern_ids. The first element of the pair is an offset
    /// into pattern_ids and the second element of the pair is the number
    /// of 32-bit pattern IDs starting at that position. That is, each pair
    /// corresponds to a single DFA match state and its corresponding match
    /// IDs. The number of pairs always corresponds to the number of distinct
    /// DFA match states.
    ///
    /// In practice, T is either Vec<u32> or &[u32].
    slices: T,
    /// A flattened sequence of pattern IDs for each DFA match state. The only
    /// way to correctly read this sequence is indirectly via `slices`.
    ///
    /// In practice, T is either Vec<u32> or &[u32].
    pattern_ids: T,
    /// The total number of unique patterns represented by these match states.
    /// Validation requires every pattern ID in `pattern_ids` to be less than
    /// this value.
    pattern_len: usize,
}
4348
4349	impl<'a> MatchStates<&'a [u32]> {
4350	unsafe fn from_bytes_unchecked(
4351	mut slice: &'a [u8],
4352	) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> {
4353	let slice_start = slice.as_ptr().as_usize();
4354
4355	// Read the total number of match states.
4356	let (state_len, nr) =
4357	wire::try_read_u32_as_usize(slice, "match state length")?;
4358	slice = &slice[nr..];
4359
4360	// Read the slice start/length pairs.
4361	let pair_len = wire::mul(`2`, state_len, "match state offset pairs")?;
4362	let slices_bytes_len = wire::mul(
4363	pair_len,
4364	PatternID::SIZE,
4365	"match state slice offset byte length",
4366	)?;
4367	wire::check_slice_len(slice, slices_bytes_len, "match state slices")?;
4368	wire::check_alignment::<PatternID>(slice)?;
4369	let slices_bytes = &slice[..slices_bytes_len];
4370	slice = &slice[slices_bytes_len..];
4371	// SAFETY: Since PatternID is always representable as a u32, all we
4372	// need to do is ensure that we have the proper length and alignment.
4373	// We've checked both above, so the cast below is safe.
4374	//
4375	// N.B. This is one of the few not-safe snippets in this function,
4376	// so we mark it explicitly to call it out.
4377	let slices = core::slice::from_raw_parts(
4378	slices_bytes.as_ptr().cast::<u32>(),
4379	pair_len,
4380	);
4381
4382	// Read the total number of unique pattern IDs (which is always 1 more
4383	// than the maximum pattern ID in this automaton, since pattern IDs are
4384	// handed out contiguously starting at 0).
4385	let (pattern_len, nr) =
4386	wire::try_read_u32_as_usize(slice, "pattern length")?;
4387	slice = &slice[nr..];
4388
4389	// Now read the pattern ID length. We don't need to store this
4390	// explicitly, but we need it to know how many pattern IDs to read.
4391	let (idlen, nr) =
4392	wire::try_read_u32_as_usize(slice, "pattern ID length")?;
4393	slice = &slice[nr..];
4394
4395	// Read the actual pattern IDs.
4396	let pattern_ids_len =
4397	wire::mul(idlen, PatternID::SIZE, "pattern ID byte length")?;
4398	wire::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
4399	wire::check_alignment::<PatternID>(slice)?;
4400	let pattern_ids_bytes = &slice[..pattern_ids_len];
4401	slice = &slice[pattern_ids_len..];
4402	// SAFETY: Since PatternID is always representable as a u32, all we
4403	// need to do is ensure that we have the proper length and alignment.
4404	// We've checked both above, so the cast below is safe.
4405	//
4406	// N.B. This is one of the few not-safe snippets in this function,
4407	// so we mark it explicitly to call it out.
4408	let pattern_ids = core::slice::from_raw_parts(
4409	pattern_ids_bytes.as_ptr().cast::<u32>(),
4410	idlen,
4411	);
4412
4413	let ms = MatchStates { slices, pattern_ids, pattern_len };
4414	Ok((ms, slice.as_ptr().as_usize() - slice_start))
4415	}
4416	}
4417
#[cfg(feature = "dfa-build")]
impl MatchStates<Vec<u32>> {
    /// Creates match states that contain no match states at all, but which
    /// record the given total number of patterns.
    ///
    /// This panics if `pattern_len` exceeds `PatternID::LIMIT`.
    fn empty(pattern_len: usize) -> MatchStates<Vec<u32>> {
        assert!(pattern_len <= PatternID::LIMIT);
        MatchStates {
            slices: Vec::new(),
            pattern_ids: Vec::new(),
            pattern_len,
        }
    }

    /// Creates match states from a map of match state ID to the pattern IDs
    /// that match in that state. Match states are laid out in the order in
    /// which the map yields them, i.e., in ascending order of state ID.
    ///
    /// An error is returned if the total number of pattern IDs recorded grows
    /// too big for an offset into them to be a valid `PatternID`.
    fn new(
        matches: &BTreeMap<StateID, Vec<PatternID>>,
        pattern_len: usize,
    ) -> Result<MatchStates<Vec<u32>>, BuildError> {
        let mut states = MatchStates::empty(pattern_len);
        for pids in matches.values() {
            // The pattern IDs for this state begin wherever the pattern IDs
            // of the previous state ended.
            let offset = PatternID::new(states.pattern_ids.len())
                .map_err(|_| BuildError::too_many_match_pattern_ids())?;
            states.slices.push(offset.as_u32());
            // This is always correct since the number of patterns in a single
            // match state can never exceed maximum number of allowable
            // patterns. Why? Because a pattern can only appear once in a
            // particular match state, by construction. (And since our pattern
            // ID limit is one less than u32::MAX, we're guaranteed that the
            // length fits in a u32.)
            states.slices.push(u32::try_from(pids.len()).unwrap());
            states.pattern_ids.extend(pids.iter().map(|pid| pid.as_u32()));
        }
        Ok(states)
    }

    /// Creates match states from the given map, reusing the total number of
    /// patterns recorded in `self`.
    fn new_with_map(
        &self,
        matches: &BTreeMap<StateID, Vec<PatternID>>,
    ) -> Result<MatchStates<Vec<u32>>, BuildError> {
        let pattern_len = self.pattern_len;
        MatchStates::new(matches, pattern_len)
    }
}
4456
4457	impl<T: AsRef<[u32]>> MatchStates<T> {
4458	/// Writes a serialized form of these match states to the buffer given. If
4459	/// the buffer is too small, then an error is returned. To determine how
4460	/// big the buffer must be, use `write_to_len`.
4461	fn write_to<E: Endian>(
4462	&self,
4463	mut dst: &mut [u8],
4464	) -> Result<usize, SerializeError> {
4465	let nwrite = self.write_to_len();
4466	if dst.len() < nwrite {
4467	return Err(SerializeError::buffer_too_small("match states"));
4468	}
4469	dst = &mut dst[..nwrite];
4470
4471	// write state ID length
4472	// Unwrap is OK since number of states is guaranteed to fit in a u32.
4473	E::write_u32(u32::try_from(self.len()).unwrap(), dst);
4474	dst = &mut dst[size_of::<u32>()..];
4475
4476	// write slice offset pairs
4477	for &pid in self.slices() {
4478	let n = wire::write_pattern_id::<E>(pid, &mut dst);
4479	dst = &mut dst[n..];
4480	}
4481
4482	// write unique pattern ID length
4483	// Unwrap is OK since number of patterns is guaranteed to fit in a u32.
4484	E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
4485	dst = &mut dst[size_of::<u32>()..];
4486
4487	// write pattern ID length
4488	// Unwrap is OK since we check at construction (and deserialization)
4489	// that the number of patterns is representable as a u32.
4490	E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst);
4491	dst = &mut dst[size_of::<u32>()..];
4492
4493	// write pattern IDs
4494	for &pid in self.pattern_ids() {
4495	let n = wire::write_pattern_id::<E>(pid, &mut dst);
4496	dst = &mut dst[n..];
4497	}
4498
4499	Ok(nwrite)
4500	}
4501
4502	/// Returns the number of bytes the serialized form of these match states
4503	/// will use.
4504	fn write_to_len(&self) -> usize {
4505	size_of::<u32>() // match state length
4506	+ (self.slices().len() * PatternID::SIZE)
4507	+ size_of::<u32>() // unique pattern ID length
4508	+ size_of::<u32>() // pattern ID length
4509	+ (self.pattern_ids().len() * PatternID::SIZE)
4510	}
4511
4512	/// Valides that the match state info is itself internally consistent and
4513	/// consistent with the recorded match state region in the given DFA.
4514	fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
4515	if self.len() != dfa.special.match_len(dfa.stride()) {
4516	return Err(DeserializeError::generic(
4517	"match state length mismatch",
4518	));
4519	}
4520	for si in `0`..self.len() {
4521	let start = self.slices()[si * `2`].as_usize();
4522	let len = self.slices()[si * `2` + `1`].as_usize();
4523	if start >= self.pattern_ids().len() {
4524	return Err(DeserializeError::generic(
4525	"invalid pattern ID start offset",
4526	));
4527	}
4528	if start + len > self.pattern_ids().len() {
4529	return Err(DeserializeError::generic(
4530	"invalid pattern ID length",
4531	));
4532	}
4533	for mi in `0`..len {
4534	let pid = self.pattern_id(si, mi);
4535	if pid.as_usize() >= self.pattern_len {
4536	return Err(DeserializeError::generic(
4537	"invalid pattern ID",
4538	));
4539	}
4540	}
4541	}
4542	Ok(())
4543	}
4544
4545	/// Converts these match states back into their map form. This is useful
4546	/// when shuffling states, as the normal MatchStates representation is not
4547	/// amenable to easy state swapping. But with this map, to swap id1 and
4548	/// id2, all you need to do is:
4549	///
4550	/// if let Some(pids) = map.remove(&id1) {
4551	/// map.insert(id2, pids);
4552	/// }
4553	///
4554	/// Once shuffling is done, use MatchStates::new to convert back.
4555	#[cfg(feature = "dfa-build")]
4556	fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
4557	let mut map = BTreeMap::new();
4558	for i in `0`..self.len() {
4559	let mut pids = vec![];
4560	for j in `0`..self.pattern_len(i) {
4561	pids.push(self.pattern_id(i, j));
4562	}
4563	map.insert(self.match_state_id(dfa, i), pids);
4564	}
4565	map
4566	}
4567
4568	/// Converts these match states to a borrowed value.
4569	fn as_ref(&self) -> MatchStates<&'_ [u32]> {
4570	MatchStates {
4571	slices: self.slices.as_ref(),
4572	pattern_ids: self.pattern_ids.as_ref(),
4573	pattern_len: self.pattern_len,
4574	}
4575	}
4576
4577	/// Converts these match states to an owned value.
4578	#[cfg(feature = "alloc")]
4579	fn to_owned(&self) -> MatchStates<alloc::vec::Vec<u32>> {
4580	MatchStates {
4581	slices: self.slices.as_ref().to_vec(),
4582	pattern_ids: self.pattern_ids.as_ref().to_vec(),
4583	pattern_len: self.pattern_len,
4584	}
4585	}
4586
4587	/// Returns the match state ID given the match state index. (Where the
4588	/// first match state corresponds to index 0.)
4589	///
4590	/// This panics if there is no match state at the given index.
4591	fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID {
4592	assert!(dfa.special.matches(), "no match states to index");
4593	// This is one of the places where we rely on the fact that match
4594	// states are contiguous in the transition table. Namely, that the
4595	// first match state ID always corresponds to dfa.special.min_start.
4596	// From there, since we know the stride, we can compute the ID of any
4597	// match state given its index.
4598	let stride2 = u32::try_from(dfa.stride2()).unwrap();
4599	let offset = index.checked_shl(stride2).unwrap();
4600	let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap();
4601	let sid = StateID::new(id).unwrap();
4602	assert!(dfa.is_match_state(sid));
4603	sid
4604	}
4605
4606	/// Returns the pattern ID at the given match index for the given match
4607	/// state.
4608	///
4609	/// The match state index is the state index minus the state index of the
4610	/// first match state in the DFA.
4611	///
4612	/// The match index is the index of the pattern ID for the given state.
4613	/// The index must be less than `self.pattern_len(state_index)`.
4614	#[cfg_attr(feature = "perf-inline", inline(always))]
4615	fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
4616	self.pattern_id_slice(state_index)[match_index]
4617	}
4618
4619	/// Returns the number of patterns in the given match state.
4620	///
4621	/// The match state index is the state index minus the state index of the
4622	/// first match state in the DFA.
4623	#[cfg_attr(feature = "perf-inline", inline(always))]
4624	fn pattern_len(&self, state_index: usize) -> usize {
4625	self.slices()[state_index * `2` + `1`].as_usize()
4626	}
4627
4628	/// Returns all of the pattern IDs for the given match state index.
4629	///
4630	/// The match state index is the state index minus the state index of the
4631	/// first match state in the DFA.
4632	#[cfg_attr(feature = "perf-inline", inline(always))]
4633	fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] {
4634	let start = self.slices()[state_index * `2`].as_usize();
4635	let len = self.pattern_len(state_index);
4636	&self.pattern_ids()[start..start + len]
4637	}
4638
4639	/// Returns the pattern ID offset slice of u32 as a slice of PatternID.
4640	#[cfg_attr(feature = "perf-inline", inline(always))]
4641	fn slices(&self) -> &[PatternID] {
4642	wire::u32s_to_pattern_ids(self.slices.as_ref())
4643	}
4644
4645	/// Returns the total number of match states.
4646	#[cfg_attr(feature = "perf-inline", inline(always))]
4647	fn len(&self) -> usize {
4648	assert_eq!(`0`, self.slices().len() % `2`);
4649	self.slices().len() / `2`
4650	}
4651
4652	/// Returns the pattern ID slice of u32 as a slice of PatternID.
4653	#[cfg_attr(feature = "perf-inline", inline(always))]
4654	fn pattern_ids(&self) -> &[PatternID] {
4655	wire::u32s_to_pattern_ids(self.pattern_ids.as_ref())
4656	}
4657
4658	/// Return the memory usage, in bytes, of these match pairs.
4659	fn memory_usage(&self) -> usize {
4660	(self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE
4661	}
4662	}
4663
/// A common set of flags for both dense and sparse DFAs. This primarily
/// centralizes the serialization format of these flags as a bitset.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Flags {
    /// Whether the DFA can match the empty string. When this is false, all
    /// matches returned by this DFA are guaranteed to have non-zero length.
    ///
    /// This is bit 0 of the serialized bitset.
    pub(crate) has_empty: bool,
    /// Whether the DFA should only produce matches with spans that correspond
    /// to valid UTF-8. This also includes omitting any zero-width matches that
    /// split the UTF-8 encoding of a codepoint.
    ///
    /// This is bit 1 of the serialized bitset.
    pub(crate) is_utf8: bool,
    /// Whether the DFA is always anchored or not, regardless of `Input`
    /// configuration. This is useful for avoiding a reverse scan even when
    /// executing unanchored searches.
    ///
    /// This is bit 2 of the serialized bitset.
    pub(crate) is_always_start_anchored: bool,
}
4680
4681	impl Flags {
4682	/// Creates a set of flags for a DFA from an NFA.
4683	///
4684	/// N.B. This constructor was defined at the time of writing because all
4685	/// of the flags are derived directly from the NFA. If this changes in the
4686	/// future, we might be more thoughtful about how the `Flags` value is
4687	/// itself built.
4688	#[cfg(feature = "dfa-build")]
4689	fn from_nfa(nfa: &thompson::NFA) -> Flags {
4690	Flags {
4691	has_empty: nfa.has_empty(),
4692	is_utf8: nfa.is_utf8(),
4693	is_always_start_anchored: nfa.is_always_start_anchored(),
4694	}
4695	}
4696
4697	/// Deserializes the flags from the given slice. On success, this also
4698	/// returns the number of bytes read from the slice.
4699	pub(crate) fn from_bytes(
4700	slice: &[u8],
4701	) -> Result<(Flags, usize), DeserializeError> {
4702	let (bits, nread) = wire::try_read_u32(slice, "flag bitset")?;
4703	let flags = Flags {
4704	has_empty: bits & (`1` << `0`) != `0`,
4705	is_utf8: bits & (`1` << `1`) != `0`,
4706	is_always_start_anchored: bits & (`1` << `2`) != `0`,
4707	};
4708	Ok((flags, nread))
4709	}
4710
4711	/// Writes these flags to the given byte slice. If the buffer is too small,
4712	/// then an error is returned. To determine how big the buffer must be,
4713	/// use `write_to_len`.
4714	pub(crate) fn write_to<E: Endian>(
4715	&self,
4716	dst: &mut [u8],
4717	) -> Result<usize, SerializeError> {
4718	fn bool_to_int(b: bool) -> u32 {
4719	if b {
4720	`1`
4721	} else {
4722	`0`
4723	}
4724	}
4725
4726	let nwrite = self.write_to_len();
4727	if dst.len() < nwrite {
4728	return Err(SerializeError::buffer_too_small("flag bitset"));
4729	}
4730	let bits = (bool_to_int(self.has_empty) << `0`)
4731	\| (bool_to_int(self.is_utf8) << `1`)
4732	\| (bool_to_int(self.is_always_start_anchored) << `2`);
4733	E::write_u32(bits, dst);
4734	Ok(nwrite)
4735	}
4736
4737	/// Returns the number of bytes the serialized form of these flags
4738	/// will use.
4739	pub(crate) fn write_to_len(&self) -> usize {
4740	size_of::<u32>()
4741	}
4742	}
4743
/// An iterator over all states in a DFA.
///
/// This iterator yields one [`State`] for each state in the DFA. Each
/// yielded value carries both the state's identifier (see `State::id`) and
/// the state's transitions.
///
/// `'a` corresponds to the lifetime of the original DFA, and `T` corresponds
/// to the type of the transition table itself.
pub(crate) struct StateIter<'a, T> {
    /// The transition table being iterated over. It is used to convert a
    /// chunk index into a state identifier and then into a `State`.
    tt: &'a TransitionTable<T>,
    /// Enumerated chunks of state identifiers. Only the index of each chunk
    /// is consumed when producing the next state.
    it: iter::Enumerate<slice::Chunks<'a, StateID>>,
}
4756
4757	impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> {
4758	type Item = State<'a>;
4759
4760	fn next(&mut self) -> Option<State<'a>> {
4761	self.it.next().map(\|(index, _)\| {
4762	let id = self.tt.to_state_id(index);
4763	self.tt.state(id)
4764	})
4765	}
4766	}
4767
/// An immutable representation of a single DFA state.
///
/// `'a` corresponds to the lifetime of a DFA's transition table.
pub(crate) struct State<'a> {
    /// The identifier of this state, as returned by `State::id`.
    id: StateID,
    /// The number of bits that the `Debug` impl shifts transition targets
    /// to the right by when not using the alternate format.
    /// NOTE(review): presumably log2 of the transition table's stride;
    /// confirm against where `State` values are constructed.
    stride2: usize,
    /// The outgoing transitions of this state. The final entry is reported
    /// as the end-of-input transition when iterating.
    transitions: &'a [StateID],
}
4776
4777	impl<'a> State<'a> {
4778	/// Return an iterator over all transitions in this state. This yields
4779	/// a number of transitions equivalent to the alphabet length of the
4780	/// corresponding DFA.
4781	///
4782	/// Each transition is represented by a tuple. The first element is
4783	/// the input byte for that transition and the second element is the
4784	/// transitions itself.
4785	pub(crate) fn transitions(&self) -> StateTransitionIter<'_> {
4786	StateTransitionIter {
4787	len: self.transitions.len(),
4788	it: self.transitions.iter().enumerate(),
4789	}
4790	}
4791
4792	/// Return an iterator over a sparse representation of the transitions in
4793	/// this state. Only non-dead transitions are returned.
4794	///
4795	/// The "sparse" representation in this case corresponds to a sequence of
4796	/// triples. The first two elements of the triple comprise an inclusive
4797	/// byte range while the last element corresponds to the transition taken
4798	/// for all bytes in the range.
4799	///
4800	/// This is somewhat more condensed than the classical sparse
4801	/// representation (where you have an element for every non-dead
4802	/// transition), but in practice, checking if a byte is in a range is very
4803	/// cheap and using ranges tends to conserve quite a bit more space.
4804	pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> {
4805	StateSparseTransitionIter { dense: self.transitions(), cur: None }
4806	}
4807
4808	/// Returns the identifier for this state.
4809	pub(crate) fn id(&self) -> StateID {
4810	self.id
4811	}
4812
4813	/// Analyzes this state to determine whether it can be accelerated. If so,
4814	/// it returns an accelerator that contains at least one byte.
4815	#[cfg(feature = "dfa-build")]
4816	fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> {
4817	// We just try to add bytes to our accelerator. Once adding fails
4818	// (because we've added too many bytes), then give up.
4819	let mut accel = Accel::new();
4820	for (class, id) in self.transitions() {
4821	if id == self.id() {
4822	continue;
4823	}
4824	for unit in classes.elements(class) {
4825	if let Some(byte) = unit.as_u8() {
4826	if !accel.add(byte) {
4827	return None;
4828	}
4829	}
4830	}
4831	}
4832	if accel.is_empty() {
4833	None
4834	} else {
4835	Some(accel)
4836	}
4837	}
4838	}
4839
4840	impl<'a> fmt::Debug for State<'a> {
4841	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
4842	for (i, (start, end, sid)) in self.sparse_transitions().enumerate() {
4843	let id = if f.alternate() {
4844	sid.as_usize()
4845	} else {
4846	sid.as_usize() >> self.stride2
4847	};
4848	if i > `0` {
4849	write!(f, ", ")?;
4850	}
4851	if start == end {
4852	write!(f, "{:?} => {:?}", start, id)?;
4853	} else {
4854	write!(f, "{:?}-{:?} => {:?}", start, end, id)?;
4855	}
4856	}
4857	Ok(())
4858	}
4859	}
4860
/// An iterator over all transitions in a single DFA state. This yields
/// a number of transitions equivalent to the alphabet length of the
/// corresponding DFA.
///
/// Each transition is represented by a tuple. The first element is the
/// alphabet unit for that transition (the final transition is reported as the
/// special end-of-input unit) and the second element is the transition itself.
#[derive(Debug)]
pub(crate) struct StateTransitionIter<'a> {
    /// The total number of transitions in the state. This is used to
    /// recognize the final transition, which is reported as end-of-input.
    len: usize,
    /// The underlying transitions paired with their position in the state.
    it: iter::Enumerate<slice::Iter<'a, StateID>>,
}
4872
4873	impl<'a> Iterator for StateTransitionIter<'a> {
4874	type Item = (alphabet::Unit, StateID);
4875
4876	fn next(&mut self) -> Option<(alphabet::Unit, StateID)> {
4877	self.it.next().map(\|(i, &id)\| {
4878	let unit = if i + `1` == self.len {
4879	alphabet::Unit::eoi(i)
4880	} else {
4881	let b = u8::try_from(i)
4882	.expect("raw byte alphabet is never exceeded");
4883	alphabet::Unit::u8(b)
4884	};
4885	(unit, id)
4886	})
4887	}
4888	}
4889
/// An iterator over all non-DEAD transitions in a single DFA state using a
/// sparse representation.
///
/// Each transition is represented by a triple. The first two elements of the
/// triple comprise an inclusive byte range while the last element corresponds
/// to the transition taken for all bytes in the range.
///
/// As a convenience, this always returns `alphabet::Unit` values of the same
/// type. That is, you'll never get a (byte, EOI) or a (EOI, byte). Only (byte,
/// byte) and (EOI, EOI) values are yielded.
#[derive(Debug)]
pub(crate) struct StateSparseTransitionIter<'a> {
    /// The dense iterator whose consecutive transitions are merged into
    /// ranges.
    dense: StateTransitionIter<'a>,
    /// The range currently being accumulated, as `(start, end, target)`.
    /// This is `None` before the first transition has been read and after
    /// the final range has been taken.
    cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>,
}
4905
4906	impl<'a> Iterator for StateSparseTransitionIter<'a> {
4907	type Item = (alphabet::Unit, alphabet::Unit, StateID);
4908
4909	fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> {
4910	while let Some((unit, next)) = self.dense.next() {
4911	let (prev_start, prev_end, prev_next) = match self.cur {
4912	Some(t) => t,
4913	None => {
4914	self.cur = Some((unit, unit, next));
4915	continue;
4916	}
4917	};
4918	if prev_next == next && !unit.is_eoi() {
4919	self.cur = Some((prev_start, unit, prev_next));
4920	} else {
4921	self.cur = Some((unit, unit, next));
4922	if prev_next != DEAD {
4923	return Some((prev_start, prev_end, prev_next));
4924	}
4925	}
4926	}
4927	if let Some((start, end, next)) = self.cur.take() {
4928	if next != DEAD {
4929	return Some((start, end, next));
4930	}
4931	}
4932	None
4933	}
4934	}
4935
/// An error that occurred during the construction of a DFA.
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
///
/// * Obtain a human readable message via its `std::fmt::Display` impl.
/// * Access an underlying
/// [`nfa::thompson::BuildError`](thompson::BuildError)
/// type from its `source` method via the `std::error::Error` trait. This error
/// only occurs when using convenience routines for building a DFA directly
/// from a pattern string.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug)]
pub struct BuildError {
    /// The specific kind of failure that occurred.
    kind: BuildErrorKind,
}
4954
/// The kind of error that occurred during the construction of a DFA.
///
/// Note that this error is non-exhaustive. Adding new variants is not
/// considered a breaking change.
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug)]
enum BuildErrorKind {
    /// An error that occurred while constructing an NFA as a precursor step
    /// before a DFA is compiled.
    NFA(thompson::BuildError),
    /// An error that occurred because an unsupported regex feature was used.
    /// The message string describes which unsupported feature was used.
    ///
    /// The primary regex feature that is unsupported by DFAs is the Unicode
    /// word boundary look-around assertion (`\b`). This can be worked around
    /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling
    /// Unicode word boundaries when building a DFA.
    Unsupported(&'static str),
    /// An error that occurs if too many states are produced while building a
    /// DFA.
    TooManyStates,
    /// An error that occurs if too many start states are needed while building
    /// a DFA.
    ///
    /// This is a kind of oddball error that occurs when building a DFA with
    /// start states enabled for each pattern and enough patterns to cause
    /// the table of start states to overflow `usize`.
    TooManyStartStates,
    /// This is another oddball error that can occur if there are too many
    /// patterns spread out across too many match states.
    TooManyMatchPatternIDs,
    /// An error that occurs if the DFA got too big during determinization.
    ///
    /// `limit` is the size limit that was exceeded; it is included in the
    /// error message.
    DFAExceededSizeLimit { limit: usize },
    /// An error that occurs if auxiliary storage (not the DFA) used during
    /// determinization got too big.
    ///
    /// `limit` is the size limit that was exceeded; it is included in the
    /// error message.
    DeterminizeExceededSizeLimit { limit: usize },
}
4992
#[cfg(feature = "dfa-build")]
impl BuildError {
    /// Return the kind of this error.
    fn kind(&self) -> &BuildErrorKind {
        &self.kind
    }

    /// Wraps an error that occurred while building the underlying NFA.
    pub(crate) fn nfa(err: thompson::BuildError) -> BuildError {
        let kind = BuildErrorKind::NFA(err);
        BuildError { kind }
    }

    /// Creates the error reported when a regex uses a Unicode word boundary,
    /// which DFAs do not support.
    pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
        let kind = BuildErrorKind::Unsupported(
            "cannot build DFAs for regexes with Unicode word \
             boundaries; switch to ASCII word boundaries, or \
             heuristically enable Unicode word boundaries or use a \
             different regex engine",
        );
        BuildError { kind }
    }

    /// Creates the error reported when too many DFA states are produced.
    pub(crate) fn too_many_states() -> BuildError {
        let kind = BuildErrorKind::TooManyStates;
        BuildError { kind }
    }

    /// Creates the error reported when too many start states are needed.
    pub(crate) fn too_many_start_states() -> BuildError {
        let kind = BuildErrorKind::TooManyStartStates;
        BuildError { kind }
    }

    /// Creates the error reported when there are too many pattern IDs
    /// across all match states.
    pub(crate) fn too_many_match_pattern_ids() -> BuildError {
        let kind = BuildErrorKind::TooManyMatchPatternIDs;
        BuildError { kind }
    }

    /// Creates the error reported when the DFA itself grows beyond `limit`
    /// during determinization.
    pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> BuildError {
        let kind = BuildErrorKind::DFAExceededSizeLimit { limit };
        BuildError { kind }
    }

    /// Creates the error reported when auxiliary storage used during
    /// determinization grows beyond `limit`.
    pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> BuildError {
        let kind = BuildErrorKind::DeterminizeExceededSizeLimit { limit };
        BuildError { kind }
    }
}
5034
#[cfg(all(feature = "std", feature = "dfa-build"))]
impl std::error::Error for BuildError {
    /// Returns the underlying NFA build error when this error wraps one,
    /// and `None` for every other kind of error.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        if let BuildErrorKind::NFA(err) = self.kind() {
            Some(err)
        } else {
            None
        }
    }
}
5044
#[cfg(feature = "dfa-build")]
impl core::fmt::Display for BuildError {
    /// Writes a human readable description of this error.
    ///
    /// For errors that wrap an NFA build error, only a generic message is
    /// written here; the wrapped error is exposed through `source` when the
    /// `std` feature is enabled.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self.kind() {
            BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
            BuildErrorKind::Unsupported(ref msg) => {
                write!(f, "unsupported regex feature for DFAs: {}", msg)
            }
            BuildErrorKind::TooManyStates => write!(
                f,
                "number of DFA states exceeds limit of {}",
                StateID::LIMIT,
            ),
            BuildErrorKind::TooManyStartStates => {
                let stride = Start::len();
                // The start table has `stride` entries for starting states for
                // the entire DFA, and then `stride` entries for each pattern
                // if start states for each pattern are enabled (which is the
                // only way this error can occur). Thus, the total number of
                // patterns that can fit in the table is `stride` less than
                // what we can allocate.
                let max = usize::try_from(core::isize::MAX).unwrap();
                let limit = (max - stride) / stride;
                // The message previously repeated the word "pattern".
                write!(
                    f,
                    "compiling DFA with start states exceeds pattern \
                     limit of {}",
                    limit,
                )
            }
            BuildErrorKind::TooManyMatchPatternIDs => write!(
                f,
                "compiling DFA with total patterns in all match states \
                 exceeds limit of {}",
                PatternID::LIMIT,
            ),
            BuildErrorKind::DFAExceededSizeLimit { limit } => write!(
                f,
                "DFA exceeded size limit of {:?} during determinization",
                limit,
            ),
            BuildErrorKind::DeterminizeExceededSizeLimit { limit } => {
                write!(f, "determinization exceeded size limit of {:?}", limit)
            }
        }
    }
}
5092
#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    use crate::{Input, MatchError};

    use super::*;

    // A Unicode word boundary must be rejected by the default builder.
    #[test]
    fn errors_with_unicode_word_boundary() {
        let result = Builder::new().build(r"\b");
        assert!(result.is_err());
    }

    // A DFA that never matches must still never match after being
    // serialized and deserialized.
    #[test]
    fn roundtrip_never_match() {
        let original = DFA::never_match().unwrap();
        let (bytes, _) = original.to_bytes_native_endian();
        let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes).unwrap().0;

        let input = Input::new("foo12345");
        assert_eq!(None, dfa.try_search_fwd(&input).unwrap());
    }

    // A DFA that always matches must still report an empty match at the
    // start of the haystack after being serialized and deserialized.
    #[test]
    fn roundtrip_always_match() {
        use crate::HalfMatch;

        let original = DFA::always_match().unwrap();
        let (bytes, _) = original.to_bytes_native_endian();
        let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes).unwrap().0;

        let input = Input::new("foo12345");
        assert_eq!(
            Some(HalfMatch::must(0, 0)),
            dfa.try_search_fwd(&input).unwrap()
        );
    }

    // See the analogous test in src/hybrid/dfa.rs.
    #[test]
    fn heuristic_unicode_reverse() {
        let config = DFA::config().unicode_word_boundary(true);
        let nfa_config = thompson::Config::new().reverse(true);
        let dfa = DFA::builder()
            .configure(config)
            .thompson(nfa_config)
            .build(r"\b[0-9]+\b")
            .unwrap();

        // The search range starts right after the two-byte encoding of 'β',
        // so the reverse search is expected to quit on its second byte.
        let input = Input::new("β123").range(2..);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(MatchError::quit(0xB2, 1)), got);

        // The search range ends right before 'β', so the search is expected
        // to quit on its first byte.
        let input = Input::new("123β").range(..3);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(MatchError::quit(0xCE, 3)), got);
    }
}
5148

Provided by KDAB

Definitions