sparse.rs - Codebrowser

/*!
Types and routines specific to sparse DFAs.

This module is the home of [`sparse::DFA`](DFA).

Unlike the [`dense`] module, this module does not contain a builder or
configuration specific for sparse DFAs. Instead, the intended way to build a
sparse DFA is either by using a default configuration with its constructor
[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a
dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`].
For example, this configures a sparse DFA to do an overlapping search:

```
use regex_automata::{
    dfa::{Automaton, OverlappingState, dense},
    HalfMatch, Input, MatchKind,
};

let dense_re = dense::Builder::new()
    .configure(dense::Config::new().match_kind(MatchKind::All))
    .build(r"Samwise|Sam")?;
let sparse_re = dense_re.to_sparse()?;

// Setup our haystack and initial start state.
let input = Input::new("Samwise");
let mut state = OverlappingState::start();

// First, 'Sam' will match.
sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match());

// And now 'Samwise' will match.
sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match());
# Ok::<(), Box<dyn std::error::Error>>(())
```
*/
38
39	#[cfg(feature = "dfa-build")]
40	use core::iter;
41	use core::{
42	convert::{TryFrom, TryInto},
43	fmt,
44	mem::size_of,
45	};
46
47	#[cfg(feature = "dfa-build")]
48	use alloc::{vec, vec::Vec};
49
50	#[cfg(feature = "dfa-build")]
51	use crate::dfa::dense::{self, BuildError};
52	use crate::{
53	dfa::{
54	automaton::{fmt_state_indicator, Automaton, StartError},
55	dense::Flags,
56	special::Special,
57	StartKind, DEAD,
58	},
59	util::{
60	alphabet::{ByteClasses, ByteSet},
61	escape::DebugByte,
62	int::{Pointer, Usize, U16, U32},
63	prefilter::Prefilter,
64	primitives::{PatternID, StateID},
65	search::Anchored,
66	start::{self, Start, StartByteMap},
67	wire::{self, DeserializeError, Endian, SerializeError},
68	},
69	};
70
/// The label written at the very start of every serialized sparse DFA.
const LABEL: &str = "rust-regex-automata-dfa-sparse";
/// The serialization format version written right after the endianness
/// check.
const VERSION: u32 = 2;
73
74	/// A sparse deterministic finite automaton (DFA) with variable sized states.
75	///
76	/// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient
77	/// representation for its transitions. Consequently, sparse DFAs may use much
78	/// less memory than dense DFAs, but this comes at a price. In particular,
79	/// reading the more space efficient transitions takes more work, and
80	/// consequently, searching using a sparse DFA is typically slower than a dense
81	/// DFA.
82	///
83	/// A sparse DFA can be built using the default configuration via the
84	/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a
85	/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse
86	/// DFA using [`dense::DFA::to_sparse`].
87	///
88	/// In general, a sparse DFA supports all the same search operations as a dense
89	/// DFA.
90	///
91	/// Making the choice between a dense and sparse DFA depends on your specific
92	/// work load. If you can sacrifice a bit of search time performance, then a
93	/// sparse DFA might be the best choice. In particular, while sparse DFAs are
94	/// probably always slower than dense DFAs, you may find that they are easily
95	/// fast enough for your purposes!
96	///
97	/// # Type parameters
98	///
99	/// A `DFA` has one type parameter, `T`, which is used to represent the parts
100	/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
101	///
102	/// # The `Automaton` trait
103	///
104	/// This type implements the [`Automaton`] trait, which means it can be used
105	/// for searching. For example:
106	///
107	/// ```
108	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
109	///
110	/// let dfa = DFA::new("foo[0-9]+")?;
111	/// let expected = Some(HalfMatch::must(`0`, `8`));
112	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
113	/// # Ok::<(), Box<dyn std::error::Error>>(())
114	/// ```
115	#[derive(Clone)]
116	pub struct DFA<T> {
117	// When compared to a dense DFA, a sparse DFA looks* a lot simpler*
118	// representation-wise. In reality, it is perhaps more complicated. Namely,
119	// in a dense DFA, all information needs to be very cheaply accessible
120	// using only state IDs. In a sparse DFA however, each state uses a
121	// variable amount of space because each state encodes more information
122	// than just its transitions. Each state also includes an accelerator if
123	// one exists, along with the matching pattern IDs if the state is a match
124	// state.
125	//
126	// That is, a lot of the complexity is pushed down into how each state
127	// itself is represented.
128	tt: Transitions<T>,
129	st: StartTable<T>,
130	special: Special,
131	pre: Option<Prefilter>,
132	quitset: ByteSet,
133	flags: Flags,
134	}
135
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Compile a single regular expression into a sparse DFA using the
    /// default configuration.
    ///
    /// The pattern is first compiled to a dense DFA, which is then converted
    /// to its sparse form. For a non-default configuration, set up a
    /// [`dense::Builder`] yourself and then call [`dense::DFA::to_sparse`]
    /// on the result.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
    ///
    /// let expected = Some(HalfMatch::must(0, 11));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::Builder::new().build(pattern)?;
        dense.to_sparse()
    }

    /// Compile several regular expressions into one sparse multi-DFA using
    /// the default configuration.
    ///
    /// The patterns are first compiled to a dense DFA, which is then
    /// converted to its sparse form. For a non-default configuration, set up
    /// a [`dense::Builder`] yourself and then call
    /// [`dense::DFA::to_sparse`] on the result.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let expected = Some(HalfMatch::must(1, 3));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense = dense::Builder::new().build_many(patterns)?;
        dense.to_sparse()
    }
}
189
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Create a new DFA that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = sparse::DFA::always_match()?;
    ///
    /// let expected = Some(HalfMatch::must(0, 0));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::always_match()?.to_sparse()
    }

    /// Create a new sparse DFA that never matches any input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, Input};
    ///
    /// let dfa = sparse::DFA::never_match()?;
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::never_match()?.to_sparse()
    }

    /// The implementation for constructing a sparse DFA from a dense DFA.
    ///
    /// # Errors
    ///
    /// Returns `BuildError::too_many_states()` when the byte offset of a
    /// state in the sparse transition table cannot be represented as a
    /// `StateID`. Errors from building the start table are propagated as-is.
    ///
    /// # Panics
    ///
    /// Panics if a state has more than 257 transitions, or if
    /// `sparse_transitions` ever groups the EOI unit with a byte unit. Both
    /// would indicate a bug rather than bad input.
    pub(crate) fn from_dense<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        // In order to build the transition table, we need to be able to write
        // state identifiers for each of the "next" transitions in each state.
        // Our state identifiers correspond to the byte offset in the
        // transition table at which the state is encoded. Therefore, we do not
        // actually know what the state identifiers are until we've allocated
        // exactly as much space as we need for each state. Thus, construction
        // of the transition table happens in two passes.
        //
        // In the first pass, we fill out the shell of each state, which
        // includes the transition length, the input byte ranges and
        // zero-filled space for the transitions and accelerators, if present.
        // In this first pass, we also build up a map from the state identifier
        // index of the dense DFA to the state identifier in this sparse DFA.
        //
        // In the second pass, we fill in the transitions based on the map
        // built in the first pass.

        // The capacity given here reflects a minimum. (Well, the true minimum
        // is likely even bigger, but hopefully this saves a few reallocs.)
        let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len());
        // This maps state indices from the dense DFA to StateIDs in the sparse
        // DFA. We build out this map on the first pass, and then use it in the
        // second pass to back-fill our transitions.
        let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()];
        for state in dfa.states() {
            let pos = sparse.len();
            // Needed twice below: once for the match bit in the transition
            // length and once to decide whether pattern IDs are written.
            let is_match = dfa.is_match_state(state.id());

            remap[dfa.to_index(state.id())] = StateID::new(pos)
                .map_err(|_| BuildError::too_many_states())?;
            // zero-filled space for the transition length
            sparse.push(0);
            sparse.push(0);

            let mut transition_len = 0;
            for (unit1, unit2, _) in state.sparse_transitions() {
                match (unit1.as_u8(), unit2.as_u8()) {
                    (Some(b1), Some(b2)) => {
                        transition_len += 1;
                        sparse.push(b1);
                        sparse.push(b2);
                    }
                    (None, None) => {}
                    (Some(_), None) | (None, Some(_)) => {
                        // can never occur because sparse_transitions never
                        // groups EOI with any other transition.
                        unreachable!()
                    }
                }
            }
            // Add dummy EOI transition. This is never actually read while
            // searching, but having space equivalent to the total number
            // of transitions is convenient. Otherwise, we'd need to track
            // a different number of transitions for the byte ranges as for
            // the 'next' states.
            //
            // N.B. The loop above is not guaranteed to yield the EOI
            // transition, since it may point to a DEAD state. By putting
            // it here, we always write the EOI transition, and thus
            // guarantee that our transition length is >0. Why do we always
            // need the EOI transition? Because in order to implement
            // Automaton::next_eoi_state, this lets us just ask for the last
            // transition. There are probably other/better ways to do this.
            transition_len += 1;
            sparse.push(0);
            sparse.push(0);

            // Check some assumptions about transition length.
            assert_ne!(
                transition_len, 0,
                "transition length should be non-zero",
            );
            assert!(
                transition_len <= 257,
                "expected transition length {} to be <= 257",
                transition_len,
            );

            // Fill in the transition length.
            // Since transition length is always <= 257, we use the most
            // significant bit to indicate whether this is a match state or
            // not.
            let ntrans = if is_match {
                transition_len | (1 << 15)
            } else {
                transition_len
            };
            wire::NE::write_u16(ntrans, &mut sparse[pos..]);

            // zero-fill the actual transitions.
            // Unwraps are OK since transition_length <= 257 and our minimum
            // support usize size is 16-bits.
            let zeros = usize::try_from(transition_len)
                .unwrap()
                .checked_mul(StateID::SIZE)
                .unwrap();
            sparse.extend(iter::repeat(0).take(zeros));

            // If this is a match state, write the pattern IDs matched by this
            // state.
            if is_match {
                let plen = dfa.match_pattern_len(state.id());
                // Write the actual pattern IDs with a u32 length prefix.
                // First, zero-fill space.
                let mut pos = sparse.len();
                // Unwraps are OK since it's guaranteed that plen <=
                // PatternID::LIMIT, which is in turn guaranteed to fit into a
                // u32.
                let zeros = size_of::<u32>()
                    .checked_mul(plen)
                    .unwrap()
                    .checked_add(size_of::<u32>())
                    .unwrap();
                sparse.extend(iter::repeat(0).take(zeros));

                // Now write the length prefix.
                wire::NE::write_u32(
                    // Will never fail since u32::MAX is invalid pattern ID.
                    // Thus, the number of pattern IDs is representable by a
                    // u32.
                    plen.try_into().expect("pattern ID length fits in u32"),
                    &mut sparse[pos..],
                );
                pos += size_of::<u32>();

                // Now write the pattern IDs.
                for &pid in dfa.pattern_id_slice(state.id()) {
                    pos += wire::write_pattern_id::<wire::NE>(
                        pid,
                        &mut sparse[pos..],
                    );
                }
            }

            // And now add the accelerator, if one exists. An accelerator is
            // at most 4 bytes and at least 1 byte. The first byte is the
            // length, N. N bytes follow the length. The set of bytes that
            // follow correspond (exhaustively) to the bytes that must be seen
            // to leave this state.
            let accel = dfa.accelerator(state.id());
            sparse.push(accel.len().try_into().unwrap());
            sparse.extend_from_slice(accel);
        }

        let mut new = DFA {
            tt: Transitions {
                sparse,
                classes: dfa.byte_classes().clone(),
                state_len: dfa.state_len(),
                pattern_len: dfa.pattern_len(),
            },
            st: StartTable::from_dense_dfa(dfa, &remap)?,
            special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
            pre: dfa.get_prefilter().cloned(),
            quitset: dfa.quitset().clone(),
            flags: dfa.flags().clone(),
        };
        // And here's our second pass. Iterate over all of the dense states
        // again, and update the transitions in each of the states in the
        // sparse DFA.
        for old_state in dfa.states() {
            let new_id = remap[dfa.to_index(old_state.id())];
            let mut new_state = new.tt.state_mut(new_id);
            let sparse = old_state.sparse_transitions();
            for (i, (_, _, next)) in sparse.enumerate() {
                let next = remap[dfa.to_index(next)];
                new_state.set_next_at(i, next);
            }
        }
        debug!(
            "created sparse DFA, memory usage: {} (dense memory usage: {})",
            new.memory_usage(),
            dfa.memory_usage(),
        );
        Ok(new)
    }
}
409
410	impl<T: AsRef<[u8]>> DFA<T> {
411	/// Cheaply return a borrowed version of this sparse DFA. Specifically, the
412	/// DFA returned always uses `&[u8]` for its transitions.
413	pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
414	DFA {
415	tt: self.tt.as_ref(),
416	st: self.st.as_ref(),
417	special: self.special,
418	pre: self.pre.clone(),
419	quitset: self.quitset,
420	flags: self.flags,
421	}
422	}
423
424	/// Return an owned version of this sparse DFA. Specifically, the DFA
425	/// returned always uses `Vec<u8>` for its transitions.
426	///
427	/// Effectively, this returns a sparse DFA whose transitions live on the
428	/// heap.
429	#[cfg(feature = "alloc")]
430	pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> {
431	DFA {
432	tt: self.tt.to_owned(),
433	st: self.st.to_owned(),
434	special: self.special,
435	pre: self.pre.clone(),
436	quitset: self.quitset,
437	flags: self.flags,
438	}
439	}
440
441	/// Returns the starting state configuration for this DFA.
442	///
443	/// The default is [`StartKind::Both`], which means the DFA supports both
444	/// unanchored and anchored searches. However, this can generally lead to
445	/// bigger DFAs. Therefore, a DFA might be compiled with support for just
446	/// unanchored or anchored searches. In that case, running a search with
447	/// an unsupported configuration will panic.
448	pub fn start_kind(&self) -> StartKind {
449	self.st.kind
450	}
451
452	/// Returns true only if this DFA has starting states for each pattern.
453	///
454	/// When a DFA has starting states for each pattern, then a search with the
455	/// DFA can be configured to only look for anchored matches of a specific
456	/// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can
457	/// accept a [`Anchored::Pattern`] if and only if this method returns true.
458	/// Otherwise, an error will be returned.
459	///
460	/// Note that if the DFA is empty, this always returns false.
461	pub fn starts_for_each_pattern(&self) -> bool {
462	self.st.pattern_len.is_some()
463	}
464
465	/// Returns the equivalence classes that make up the alphabet for this DFA.
466	///
467	/// Unless [`dense::Config::byte_classes`] was disabled, it is possible
468	/// that multiple distinct bytes are grouped into the same equivalence
469	/// class if it is impossible for them to discriminate between a match and
470	/// a non-match. This has the effect of reducing the overall alphabet size
471	/// and in turn potentially substantially reducing the size of the DFA's
472	/// transition table.
473	///
474	/// The downside of using equivalence classes like this is that every state
475	/// transition will automatically use this map to convert an arbitrary
476	/// byte to its corresponding equivalence class. In practice this has a
477	/// negligible impact on performance.
478	pub fn byte_classes(&self) -> &ByteClasses {
479	&self.tt.classes
480	}
481
482	/// Returns the memory usage, in bytes, of this DFA.
483	///
484	/// The memory usage is computed based on the number of bytes used to
485	/// represent this DFA.
486	///
487	/// This does not* include the stack size used up by this DFA. To*
488	/// compute that, use `std::mem::size_of::<sparse::DFA>()`.
489	pub fn memory_usage(&self) -> usize {
490	self.tt.memory_usage() + self.st.memory_usage()
491	}
492	}
493
494	/// Routines for converting a sparse DFA to other representations, such as raw
495	/// bytes suitable for persistent storage.
496	impl<T: AsRef<[u8]>> DFA<T> {
497	/// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
498	/// format.
499	///
500	/// The written bytes are guaranteed to be deserialized correctly and
501	/// without errors in a semver compatible release of this crate by a
502	/// `DFA`'s deserialization APIs (assuming all other criteria for the
503	/// deserialization APIs has been satisfied):
504	///
505	/// [`DFA::from_bytes`]*
506	/// [`DFA::from_bytes_unchecked`]*
507	///
508	/// Note that unlike a [`dense::DFA`]'s serialization methods, this does
509	/// not add any initial padding to the returned bytes. Padding isn't
510	/// required for sparse DFAs since they have no alignment requirements.
511	///
512	/// # Example
513	///
514	/// This example shows how to serialize and deserialize a DFA:
515	///
516	/// ```
517	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
518	///
519	/// // Compile our original DFA.
520	/// let original_dfa = DFA::new("foo[0-9]+")?;
521	///
522	/// // N.B. We use native endianness here to make the example work, but
523	/// // using to_bytes_little_endian would work on a little endian target.
524	/// let buf = original_dfa.to_bytes_native_endian();
525	/// // Even if buf has initial padding, DFA::from_bytes will automatically
526	/// // ignore it.
527	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
528	///
529	/// let expected = Some(HalfMatch::must(`0`, `8`));
530	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
531	/// # Ok::<(), Box<dyn std::error::Error>>(())
532	/// ```
533	#[cfg(feature = "dfa-build")]
534	pub fn to_bytes_little_endian(&self) -> Vec<u8> {
535	self.to_bytes::<wire::LE>()
536	}
537
538	/// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
539	/// format.
540	///
541	/// The written bytes are guaranteed to be deserialized correctly and
542	/// without errors in a semver compatible release of this crate by a
543	/// `DFA`'s deserialization APIs (assuming all other criteria for the
544	/// deserialization APIs has been satisfied):
545	///
546	/// [`DFA::from_bytes`]*
547	/// [`DFA::from_bytes_unchecked`]*
548	///
549	/// Note that unlike a [`dense::DFA`]'s serialization methods, this does
550	/// not add any initial padding to the returned bytes. Padding isn't
551	/// required for sparse DFAs since they have no alignment requirements.
552	///
553	/// # Example
554	///
555	/// This example shows how to serialize and deserialize a DFA:
556	///
557	/// ```
558	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
559	///
560	/// // Compile our original DFA.
561	/// let original_dfa = DFA::new("foo[0-9]+")?;
562	///
563	/// // N.B. We use native endianness here to make the example work, but
564	/// // using to_bytes_big_endian would work on a big endian target.
565	/// let buf = original_dfa.to_bytes_native_endian();
566	/// // Even if buf has initial padding, DFA::from_bytes will automatically
567	/// // ignore it.
568	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
569	///
570	/// let expected = Some(HalfMatch::must(`0`, `8`));
571	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
572	/// # Ok::<(), Box<dyn std::error::Error>>(())
573	/// ```
574	#[cfg(feature = "dfa-build")]
575	pub fn to_bytes_big_endian(&self) -> Vec<u8> {
576	self.to_bytes::<wire::BE>()
577	}
578
579	/// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
580	/// format.
581	///
582	/// The written bytes are guaranteed to be deserialized correctly and
583	/// without errors in a semver compatible release of this crate by a
584	/// `DFA`'s deserialization APIs (assuming all other criteria for the
585	/// deserialization APIs has been satisfied):
586	///
587	/// [`DFA::from_bytes`]*
588	/// [`DFA::from_bytes_unchecked`]*
589	///
590	/// Note that unlike a [`dense::DFA`]'s serialization methods, this does
591	/// not add any initial padding to the returned bytes. Padding isn't
592	/// required for sparse DFAs since they have no alignment requirements.
593	///
594	/// Generally speaking, native endian format should only be used when
595	/// you know that the target you're compiling the DFA for matches the
596	/// endianness of the target on which you're compiling DFA. For example,
597	/// if serialization and deserialization happen in the same process or on
598	/// the same machine. Otherwise, when serializing a DFA for use in a
599	/// portable environment, you'll almost certainly want to serialize _both_
600	/// a little endian and a big endian version and then load the correct one
601	/// based on the target's configuration.
602	///
603	/// # Example
604	///
605	/// This example shows how to serialize and deserialize a DFA:
606	///
607	/// ```
608	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
609	///
610	/// // Compile our original DFA.
611	/// let original_dfa = DFA::new("foo[0-9]+")?;
612	///
613	/// let buf = original_dfa.to_bytes_native_endian();
614	/// // Even if buf has initial padding, DFA::from_bytes will automatically
615	/// // ignore it.
616	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
617	///
618	/// let expected = Some(HalfMatch::must(`0`, `8`));
619	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
620	/// # Ok::<(), Box<dyn std::error::Error>>(())
621	/// ```
622	#[cfg(feature = "dfa-build")]
623	pub fn to_bytes_native_endian(&self) -> Vec<u8> {
624	self.to_bytes::<wire::NE>()
625	}
626
627	/// The implementation of the public `to_bytes` serialization methods,
628	/// which is generic over endianness.
629	#[cfg(feature = "dfa-build")]
630	fn to_bytes<E: Endian>(&self) -> Vec<u8> {
631	let mut buf = vec![`0`; self.write_to_len()];
632	// This should always succeed since the only possible serialization
633	// error is providing a buffer that's too small, but we've ensured that
634	// `buf` is big enough here.
635	self.write_to::<E>(&mut buf).unwrap();
636	buf
637	}
638
639	/// Serialize this DFA as raw bytes to the given slice, in little endian
640	/// format. Upon success, the total number of bytes written to `dst` is
641	/// returned.
642	///
643	/// The written bytes are guaranteed to be deserialized correctly and
644	/// without errors in a semver compatible release of this crate by a
645	/// `DFA`'s deserialization APIs (assuming all other criteria for the
646	/// deserialization APIs has been satisfied):
647	///
648	/// [`DFA::from_bytes`]*
649	/// [`DFA::from_bytes_unchecked`]*
650	///
651	/// # Errors
652	///
653	/// This returns an error if the given destination slice is not big enough
654	/// to contain the full serialized DFA. If an error occurs, then nothing
655	/// is written to `dst`.
656	///
657	/// # Example
658	///
659	/// This example shows how to serialize and deserialize a DFA without
660	/// dynamic memory allocation.
661	///
662	/// ```
663	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
664	///
665	/// // Compile our original DFA.
666	/// let original_dfa = DFA::new("foo[0-9]+")?;
667	///
668	/// // Create a 4KB buffer on the stack to store our serialized DFA.
669	/// let mut buf = [`0u8`; `4` * (`1`<<`10`)];
670	/// // N.B. We use native endianness here to make the example work, but
671	/// // using write_to_little_endian would work on a little endian target.
672	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
673	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
674	///
675	/// let expected = Some(HalfMatch::must(`0`, `8`));
676	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
677	/// # Ok::<(), Box<dyn std::error::Error>>(())
678	/// ```
679	pub fn write_to_little_endian(
680	&self,
681	dst: &mut [u8],
682	) -> Result<usize, SerializeError> {
683	self.write_to::<wire::LE>(dst)
684	}
685
686	/// Serialize this DFA as raw bytes to the given slice, in big endian
687	/// format. Upon success, the total number of bytes written to `dst` is
688	/// returned.
689	///
690	/// The written bytes are guaranteed to be deserialized correctly and
691	/// without errors in a semver compatible release of this crate by a
692	/// `DFA`'s deserialization APIs (assuming all other criteria for the
693	/// deserialization APIs has been satisfied):
694	///
695	/// [`DFA::from_bytes`]*
696	/// [`DFA::from_bytes_unchecked`]*
697	///
698	/// # Errors
699	///
700	/// This returns an error if the given destination slice is not big enough
701	/// to contain the full serialized DFA. If an error occurs, then nothing
702	/// is written to `dst`.
703	///
704	/// # Example
705	///
706	/// This example shows how to serialize and deserialize a DFA without
707	/// dynamic memory allocation.
708	///
709	/// ```
710	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
711	///
712	/// // Compile our original DFA.
713	/// let original_dfa = DFA::new("foo[0-9]+")?;
714	///
715	/// // Create a 4KB buffer on the stack to store our serialized DFA.
716	/// let mut buf = [`0u8`; `4` * (`1`<<`10`)];
717	/// // N.B. We use native endianness here to make the example work, but
718	/// // using write_to_big_endian would work on a big endian target.
719	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
720	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
721	///
722	/// let expected = Some(HalfMatch::must(`0`, `8`));
723	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
724	/// # Ok::<(), Box<dyn std::error::Error>>(())
725	/// ```
726	pub fn write_to_big_endian(
727	&self,
728	dst: &mut [u8],
729	) -> Result<usize, SerializeError> {
730	self.write_to::<wire::BE>(dst)
731	}
732
733	/// Serialize this DFA as raw bytes to the given slice, in native endian
734	/// format. Upon success, the total number of bytes written to `dst` is
735	/// returned.
736	///
737	/// The written bytes are guaranteed to be deserialized correctly and
738	/// without errors in a semver compatible release of this crate by a
739	/// `DFA`'s deserialization APIs (assuming all other criteria for the
740	/// deserialization APIs has been satisfied):
741	///
742	/// [`DFA::from_bytes`]*
743	/// [`DFA::from_bytes_unchecked`]*
744	///
745	/// Generally speaking, native endian format should only be used when
746	/// you know that the target you're compiling the DFA for matches the
747	/// endianness of the target on which you're compiling DFA. For example,
748	/// if serialization and deserialization happen in the same process or on
749	/// the same machine. Otherwise, when serializing a DFA for use in a
750	/// portable environment, you'll almost certainly want to serialize _both_
751	/// a little endian and a big endian version and then load the correct one
752	/// based on the target's configuration.
753	///
754	/// # Errors
755	///
756	/// This returns an error if the given destination slice is not big enough
757	/// to contain the full serialized DFA. If an error occurs, then nothing
758	/// is written to `dst`.
759	///
760	/// # Example
761	///
762	/// This example shows how to serialize and deserialize a DFA without
763	/// dynamic memory allocation.
764	///
765	/// ```
766	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
767	///
768	/// // Compile our original DFA.
769	/// let original_dfa = DFA::new("foo[0-9]+")?;
770	///
771	/// // Create a 4KB buffer on the stack to store our serialized DFA.
772	/// let mut buf = [`0u8`; `4` * (`1`<<`10`)];
773	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
774	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
775	///
776	/// let expected = Some(HalfMatch::must(`0`, `8`));
777	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
778	/// # Ok::<(), Box<dyn std::error::Error>>(())
779	/// ```
780	pub fn write_to_native_endian(
781	&self,
782	dst: &mut [u8],
783	) -> Result<usize, SerializeError> {
784	self.write_to::<wire::NE>(dst)
785	}
786
787	/// The implementation of the public `write_to` serialization methods,
788	/// which is generic over endianness.
789	fn write_to<E: Endian>(
790	&self,
791	dst: &mut [u8],
792	) -> Result<usize, SerializeError> {
793	let mut nw = `0`;
794	nw += wire::write_label(LABEL, &mut dst[nw..])?;
795	nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
796	nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
797	nw += {
798	// Currently unused, intended for future flexibility
799	E::write_u32(`0`, &mut dst[nw..]);
800	size_of::<u32>()
801	};
802	nw += self.flags.write_to::<E>(&mut dst[nw..])?;
803	nw += self.tt.write_to::<E>(&mut dst[nw..])?;
804	nw += self.st.write_to::<E>(&mut dst[nw..])?;
805	nw += self.special.write_to::<E>(&mut dst[nw..])?;
806	nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
807	Ok(nw)
808	}
809
810	/// Return the total number of bytes required to serialize this DFA.
811	///
812	/// This is useful for determining the size of the buffer required to pass
813	/// to one of the serialization routines:
814	///
815	/// [`DFA::write_to_little_endian`]*
816	/// [`DFA::write_to_big_endian`]*
817	/// [`DFA::write_to_native_endian`]*
818	///
819	/// Passing a buffer smaller than the size returned by this method will
820	/// result in a serialization error.
821	///
822	/// # Example
823	///
824	/// This example shows how to dynamically allocate enough room to serialize
825	/// a sparse DFA.
826	///
827	/// ```
828	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
829	///
830	/// // Compile our original DFA.
831	/// let original_dfa = DFA::new("foo[0-9]+")?;
832	///
833	/// let mut buf = vec![`0`; original_dfa.write_to_len()];
834	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
835	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
836	///
837	/// let expected = Some(HalfMatch::must(`0`, `8`));
838	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
839	/// # Ok::<(), Box<dyn std::error::Error>>(())
840	/// ```
841	pub fn write_to_len(&self) -> usize {
842	wire::write_label_len(LABEL)
843	+ wire::write_endianness_check_len()
844	+ wire::write_version_len()
845	+ size_of::<u32>() // unused, intended for future flexibility
846	+ self.flags.write_to_len()
847	+ self.tt.write_to_len()
848	+ self.st.write_to_len()
849	+ self.special.write_to_len()
850	+ self.quitset.write_to_len()
851	}
852	}
853
854	impl<'a> DFA<&'a [u8]> {
855	/// Safely deserialize a sparse DFA with a specific state identifier
856	/// representation. Upon success, this returns both the deserialized DFA
857	/// and the number of bytes read from the given slice. Namely, the contents
858	/// of the slice beyond the DFA are not read.
859	///
860	/// Deserializing a DFA using this routine will never allocate heap memory.
861	/// For safety purposes, the DFA's transitions will be verified such that
862	/// every transition points to a valid state. If this verification is too
863	/// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
864	/// will always execute in constant time.
865	///
866	/// The bytes given must be generated by one of the serialization APIs
867	/// of a `DFA` using a semver compatible release of this crate. Those
868	/// include:
869	///
    /// * [`DFA::to_bytes_little_endian`]
    /// * [`DFA::to_bytes_big_endian`]
    /// * [`DFA::to_bytes_native_endian`]
    /// * [`DFA::write_to_little_endian`]
    /// * [`DFA::write_to_big_endian`]
    /// * [`DFA::write_to_native_endian`]
876	///
877	/// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
878	/// `write_to` methods do not allocate and write to an existing slice
879	/// (which may be on the stack). Since deserialization always uses the
880	/// native endianness of the target platform, the serialization API you use
881	/// should match the endianness of the target platform. (It's often a good
882	/// idea to generate serialized DFAs for both forms of endianness and then
883	/// load the correct one based on endianness.)
884	///
885	/// # Errors
886	///
887	/// Generally speaking, it's easier to state the conditions in which an
888	/// error is _not_ returned. All of the following must be true:
889	///
    /// * The bytes given must be produced by one of the serialization APIs
    /// on this DFA, as mentioned above.
    /// * The endianness of the target platform matches the endianness used to
    /// serialize the provided DFA.
894	///
895	/// If any of the above are not true, then an error will be returned.
896	///
897	/// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse
898	/// DFA has no alignment requirements. That is, an alignment of `1` is
899	/// valid.
900	///
901	/// # Panics
902	///
903	/// This routine will never panic for any input.
904	///
905	/// # Example
906	///
907	/// This example shows how to serialize a DFA to raw bytes, deserialize it
908	/// and then use it for searching.
909	///
910	/// ```
911	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
912	///
913	/// let initial = DFA::new("foo[0-9]+")?;
914	/// let bytes = initial.to_bytes_native_endian();
915	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
916	///
917	/// let expected = Some(HalfMatch::must(`0`, `8`));
918	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
919	/// # Ok::<(), Box<dyn std::error::Error>>(())
920	/// ```
921	///
922	/// # Example: loading a DFA from static memory
923	///
924	/// One use case this library supports is the ability to serialize a
925	/// DFA to disk and then use `include_bytes!` to store it in a compiled
926	/// Rust program. Those bytes can then be cheaply deserialized into a
927	/// `DFA` structure at runtime and used for searching without having to
928	/// re-compile the DFA (which can be quite costly).
929	///
930	/// We can show this in two parts. The first part is serializing the DFA to
931	/// a file:
932	///
933	/// ```no_run
934	/// use regex_automata::dfa::sparse::DFA;
935	///
936	/// let dfa = DFA::new("foo[0-9]+")?;
937	///
938	/// // Write a big endian serialized version of this DFA to a file.
939	/// let bytes = dfa.to_bytes_big_endian();
940	/// std::fs::write("foo.bigendian.dfa", &bytes)?;
941	///
942	/// // Do it again, but this time for little endian.
943	/// let bytes = dfa.to_bytes_little_endian();
944	/// std::fs::write("foo.littleendian.dfa", &bytes)?;
945	/// # Ok::<(), Box<dyn std::error::Error>>(())
946	/// ```
947	///
948	/// And now the second part is embedding the DFA into the compiled program
949	/// and deserializing it at runtime on first use. We use conditional
950	/// compilation to choose the correct endianness. We do not need to employ
951	/// any special tricks to ensure a proper alignment, since a sparse DFA has
952	/// no alignment requirements.
953	///
954	/// ```no_run
955	/// use regex_automata::{
956	/// dfa::{Automaton, sparse::DFA},
957	/// util::lazy::Lazy,
958	/// HalfMatch, Input,
959	/// };
960	///
    /// // This crate provides its own "lazy" type, kind of like
    /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
    /// // no-std environments and lets us write this using completely
    /// // safe code.
    /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
966	/// # const _: &str = stringify! {
967	/// #[cfg(target_endian = "big")]
968	/// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
969	/// #[cfg(target_endian = "little")]
970	/// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
971	/// # };
972	/// # static BYTES: &[u8] = b"";
973	///
974	/// let (dfa, _) = DFA::from_bytes(BYTES)
975	/// .expect("serialized DFA should be valid");
976	/// dfa
977	/// });
978	///
979	/// let expected = Ok(Some(HalfMatch::must(`0`, `8`)));
980	/// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
981	/// ```
982	///
983	/// Alternatively, consider using
984	/// [`lazy_static`](https://crates.io/crates/lazy_static)
985	/// or
986	/// [`once_cell`](https://crates.io/crates/once_cell),
987	/// which will guarantee safety for you.
988	pub fn from_bytes(
989	slice: &'a [u8],
990	) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
991	// SAFETY: This is safe because we validate both the sparse transitions
992	// (by trying to decode every state) and start state ID list below. If
993	// either validation fails, then we return an error.
994	let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
995	let seen = dfa.tt.validate(&dfa.special)?;
996	dfa.st.validate(&dfa.special, &seen)?;
997	// N.B. dfa.special doesn't have a way to do unchecked deserialization,
998	// so it has already been validated.
999	Ok((dfa, nread))
1000	}
1001
1002	/// Deserialize a DFA with a specific state identifier representation in
1003	/// constant time by omitting the verification of the validity of the
1004	/// sparse transitions.
1005	///
    /// This is just like [`DFA::from_bytes`], except it can potentially return
    /// a DFA that exhibits undefined behavior if its transitions contain
    /// invalid state identifiers.
1009	///
1010	/// This routine is useful if you need to deserialize a DFA cheaply and
1011	/// cannot afford the transition validation performed by `from_bytes`.
1012	///
1013	/// # Safety
1014	///
1015	/// This routine is not safe because it permits callers to provide
1016	/// arbitrary transitions with possibly incorrect state identifiers. While
1017	/// the various serialization routines will never return an incorrect
1018	/// DFA, there is no guarantee that the bytes provided here are correct.
1019	/// While `from_bytes_unchecked` will still do several forms of basic
1020	/// validation, this routine does not check that the transitions themselves
1021	/// are correct. Given an incorrect transition table, it is possible for
1022	/// the search routines to access out-of-bounds memory because of explicit
1023	/// bounds check elision.
1024	///
1025	/// # Example
1026	///
1027	/// ```
1028	/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
1029	///
1030	/// let initial = DFA::new("foo[0-9]+")?;
1031	/// let bytes = initial.to_bytes_native_endian();
1032	/// // SAFETY: This is guaranteed to be safe since the bytes given come
1033	/// // directly from a compatible serialization routine.
1034	/// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
1035	///
1036	/// let expected = Some(HalfMatch::must(`0`, `8`));
1037	/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
1038	/// # Ok::<(), Box<dyn std::error::Error>>(())
1039	/// ```
1040	pub unsafe fn from_bytes_unchecked(
1041	slice: &'a [u8],
1042	) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1043	let mut nr = `0`;
1044
1045	nr += wire::read_label(&slice[nr..], LABEL)?;
1046	nr += wire::read_endianness_check(&slice[nr..])?;
1047	nr += wire::read_version(&slice[nr..], VERSION)?;
1048
1049	let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
1050	nr += size_of::<u32>();
1051
1052	let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
1053	nr += nread;
1054
1055	let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
1056	nr += nread;
1057
1058	let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
1059	nr += nread;
1060
1061	let (special, nread) = Special::from_bytes(&slice[nr..])?;
1062	nr += nread;
1063	if special.max.as_usize() >= tt.sparse().len() {
1064	return Err(DeserializeError::generic(
1065	"max should not be greater than or equal to sparse bytes",
1066	));
1067	}
1068
1069	let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
1070	nr += nread;
1071
1072	// Prefilters don't support serialization, so they're always absent.
1073	let pre = None;
1074	Ok((DFA { tt, st, special, pre, quitset, flags }, nr))
1075	}
1076	}
1077
1078	impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
1079	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1080	writeln!(f, "sparse::DFA(")?;
1081	for state in self.tt.states() {
1082	fmt_state_indicator(f, self, state.id())?;
1083	writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?;
1084	}
1085	writeln!(f, "")?;
1086	for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() {
1087	if i % self.st.stride == `0` {
1088	match anchored {
1089	Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
1090	Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
1091	Anchored::Pattern(pid) => writeln!(
1092	f,
1093	"START_GROUP(pattern: {:?})",
1094	pid.as_usize()
1095	)?,
1096	}
1097	}
1098	writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
1099	}
1100	writeln!(f, "state length: {:?}", self.tt.state_len)?;
1101	writeln!(f, "pattern length: {:?}", self.pattern_len())?;
1102	writeln!(f, "flags: {:?}", self.flags)?;
1103	writeln!(f, ")")?;
1104	Ok(())
1105	}
1106	}
1107
1108	// SAFETY: We assert that our implementation of each method is correct.
1109	unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
1110	#[inline]
1111	fn is_special_state(&self, id: StateID) -> bool {
1112	self.special.is_special_state(id)
1113	}
1114
1115	#[inline]
1116	fn is_dead_state(&self, id: StateID) -> bool {
1117	self.special.is_dead_state(id)
1118	}
1119
1120	#[inline]
1121	fn is_quit_state(&self, id: StateID) -> bool {
1122	self.special.is_quit_state(id)
1123	}
1124
1125	#[inline]
1126	fn is_match_state(&self, id: StateID) -> bool {
1127	self.special.is_match_state(id)
1128	}
1129
1130	#[inline]
1131	fn is_start_state(&self, id: StateID) -> bool {
1132	self.special.is_start_state(id)
1133	}
1134
1135	#[inline]
1136	fn is_accel_state(&self, id: StateID) -> bool {
1137	self.special.is_accel_state(id)
1138	}
1139
1140	// This is marked as inline to help dramatically boost sparse searching,
1141	// which decodes each state it enters to follow the next transition.
1142	#[cfg_attr(feature = "perf-inline", inline(always))]
1143	fn next_state(&self, current: StateID, input: u8) -> StateID {
1144	let input = self.tt.classes.get(input);
1145	self.tt.state(current).next(input)
1146	}
1147
1148	#[inline]
1149	unsafe fn next_state_unchecked(
1150	&self,
1151	current: StateID,
1152	input: u8,
1153	) -> StateID {
1154	self.next_state(current, input)
1155	}
1156
1157	#[inline]
1158	fn next_eoi_state(&self, current: StateID) -> StateID {
1159	self.tt.state(current).next_eoi()
1160	}
1161
1162	#[inline]
1163	fn pattern_len(&self) -> usize {
1164	self.tt.pattern_len
1165	}
1166
1167	#[inline]
1168	fn match_len(&self, id: StateID) -> usize {
1169	self.tt.state(id).pattern_len()
1170	}
1171
1172	#[inline]
1173	fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
1174	// This is an optimization for the very common case of a DFA with a
1175	// single pattern. This conditional avoids a somewhat more costly path
1176	// that finds the pattern ID from the state machine, which requires
1177	// a bit of slicing/pointer-chasing. This optimization tends to only
1178	// matter when matches are frequent.
1179	if self.tt.pattern_len == `1` {
1180	return PatternID::ZERO;
1181	}
1182	self.tt.state(id).pattern_id(match_index)
1183	}
1184
1185	#[inline]
1186	fn has_empty(&self) -> bool {
1187	self.flags.has_empty
1188	}
1189
1190	#[inline]
1191	fn is_utf8(&self) -> bool {
1192	self.flags.is_utf8
1193	}
1194
1195	#[inline]
1196	fn is_always_start_anchored(&self) -> bool {
1197	self.flags.is_always_start_anchored
1198	}
1199
1200	#[inline]
1201	fn start_state(
1202	&self,
1203	config: &start::Config,
1204	) -> Result<StateID, StartError> {
1205	let anchored = config.get_anchored();
1206	let start = match config.get_look_behind() {
1207	None => Start::Text,
1208	Some(byte) => {
1209	if !self.quitset.is_empty() && self.quitset.contains(byte) {
1210	return Err(StartError::quit(byte));
1211	}
1212	self.st.start_map.get(byte)
1213	}
1214	};
1215	self.st.start(anchored, start)
1216	}
1217
1218	#[inline]
1219	fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
1220	match mode {
1221	Anchored::No => self.st.universal_start_unanchored,
1222	Anchored::Yes => self.st.universal_start_anchored,
1223	Anchored::Pattern(_) => None,
1224	}
1225	}
1226
1227	#[inline]
1228	fn accelerator(&self, id: StateID) -> &[u8] {
1229	self.tt.state(id).accelerator()
1230	}
1231
1232	#[inline]
1233	fn get_prefilter(&self) -> Option<&Prefilter> {
1234	self.pre.as_ref()
1235	}
1236	}
1237
/// The transition table portion of a sparse DFA.
///
/// The transition table is the core part of the DFA in that it describes how
/// to move from one state to another based on the input sequence observed.
///
/// Unlike a typical dense table based DFA, states in a sparse transition
/// table have variable size. That is, states with more transitions use more
/// space than states with fewer transitions. This means that finding the next
/// transition takes more work than with a dense DFA, but also typically uses
/// much less space.
#[derive(Clone)]
struct Transitions<T> {
    /// The raw encoding of each state in this DFA.
    ///
    /// Each state has the following information:
    ///
    /// * A set of transitions to subsequent states. Transitions to the dead
    ///   state are omitted.
    /// * If the state can be accelerated, then any additional accelerator
    ///   information.
    /// * If the state is a match state, then the state contains all pattern
    ///   IDs that match when in that state.
    ///
    /// To decode a state, use `Transitions::state`.
    ///
    /// In practice, `T` is either `Vec<u8>` or `&[u8]`.
    sparse: T,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to a
    /// single character in this DFA's alphabet, where the maximum number of
    /// characters is 257 (each possible value of a byte plus the special
    /// EOI transition). Consequently, the number of equivalence classes
    /// corresponds to the number of transitions for each DFA state. Note
    /// though that the *space* used by each DFA state in the transition table
    /// may be larger. The total space used by each DFA state is known as the
    /// stride and is documented above.
    ///
    /// The only time the number of equivalence classes is fewer than 257 is
    /// if the DFA's kind uses byte classes which is the default. Equivalence
    /// classes should generally only be disabled when debugging, so that
    /// the transitions themselves aren't obscured. Disabling them has no
    /// other benefit, since the equivalence class map is always used while
    /// searching. In the vast majority of cases, the number of equivalence
    /// classes is substantially smaller than 257, particularly when large
    /// Unicode classes aren't used.
    ///
    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
    /// in the current implementation, since equivalence classes generally tend
    /// to correspond to continuous ranges of bytes that map to the same
    /// transition. So in a sparse DFA, equivalence classes don't really lead
    /// to a space savings. In the future, it would be good to try and remove
    /// them from sparse DFAs entirely, but requires a bit of work since sparse
    /// DFAs are built from dense DFAs, which are in turn built on top of
    /// equivalence classes.
    classes: ByteClasses,
    /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even the empty DFA. In particular,
    /// the dead state always has ID 0 and is correspondingly always the first
    /// state. The dead state is never a match state.
    state_len: usize,
    /// The total number of unique patterns represented by these match states.
    pattern_len: usize,
}
1302
1303	impl<'a> Transitions<&'a [u8]> {
1304	unsafe fn from_bytes_unchecked(
1305	mut slice: &'a [u8],
1306	) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
1307	let slice_start = slice.as_ptr().as_usize();
1308
1309	let (state_len, nr) =
1310	wire::try_read_u32_as_usize(&slice, "state length")?;
1311	slice = &slice[nr..];
1312
1313	let (pattern_len, nr) =
1314	wire::try_read_u32_as_usize(&slice, "pattern length")?;
1315	slice = &slice[nr..];
1316
1317	let (classes, nr) = ByteClasses::from_bytes(&slice)?;
1318	slice = &slice[nr..];
1319
1320	let (len, nr) =
1321	wire::try_read_u32_as_usize(&slice, "sparse transitions length")?;
1322	slice = &slice[nr..];
1323
1324	wire::check_slice_len(slice, len, "sparse states byte length")?;
1325	let sparse = &slice[..len];
1326	slice = &slice[len..];
1327
1328	let trans = Transitions { sparse, classes, state_len, pattern_len };
1329	Ok((trans, slice.as_ptr().as_usize() - slice_start))
1330	}
1331	}
1332
1333	impl<T: AsRef<[u8]>> Transitions<T> {
1334	/// Writes a serialized form of this transition table to the buffer given.
1335	/// If the buffer is too small, then an error is returned. To determine
1336	/// how big the buffer must be, use `write_to_len`.
1337	fn write_to<E: Endian>(
1338	&self,
1339	mut dst: &mut [u8],
1340	) -> Result<usize, SerializeError> {
1341	let nwrite = self.write_to_len();
1342	if dst.len() < nwrite {
1343	return Err(SerializeError::buffer_too_small(
1344	"sparse transition table",
1345	));
1346	}
1347	dst = &mut dst[..nwrite];
1348
1349	// write state length
1350	E::write_u32(u32::try_from(self.state_len).unwrap(), dst);
1351	dst = &mut dst[size_of::<u32>()..];
1352
1353	// write pattern length
1354	E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
1355	dst = &mut dst[size_of::<u32>()..];
1356
1357	// write byte class map
1358	let n = self.classes.write_to(dst)?;
1359	dst = &mut dst[n..];
1360
1361	// write number of bytes in sparse transitions
1362	E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
1363	dst = &mut dst[size_of::<u32>()..];
1364
1365	// write actual transitions
1366	let mut id = DEAD;
1367	while id.as_usize() < self.sparse().len() {
1368	let state = self.state(id);
1369	let n = state.write_to::<E>(&mut dst)?;
1370	dst = &mut dst[n..];
1371	// The next ID is the offset immediately following `state`.
1372	id = StateID::new(id.as_usize() + state.write_to_len()).unwrap();
1373	}
1374	Ok(nwrite)
1375	}
1376
1377	/// Returns the number of bytes the serialized form of this transition
1378	/// table will use.
1379	fn write_to_len(&self) -> usize {
1380	size_of::<u32>() // state length
1381	+ size_of::<u32>() // pattern length
1382	+ self.classes.write_to_len()
1383	+ size_of::<u32>() // sparse transitions length
1384	+ self.sparse().len()
1385	}
1386
1387	/// Validates that every state ID in this transition table is valid.
1388	///
1389	/// That is, every state ID can be used to correctly index a state in this
1390	/// table.
1391	fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
1392	let mut verified = Seen::new();
1393	// We need to make sure that we decode the correct number of states.
1394	// Otherwise, an empty set of transitions would validate even if the
1395	// recorded state length is non-empty.
1396	let mut len = `0`;
1397	// We can't use the self.states() iterator because it assumes the state
1398	// encodings are valid. It could panic if they aren't.
1399	let mut id = DEAD;
1400	while id.as_usize() < self.sparse().len() {
1401	// Before we even decode the state, we check that the ID itself
1402	// is well formed. That is, if it's a special state then it must
1403	// actually be a quit, dead, accel, match or start state.
1404	if sp.is_special_state(id) {
1405	let is_actually_special = sp.is_dead_state(id)
1406	\|\| sp.is_quit_state(id)
1407	\|\| sp.is_match_state(id)
1408	\|\| sp.is_start_state(id)
1409	\|\| sp.is_accel_state(id);
1410	if !is_actually_special {
1411	// This is kind of a cryptic error message...
1412	return Err(DeserializeError::generic(
1413	"found sparse state tagged as special but \
1414	wasn't actually special",
1415	));
1416	}
1417	}
1418	let state = self.try_state(sp, id)?;
1419	verified.insert(id);
1420	// The next ID should be the offset immediately following `state`.
1421	id = StateID::new(wire::add(
1422	id.as_usize(),
1423	state.write_to_len(),
1424	"next state ID offset",
1425	)?)
1426	.map_err(\|err\| {
1427	DeserializeError::state_id_error(err, "next state ID offset")
1428	})?;
1429	len += `1`;
1430	}
1431	// Now that we've checked that all top-level states are correct and
1432	// importantly, collected a set of valid state IDs, we have all the
1433	// information we need to check that all transitions are correct too.
1434	//
1435	// Note that we can't use `valid_ids` to iterate because it will
1436	// be empty in no-std no-alloc contexts. (And yes, that means our
1437	// verification isn't quite as good.) We can use `self.states()`
1438	// though at least, since we know that all states can at least be
1439	// decoded and traversed correctly.
1440	for state in self.states() {
1441	// Check that all transitions in this state are correct.
1442	for i in `0`..state.ntrans {
1443	let to = state.next_at(i);
1444	// For no-alloc, we just check that the state can decode. It is
1445	// technically possible that the state ID could still point to
1446	// a non-existent state even if it decodes (fuzzing proved this
1447	// to be true), but it shouldn't result in any memory unsafety
1448	// or panics in non-debug mode.
1449	#[cfg(not(feature = "alloc"))]
1450	{
1451	let _ = self.try_state(sp, to)?;
1452	}
1453	#[cfg(feature = "alloc")]
1454	{
1455	if !verified.contains(&to) {
1456	return Err(DeserializeError::generic(
1457	"found transition that points to a \
1458	non-existent state",
1459	));
1460	}
1461	}
1462	}
1463	}
1464	if len != self.state_len {
1465	return Err(DeserializeError::generic(
1466	"mismatching sparse state length",
1467	));
1468	}
1469	Ok(verified)
1470	}
1471
1472	/// Converts these transitions to a borrowed value.
1473	fn as_ref(&self) -> Transitions<&'_ [u8]> {
1474	Transitions {
1475	sparse: self.sparse(),
1476	classes: self.classes.clone(),
1477	state_len: self.state_len,
1478	pattern_len: self.pattern_len,
1479	}
1480	}
1481
1482	/// Converts these transitions to an owned value.
1483	#[cfg(feature = "alloc")]
1484	fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> {
1485	Transitions {
1486	sparse: self.sparse().to_vec(),
1487	classes: self.classes.clone(),
1488	state_len: self.state_len,
1489	pattern_len: self.pattern_len,
1490	}
1491	}
1492
1493	/// Return a convenient representation of the given state.
1494	///
1495	/// This panics if the state is invalid.
1496	///
1497	/// This is marked as inline to help dramatically boost sparse searching,
1498	/// which decodes each state it enters to follow the next transition. Other
1499	/// functions involved are also inlined, which should hopefully eliminate
1500	/// a lot of the extraneous decoding that is never needed just to follow
1501	/// the next transition.
1502	#[cfg_attr(feature = "perf-inline", inline(always))]
1503	fn state(&self, id: StateID) -> State<'_> {
1504	let mut state = &self.sparse()[id.as_usize()..];
1505	let mut ntrans = wire::read_u16(&state).as_usize();
1506	let is_match = (`1` << `15`) & ntrans != `0`;
1507	ntrans &= !(`1` << `15`);
1508	state = &state[`2`..];
1509
1510	let (input_ranges, state) = state.split_at(ntrans * `2`);
1511	let (next, state) = state.split_at(ntrans * StateID::SIZE);
1512	let (pattern_ids, state) = if is_match {
1513	let npats = wire::read_u32(&state).as_usize();
1514	state[`4`..].split_at(npats * `4`)
1515	} else {
1516	(&[][..], state)
1517	};
1518
1519	let accel_len = usize::from(state[`0`]);
1520	let accel = &state[`1`..accel_len + `1`];
1521	State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
1522	}
1523
1524	/// Like `state`, but will return an error if the state encoding is
1525	/// invalid. This is useful for verifying states after deserialization,
1526	/// which is required for a safe deserialization API.
1527	///
1528	/// Note that this only verifies that this state is decodable and that
1529	/// all of its data is consistent. It does not verify that its state ID
1530	/// transitions point to valid states themselves, nor does it verify that
1531	/// every pattern ID is valid.
1532	fn try_state(
1533	&self,
1534	sp: &Special,
1535	id: StateID,
1536	) -> Result<State<'_>, DeserializeError> {
1537	if id.as_usize() > self.sparse().len() {
1538	return Err(DeserializeError::generic(
1539	"invalid caller provided sparse state ID",
1540	));
1541	}
1542	let mut state = &self.sparse()[id.as_usize()..];
1543	// Encoding format starts with a u16 that stores the total number of
1544	// transitions in this state.
1545	let (mut ntrans, _) =
1546	wire::try_read_u16_as_usize(state, "state transition length")?;
1547	let is_match = ((`1` << `15`) & ntrans) != `0`;
1548	ntrans &= !(`1` << `15`);
1549	state = &state[`2`..];
1550	if ntrans > `257` \|\| ntrans == `0` {
1551	return Err(DeserializeError::generic(
1552	"invalid transition length",
1553	));
1554	}
1555	if is_match && !sp.is_match_state(id) {
1556	return Err(DeserializeError::generic(
1557	"state marked as match but not in match ID range",
1558	));
1559	} else if !is_match && sp.is_match_state(id) {
1560	return Err(DeserializeError::generic(
1561	"state in match ID range but not marked as match state",
1562	));
1563	}
1564
1565	// Each transition has two pieces: an inclusive range of bytes on which
1566	// it is defined, and the state ID that those bytes transition to. The
1567	// pairs come first, followed by a corresponding sequence of state IDs.
1568	let input_ranges_len = ntrans.checked_mul(`2`).unwrap();
1569	wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
1570	let (input_ranges, state) = state.split_at(input_ranges_len);
1571	// Every range should be of the form A-B, where A<=B.
1572	for pair in input_ranges.chunks(`2`) {
1573	let (start, end) = (pair[`0`], pair[`1`]);
1574	if start > end {
1575	return Err(DeserializeError::generic("invalid input range"));
1576	}
1577	}
1578
1579	// And now extract the corresponding sequence of state IDs. We leave
1580	// this sequence as a &[u8] instead of a &[S] because sparse DFAs do
1581	// not have any alignment requirements.
1582	let next_len = ntrans
1583	.checked_mul(self.id_len())
1584	.expect("state size * #trans should always fit in a usize");
1585	wire::check_slice_len(state, next_len, "sparse trans state IDs")?;
1586	let (next, state) = state.split_at(next_len);
1587	// We can at least verify that every state ID is in bounds.
1588	for idbytes in next.chunks(self.id_len()) {
1589	let (id, _) =
1590	wire::read_state_id(idbytes, "sparse state ID in try_state")?;
1591	wire::check_slice_len(
1592	self.sparse(),
1593	id.as_usize(),
1594	"invalid sparse state ID",
1595	)?;
1596	}
1597
1598	// If this is a match state, then read the pattern IDs for this state.
1599	// Pattern IDs is a u32-length prefixed sequence of native endian
1600	// encoded 32-bit integers.
1601	let (pattern_ids, state) = if is_match {
1602	let (npats, nr) =
1603	wire::try_read_u32_as_usize(state, "pattern ID length")?;
1604	let state = &state[nr..];
1605	if npats == `0` {
1606	return Err(DeserializeError::generic(
1607	"state marked as a match, but pattern length is zero",
1608	));
1609	}
1610
1611	let pattern_ids_len =
1612	wire::mul(npats, `4`, "sparse pattern ID byte length")?;
1613	wire::check_slice_len(
1614	state,
1615	pattern_ids_len,
1616	"sparse pattern IDs",
1617	)?;
1618	let (pattern_ids, state) = state.split_at(pattern_ids_len);
1619	for patbytes in pattern_ids.chunks(PatternID::SIZE) {
1620	wire::read_pattern_id(
1621	patbytes,
1622	"sparse pattern ID in try_state",
1623	)?;
1624	}
1625	(pattern_ids, state)
1626	} else {
1627	(&[][..], state)
1628	};
1629	if is_match && pattern_ids.is_empty() {
1630	return Err(DeserializeError::generic(
1631	"state marked as a match, but has no pattern IDs",
1632	));
1633	}
1634	if sp.is_match_state(id) && pattern_ids.is_empty() {
1635	return Err(DeserializeError::generic(
1636	"state marked special as a match, but has no pattern IDs",
1637	));
1638	}
1639	if sp.is_match_state(id) != is_match {
1640	return Err(DeserializeError::generic(
1641	"whether state is a match or not is inconsistent",
1642	));
1643	}
1644
1645	// Now read this state's accelerator info. The first byte is the length
1646	// of the accelerator, which is typically 0 (for no acceleration) but
1647	// is no bigger than 3. The length indicates the number of bytes that
1648	// follow, where each byte corresponds to a transition out of this
1649	// state.
1650	if state.is_empty() {
1651	return Err(DeserializeError::generic("no accelerator length"));
1652	}
1653	let (accel_len, state) = (usize::from(state[`0`]), &state[`1`..]);
1654
1655	if accel_len > `3` {
1656	return Err(DeserializeError::generic(
1657	"sparse invalid accelerator length",
1658	));
1659	} else if accel_len == `0` && sp.is_accel_state(id) {
1660	return Err(DeserializeError::generic(
1661	"got no accelerators in state, but in accelerator ID range",
1662	));
1663	} else if accel_len > `0` && !sp.is_accel_state(id) {
1664	return Err(DeserializeError::generic(
1665	"state in accelerator ID range, but has no accelerators",
1666	));
1667	}
1668
1669	wire::check_slice_len(
1670	state,
1671	accel_len,
1672	"sparse corrupt accelerator length",
1673	)?;
1674	let (accel, _) = (&state[..accel_len], &state[accel_len..]);
1675
1676	let state = State {
1677	id,
1678	is_match,
1679	ntrans,
1680	input_ranges,
1681	next,
1682	pattern_ids,
1683	accel,
1684	};
1685	if sp.is_quit_state(state.next_at(state.ntrans - `1`)) {
1686	return Err(DeserializeError::generic(
1687	"state with EOI transition to quit state is illegal",
1688	));
1689	}
1690	Ok(state)
1691	}
1692
1693	/// Return an iterator over all of the states in this DFA.
1694	///
1695	/// The iterator returned yields tuples, where the first element is the
1696	/// state ID and the second element is the state itself.
1697	fn states(&self) -> StateIter<'_, T> {
1698	StateIter { trans: self, id: DEAD.as_usize() }
1699	}
1700
1701	/// Returns the sparse transitions as raw bytes.
1702	fn sparse(&self) -> &[u8] {
1703	self.sparse.as_ref()
1704	}
1705
1706	/// Returns the number of bytes represented by a single state ID.
1707	fn id_len(&self) -> usize {
1708	StateID::SIZE
1709	}
1710
1711	/// Return the memory usage, in bytes, of these transitions.
1712	///
1713	/// This does not include the size of a `Transitions` value itself.
1714	fn memory_usage(&self) -> usize {
1715	self.sparse().len()
1716	}
1717	}
1718
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> Transitions<T> {
    /// Decodes the state stored at `id` into a view that permits in-place
    /// mutation of its transitions.
    ///
    /// This panics if the state is invalid.
    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
        // Width of the transition-count header at the front of a state.
        const NTRANS_WIDTH: usize = size_of::<u16>();
        // Width of the pattern-count prefix and of each encoded pattern ID;
        // both are 32-bit integers.
        const NPATS_WIDTH: usize = size_of::<u32>();
        const PATTERN_ID_WIDTH: usize = size_of::<u32>();
        // The high bit of the transition count marks a match state.
        const MATCH_BIT: usize = 1 << 15;

        let encoded = &mut self.sparse_mut()[id.as_usize()..];
        let header = wire::read_u16(&encoded).as_usize();
        let is_match = (header & MATCH_BIT) != 0;
        let ntrans = header & !MATCH_BIT;
        let body = &mut encoded[NTRANS_WIDTH..];

        // Each transition has a two byte inclusive range and one state ID.
        let (input_ranges, body) = body.split_at_mut(ntrans * 2);
        let (next, body) = body.split_at_mut(ntrans * StateID::SIZE);
        // Only match states carry a pattern ID section.
        let (pattern_ids, tail) = if is_match {
            let npats = wire::read_u32(&body).as_usize();
            body[NPATS_WIDTH..].split_at_mut(npats * PATTERN_ID_WIDTH)
        } else {
            (&mut [][..], body)
        };

        // One length byte, followed by that many accelerator bytes.
        let accel_len = usize::from(tail[0]);
        let accel = &mut tail[1..accel_len + 1];
        StateMut {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        }
    }

    /// Borrows the encoded sparse transitions as a mutable byte slice.
    fn sparse_mut(&mut self) -> &mut [u8] {
        let bytes: &mut [u8] = self.sparse.as_mut();
        bytes
    }
}
1757
/// The set of all possible starting states in a DFA.
///
/// See the eponymous type in the `dense` module for more details. This type
/// is very similar to `dense::StartTable`, except that its underlying
/// representation is `&[u8]` instead of `&[S]`. (The latter would require
/// sparse DFAs to be aligned, which is explicitly something we do not require
/// because we don't really need it.)
#[derive(Clone)]
struct StartTable<T> {
    /// The initial start state IDs as a contiguous table of native endian
    /// encoded integers.
    ///
    /// In practice, T is either Vec<u8> or &[u8] and has no alignment
    /// requirements.
    ///
    /// The first `2 * stride` (currently always 8) entries always correspond
    /// to the start states for the entire DFA, with the first 4 entries being
    /// for unanchored searches and the second 4 entries being for anchored
    /// searches. To keep things simple, we always use 8 entries even if the
    /// `StartKind` is not both.
    ///
    /// After that, there are `stride * patterns` state IDs, where `patterns`
    /// may be zero in the case of a DFA with no patterns or in the case where
    /// the DFA was built without enabling starting states for each pattern.
    table: T,
    /// The starting state configuration supported. When 'both', both
    /// unanchored and anchored searches work. When 'unanchored', anchored
    /// searches are unsupported. When 'anchored', unanchored searches are
    /// unsupported.
    kind: StartKind,
    /// The start state configuration for every possible byte.
    start_map: StartByteMap,
    /// The number of starting state IDs per pattern.
    stride: usize,
    /// The total number of patterns for which starting states are encoded.
    /// This is `None` for DFAs that were built without start states for each
    /// pattern. Thus, one cannot use this field to say how many patterns
    /// are in the DFA in all cases. It is specific to how many patterns are
    /// represented in this start table.
    pattern_len: Option<usize>,
    /// The universal starting state for unanchored searches. This is only
    /// present when the DFA supports unanchored searches and when all starting
    /// state IDs for an unanchored search are equivalent.
    universal_start_unanchored: Option<StateID>,
    /// The universal starting state for anchored searches. This is only
    /// present when the DFA supports anchored searches and when all starting
    /// state IDs for an anchored search are equivalent.
    universal_start_anchored: Option<StateID>,
}
1806
#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u8>> {
    /// Allocates a zero-filled start table shaped after the given dense DFA.
    ///
    /// The table has room for two strides of start states for the whole DFA
    /// plus one stride for each of the `pattern_len` patterns (none when
    /// `pattern_len` is `None`). The start kind, start byte map and universal
    /// start states are copied from `dfa`.
    fn new<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        pattern_len: Option<usize>,
    ) -> StartTable<Vec<u8>> {
        let stride = Start::len();
        // The unwraps below are OK since the only way we're here is if a
        // dense DFA could be constructed successfully, which uses the same
        // space.
        let per_pattern_ids =
            stride.checked_mul(pattern_len.unwrap_or(0)).unwrap();
        let whole_dfa_ids = stride.checked_mul(2).unwrap();
        let id_count = per_pattern_ids.checked_add(whole_dfa_ids).unwrap();
        let byte_len = id_count.checked_mul(StateID::SIZE).unwrap();
        StartTable {
            table: vec![0; byte_len],
            kind: dfa.start_kind(),
            start_map: dfa.start_map().clone(),
            stride,
            pattern_len,
            universal_start_unanchored: dfa
                .universal_start_state(Anchored::No),
            universal_start_anchored: dfa.universal_start_state(Anchored::Yes),
        }
    }

    /// Builds a sparse start table from the start states of a dense DFA.
    ///
    /// `remap` translates dense state indices into sparse state IDs: every
    /// start state of `dfa` is looked up through it before being stored.
    fn from_dense_dfa<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        remap: &[StateID],
    ) -> Result<StartTable<Vec<u8>>, BuildError> {
        // Per-pattern start states are only recorded when the dense DFA was
        // compiled with them. Otherwise the table holds just the start states
        // for the DFA as a whole, i.e., zero patterns as far as this table is
        // concerned.
        let start_pattern_len =
            dfa.starts_for_each_pattern().then(|| dfa.pattern_len());
        let mut table = StartTable::new(dfa, start_pattern_len);
        for (dense_id, anchored, start_type) in dfa.starts() {
            let sparse_id = remap[dfa.to_index(dense_id)];
            table.set_start(anchored, start_type, sparse_id);
        }
        Ok(table)
    }
}
1856
1857	impl<'a> StartTable<&'a [u8]> {
1858	unsafe fn from_bytes_unchecked(
1859	mut slice: &'a [u8],
1860	) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
1861	let slice_start = slice.as_ptr().as_usize();
1862
1863	let (kind, nr) = StartKind::from_bytes(slice)?;
1864	slice = &slice[nr..];
1865
1866	let (start_map, nr) = StartByteMap::from_bytes(slice)?;
1867	slice = &slice[nr..];
1868
1869	let (stride, nr) =
1870	wire::try_read_u32_as_usize(slice, "sparse start table stride")?;
1871	slice = &slice[nr..];
1872	if stride != Start::len() {
1873	return Err(DeserializeError::generic(
1874	"invalid sparse starting table stride",
1875	));
1876	}
1877
1878	let (maybe_pattern_len, nr) =
1879	wire::try_read_u32_as_usize(slice, "sparse start table patterns")?;
1880	slice = &slice[nr..];
1881	let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
1882	None
1883	} else {
1884	Some(maybe_pattern_len)
1885	};
1886	if pattern_len.map_or(`false`, \|len\| len > PatternID::LIMIT) {
1887	return Err(DeserializeError::generic(
1888	"sparse invalid number of patterns",
1889	));
1890	}
1891
1892	let (universal_unanchored, nr) =
1893	wire::try_read_u32(slice, "universal unanchored start")?;
1894	slice = &slice[nr..];
1895	let universal_start_unanchored = if universal_unanchored == u32::MAX {
1896	None
1897	} else {
1898	Some(StateID::try_from(universal_unanchored).map_err(\|e\| {
1899	DeserializeError::state_id_error(
1900	e,
1901	"universal unanchored start",
1902	)
1903	})?)
1904	};
1905
1906	let (universal_anchored, nr) =
1907	wire::try_read_u32(slice, "universal anchored start")?;
1908	slice = &slice[nr..];
1909	let universal_start_anchored = if universal_anchored == u32::MAX {
1910	None
1911	} else {
1912	Some(StateID::try_from(universal_anchored).map_err(\|e\| {
1913	DeserializeError::state_id_error(e, "universal anchored start")
1914	})?)
1915	};
1916
1917	let pattern_table_size = wire::mul(
1918	stride,
1919	pattern_len.unwrap_or(`0`),
1920	"sparse invalid pattern length",
1921	)?;
1922	// Our start states always start with a single stride of start states
1923	// for the entire automaton which permit it to match any pattern. What
1924	// follows it are an optional set of start states for each pattern.
1925	let start_state_len = wire::add(
1926	wire::mul(`2`, stride, "start state stride too big")?,
1927	pattern_table_size,
1928	"sparse invalid 'any' pattern starts size",
1929	)?;
1930	let table_bytes_len = wire::mul(
1931	start_state_len,
1932	StateID::SIZE,
1933	"sparse pattern table bytes length",
1934	)?;
1935	wire::check_slice_len(
1936	slice,
1937	table_bytes_len,
1938	"sparse start ID table",
1939	)?;
1940	let table = &slice[..table_bytes_len];
1941	slice = &slice[table_bytes_len..];
1942
1943	let sl = StartTable {
1944	table,
1945	kind,
1946	start_map,
1947	stride,
1948	pattern_len,
1949	universal_start_unanchored,
1950	universal_start_anchored,
1951	};
1952	Ok((sl, slice.as_ptr().as_usize() - slice_start))
1953	}
1954	}
1955
1956	impl<T: AsRef<[u8]>> StartTable<T> {
1957	fn write_to<E: Endian>(
1958	&self,
1959	mut dst: &mut [u8],
1960	) -> Result<usize, SerializeError> {
1961	let nwrite = self.write_to_len();
1962	if dst.len() < nwrite {
1963	return Err(SerializeError::buffer_too_small(
1964	"sparse starting table ids",
1965	));
1966	}
1967	dst = &mut dst[..nwrite];
1968
1969	// write start kind
1970	let nw = self.kind.write_to::<E>(dst)?;
1971	dst = &mut dst[nw..];
1972	// write start byte map
1973	let nw = self.start_map.write_to(dst)?;
1974	dst = &mut dst[nw..];
1975	// write stride
1976	E::write_u32(u32::try_from(self.stride).unwrap(), dst);
1977	dst = &mut dst[size_of::<u32>()..];
1978	// write pattern length
1979	E::write_u32(
1980	u32::try_from(self.pattern_len.unwrap_or(`0xFFFF_FFFF`)).unwrap(),
1981	dst,
1982	);
1983	dst = &mut dst[size_of::<u32>()..];
1984	// write universal start unanchored state id, u32::MAX if absent
1985	E::write_u32(
1986	self.universal_start_unanchored
1987	.map_or(u32::MAX, \|sid\| sid.as_u32()),
1988	dst,
1989	);
1990	dst = &mut dst[size_of::<u32>()..];
1991	// write universal start anchored state id, u32::MAX if absent
1992	E::write_u32(
1993	self.universal_start_anchored.map_or(u32::MAX, \|sid\| sid.as_u32()),
1994	dst,
1995	);
1996	dst = &mut dst[size_of::<u32>()..];
1997	// write start IDs
1998	for (sid, _, _) in self.iter() {
1999	E::write_u32(sid.as_u32(), dst);
2000	dst = &mut dst[StateID::SIZE..];
2001	}
2002	Ok(nwrite)
2003	}
2004
2005	/// Returns the number of bytes the serialized form of this transition
2006	/// table will use.
2007	fn write_to_len(&self) -> usize {
2008	self.kind.write_to_len()
2009	+ self.start_map.write_to_len()
2010	+ size_of::<u32>() // stride
2011	+ size_of::<u32>() // # patterns
2012	+ size_of::<u32>() // universal unanchored start
2013	+ size_of::<u32>() // universal anchored start
2014	+ self.table().len()
2015	}
2016
2017	/// Validates that every starting state ID in this table is valid.
2018	///
2019	/// That is, every starting state ID can be used to correctly decode a
2020	/// state in the DFA's sparse transitions.
2021	fn validate(
2022	&self,
2023	sp: &Special,
2024	seen: &Seen,
2025	) -> Result<(), DeserializeError> {
2026	for (id, _, _) in self.iter() {
2027	if !seen.contains(&id) {
2028	return Err(DeserializeError::generic(
2029	"found invalid start state ID",
2030	));
2031	}
2032	if sp.is_match_state(id) {
2033	return Err(DeserializeError::generic(
2034	"start states cannot be match states",
2035	));
2036	}
2037	}
2038	Ok(())
2039	}
2040
2041	/// Converts this start list to a borrowed value.
2042	fn as_ref(&self) -> StartTable<&'_ [u8]> {
2043	StartTable {
2044	table: self.table(),
2045	kind: self.kind,
2046	start_map: self.start_map.clone(),
2047	stride: self.stride,
2048	pattern_len: self.pattern_len,
2049	universal_start_unanchored: self.universal_start_unanchored,
2050	universal_start_anchored: self.universal_start_anchored,
2051	}
2052	}
2053
2054	/// Converts this start list to an owned value.
2055	#[cfg(feature = "alloc")]
2056	fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> {
2057	StartTable {
2058	table: self.table().to_vec(),
2059	kind: self.kind,
2060	start_map: self.start_map.clone(),
2061	stride: self.stride,
2062	pattern_len: self.pattern_len,
2063	universal_start_unanchored: self.universal_start_unanchored,
2064	universal_start_anchored: self.universal_start_anchored,
2065	}
2066	}
2067
2068	/// Return the start state for the given index and pattern ID. If the
2069	/// pattern ID is None, then the corresponding start state for the entire
2070	/// DFA is returned. If the pattern ID is not None, then the corresponding
2071	/// starting state for the given pattern is returned. If this start table
2072	/// does not have individual starting states for each pattern, then this
2073	/// panics.
2074	fn start(
2075	&self,
2076	anchored: Anchored,
2077	start: Start,
2078	) -> Result<StateID, StartError> {
2079	let start_index = start.as_usize();
2080	let index = match anchored {
2081	Anchored::No => {
2082	if !self.kind.has_unanchored() {
2083	return Err(StartError::unsupported_anchored(anchored));
2084	}
2085	start_index
2086	}
2087	Anchored::Yes => {
2088	if !self.kind.has_anchored() {
2089	return Err(StartError::unsupported_anchored(anchored));
2090	}
2091	self.stride + start_index
2092	}
2093	Anchored::Pattern(pid) => {
2094	let len = match self.pattern_len {
2095	None => {
2096	return Err(StartError::unsupported_anchored(anchored))
2097	}
2098	Some(len) => len,
2099	};
2100	if pid.as_usize() >= len {
2101	return Ok(DEAD);
2102	}
2103	(`2` * self.stride)
2104	+ (self.stride * pid.as_usize())
2105	+ start_index
2106	}
2107	};
2108	let start = index * StateID::SIZE;
2109	// This OK since we're allowed to assume that the start table contains
2110	// valid StateIDs.
2111	Ok(wire::read_state_id_unchecked(&self.table()[start..]).0)
2112	}
2113
2114	/// Return an iterator over all start IDs in this table.
2115	fn iter(&self) -> StartStateIter<'_, T> {
2116	StartStateIter { st: self, i: `0` }
2117	}
2118
2119	/// Returns the total number of start state IDs in this table.
2120	fn len(&self) -> usize {
2121	self.table().len() / StateID::SIZE
2122	}
2123
2124	/// Returns the table as a raw slice of bytes.
2125	fn table(&self) -> &[u8] {
2126	self.table.as_ref()
2127	}
2128
2129	/// Return the memory usage, in bytes, of this start list.
2130	///
2131	/// This does not include the size of a `StartTable` value itself.
2132	fn memory_usage(&self) -> usize {
2133	self.table().len()
2134	}
2135	}
2136
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> StartTable<T> {
    /// Stores `id` as the start state for the given anchored mode and start
    /// configuration.
    ///
    /// This panics when a pattern's start state is set but this table has no
    /// per-pattern start states, when the pattern ID is out of range, or
    /// when the computed slot lies outside of the table.
    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
        let offset = start.as_usize();
        let slot = match anchored {
            Anchored::No => offset,
            Anchored::Yes => self.stride + offset,
            Anchored::Pattern(pid) => {
                let pid = pid.as_usize();
                let len = self
                    .pattern_len
                    .expect("start states for each pattern enabled");
                assert!(pid < len, "invalid pattern ID {:?}", pid);
                // Per-pattern start states come after the two strides that
                // are reserved for the whole DFA.
                let pattern_base = self.stride.checked_mul(pid).unwrap();
                let whole_dfa = self.stride.checked_mul(2).unwrap();
                let stride_base = pattern_base.checked_add(whole_dfa).unwrap();
                stride_base.checked_add(offset).unwrap()
            }
        };
        let lo = slot * StateID::SIZE;
        let hi = lo + StateID::SIZE;
        let dst = &mut self.table.as_mut()[lo..hi];
        wire::write_state_id::<wire::NE>(id, dst);
    }
}
2170
/// An iterator over all start state IDs in a sparse DFA.
///
/// Each item is a start state ID together with the anchored mode and start
/// configuration of the table slot it was read from.
struct StartStateIter<'a, T> {
    /// The start table being iterated over.
    st: &'a StartTable<T>,
    /// The index of the next table slot to yield.
    i: usize,
}
2176
2177	impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
2178	type Item = (StateID, Anchored, Start);
2179
2180	fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
2181	let i = self.i;
2182	if i >= self.st.len() {
2183	return None;
2184	}
2185	self.i += `1`;
2186
2187	// This unwrap is okay since the stride of any DFA must always match
2188	// the number of start state types.
2189	let start_type = Start::from_usize(i % self.st.stride).unwrap();
2190	let anchored = if i < self.st.stride {
2191	Anchored::No
2192	} else if i < (`2` * self.st.stride) {
2193	Anchored::Yes
2194	} else {
2195	let pid = (i - (`2` * self.st.stride)) / self.st.stride;
2196	Anchored::Pattern(PatternID::new(pid).unwrap())
2197	};
2198	let start = i * StateID::SIZE;
2199	let end = start + StateID::SIZE;
2200	let bytes = self.st.table()[start..end].try_into().unwrap();
2201	// This is OK since we're allowed to assume that any IDs in this start
2202	// table are correct and valid for this DFA.
2203	let id = StateID::from_ne_bytes_unchecked(bytes);
2204	Some((id, anchored, start_type))
2205	}
2206	}
2207
2208	impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
2209	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2210	f.debug_struct("StartStateIter").field("i", &self.i).finish()
2211	}
2212	}
2213
/// An iterator over all states in a sparse DFA.
///
/// This iterator yields decoded states. The identifier of each state is
/// available from the state itself.
struct StateIter<'a, T> {
    /// The transitions whose states are being iterated over.
    trans: &'a Transitions<T>,
    /// The byte offset, which doubles as the state ID, of the next state.
    id: usize,
}
2222
2223	impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
2224	type Item = State<'a>;
2225
2226	fn next(&mut self) -> Option<State<'a>> {
2227	if self.id >= self.trans.sparse().len() {
2228	return None;
2229	}
2230	let state = self.trans.state(StateID::new_unchecked(self.id));
2231	self.id = self.id + state.write_to_len();
2232	Some(state)
2233	}
2234	}
2235
2236	impl<'a, T> fmt::Debug for StateIter<'a, T> {
2237	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2238	f.debug_struct("StateIter").field("id", &self.id).finish()
2239	}
2240	}
2241
/// A representation of a sparse DFA state that can be cheaply materialized
/// from a state identifier.
#[derive(Clone)]
struct State<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state. The last transition is the
    /// special EOI transition.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
    /// bytes in this slice.
    next: &'a [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a [u8],
}
2271
2272	impl<'a> State<'a> {
2273	/// Searches for the next transition given an input byte. If no such
2274	/// transition could be found, then a dead state is returned.
2275	///
2276	/// This is marked as inline to help dramatically boost sparse searching,
2277	/// which decodes each state it enters to follow the next transition.
2278	#[cfg_attr(feature = "perf-inline", inline(always))]
2279	fn next(&self, input: u8) -> StateID {
2280	// This straight linear search was observed to be much better than
2281	// binary search on ASCII haystacks, likely because a binary search
2282	// visits the ASCII case last but a linear search sees it first. A
2283	// binary search does do a little better on non-ASCII haystacks, but
2284	// not by much. There might be a better trade off lurking here.
2285	for i in `0`..(self.ntrans - `1`) {
2286	let (start, end) = self.range(i);
2287	if start <= input && input <= end {
2288	return self.next_at(i);
2289	}
2290	// We could bail early with an extra branch: if input < b1, then
2291	// we know we'll never find a matching transition. Interestingly,
2292	// this extra branch seems to not help performance, or will even
2293	// hurt it. It's likely very dependent on the DFA itself and what
2294	// is being searched.
2295	}
2296	DEAD
2297	}
2298
2299	/// Returns the next state ID for the special EOI transition.
2300	fn next_eoi(&self) -> StateID {
2301	self.next_at(self.ntrans - `1`)
2302	}
2303
2304	/// Returns the identifier for this state.
2305	fn id(&self) -> StateID {
2306	self.id
2307	}
2308
2309	/// Returns the inclusive input byte range for the ith transition in this
2310	/// state.
2311	fn range(&self, i: usize) -> (u8, u8) {
2312	(self.input_ranges[i * `2`], self.input_ranges[i * `2` + `1`])
2313	}
2314
2315	/// Returns the next state for the ith transition in this state.
2316	fn next_at(&self, i: usize) -> StateID {
2317	let start = i * StateID::SIZE;
2318	let end = start + StateID::SIZE;
2319	let bytes = self.next[start..end].try_into().unwrap();
2320	StateID::from_ne_bytes_unchecked(bytes)
2321	}
2322
2323	/// Returns the pattern ID for the given match index. If the match index
2324	/// is invalid, then this panics.
2325	fn pattern_id(&self, match_index: usize) -> PatternID {
2326	let start = match_index * PatternID::SIZE;
2327	wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
2328	}
2329
2330	/// Returns the total number of pattern IDs for this state. This is always
2331	/// zero when `is_match` is false.
2332	fn pattern_len(&self) -> usize {
2333	assert_eq!(`0`, self.pattern_ids.len() % `4`);
2334	self.pattern_ids.len() / `4`
2335	}
2336
2337	/// Return an accelerator for this state.
2338	fn accelerator(&self) -> &'a [u8] {
2339	self.accel
2340	}
2341
2342	/// Write the raw representation of this state to the given buffer using
2343	/// the given endianness.
2344	fn write_to<E: Endian>(
2345	&self,
2346	mut dst: &mut [u8],
2347	) -> Result<usize, SerializeError> {
2348	let nwrite = self.write_to_len();
2349	if dst.len() < nwrite {
2350	return Err(SerializeError::buffer_too_small(
2351	"sparse state transitions",
2352	));
2353	}
2354
2355	let ntrans =
2356	if self.is_match { self.ntrans \| (`1` << `15`) } else { self.ntrans };
2357	E::write_u16(u16::try_from(ntrans).unwrap(), dst);
2358	dst = &mut dst[size_of::<u16>()..];
2359
2360	dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges);
2361	dst = &mut dst[self.input_ranges.len()..];
2362
2363	for i in `0`..self.ntrans {
2364	E::write_u32(self.next_at(i).as_u32(), dst);
2365	dst = &mut dst[StateID::SIZE..];
2366	}
2367
2368	if self.is_match {
2369	E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst);
2370	dst = &mut dst[size_of::<u32>()..];
2371	for i in `0`..self.pattern_len() {
2372	let pid = self.pattern_id(i);
2373	E::write_u32(pid.as_u32(), dst);
2374	dst = &mut dst[PatternID::SIZE..];
2375	}
2376	}
2377
2378	dst[`0`] = u8::try_from(self.accel.len()).unwrap();
2379	dst[`1`..][..self.accel.len()].copy_from_slice(self.accel);
2380
2381	Ok(nwrite)
2382	}
2383
2384	/// Return the total number of bytes that this state consumes in its
2385	/// encoded form.
2386	fn write_to_len(&self) -> usize {
2387	let mut len = `2`
2388	+ (self.ntrans * `2`)
2389	+ (self.ntrans * StateID::SIZE)
2390	+ (`1` + self.accel.len());
2391	if self.is_match {
2392	len += size_of::<u32>() + self.pattern_ids.len();
2393	}
2394	len
2395	}
2396	}
2397
2398	impl<'a> fmt::Debug for State<'a> {
2399	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2400	let mut printed = `false`;
2401	for i in `0`..(self.ntrans - `1`) {
2402	let next = self.next_at(i);
2403	if next == DEAD {
2404	continue;
2405	}
2406
2407	if printed {
2408	write!(f, ", ")?;
2409	}
2410	let (start, end) = self.range(i);
2411	if start == end {
2412	write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?;
2413	} else {
2414	write!(
2415	f,
2416	"{:?}-{:?} => {:?}",
2417	DebugByte(start),
2418	DebugByte(end),
2419	next.as_usize(),
2420	)?;
2421	}
2422	printed = `true`;
2423	}
2424	let eoi = self.next_at(self.ntrans - `1`);
2425	if eoi != DEAD {
2426	if printed {
2427	write!(f, ", ")?;
2428	}
2429	write!(f, "EOI => {:?}", eoi.as_usize())?;
2430	}
2431	Ok(())
2432	}
2433	}
2434
/// A representation of a mutable sparse DFA state that can be cheaply
/// materialized from a state identifier.
#[cfg(feature = "dfa-build")]
struct StateMut<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a mut [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
    /// bytes in this slice.
    next: &'a mut [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a mut [u8],
}
2464
#[cfg(feature = "dfa-build")]
impl<'a> StateMut<'a> {
    /// Overwrites the target of the ith transition with `next`, encoded in
    /// native endian.
    fn set_next_at(&mut self, i: usize, next: StateID) {
        let width = StateID::SIZE;
        let lo = i * width;
        let hi = lo + width;
        let slot = &mut self.next[lo..hi];
        wire::write_state_id::<wire::NE>(next, slot);
    }
}
2474
#[cfg(feature = "dfa-build")]
impl<'a> fmt::Debug for StateMut<'a> {
    /// Formats this state exactly like an immutable `State` with the same
    /// contents.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Borrow every field immutably and defer to the `State` impl.
        let view = State {
            accel: self.accel,
            pattern_ids: self.pattern_ids,
            next: self.next,
            input_ranges: self.input_ranges,
            ntrans: self.ntrans,
            is_match: self.is_match,
            id: self.id,
        };
        <State<'_> as fmt::Debug>::fmt(&view, f)
    }
}
2490
// In order to validate everything, we not only need to make sure we
// can decode every state, but that every transition in every state
// points to a valid state. There are many duplicative transitions, so
// we record state IDs that we've verified so that we don't redo the
// decoding work.
//
// Except, when in no_std mode, we don't have dynamic memory allocation
// available to us, so we skip this optimization. It's not clear
// whether doing something more clever is worth it just yet. If you're
// profiling this code and need it to run faster, please file an issue.
//
// OK, so we also use this to record the set of valid state IDs. Since
// it is possible for a transition to point to an invalid state ID that
// still (somehow) deserializes to a valid state. So we need to make
// sure our transitions are limited to actually correct state IDs.
// The problem is, I'm not sure how to do this verification step in
// no-std no-alloc mode. I think we'd *have* to store the set of *valid*
// state IDs in the DFA itself. For now, we don't do this verification
// in no-std no-alloc mode. The worst thing that can happen is an
// incorrect result. But no panics or memory safety problems should
// result. Because we still do validate that the state itself is
// "valid" in the sense that everything it points to actually exists.
//
// ---AG
#[derive(Debug)]
struct Seen {
    // With `alloc`, the set of state IDs recorded so far.
    #[cfg(feature = "alloc")]
    set: alloc::collections::BTreeSet<StateID>,
    // Without `alloc`, a zero-sized placeholder; nothing is recorded.
    #[cfg(not(feature = "alloc"))]
    set: core::marker::PhantomData<StateID>,
}
2522
#[cfg(feature = "alloc")]
impl Seen {
    /// Creates an empty set of seen state IDs.
    fn new() -> Seen {
        let set = alloc::collections::BTreeSet::new();
        Seen { set }
    }

    /// Records `id` as seen.
    fn insert(&mut self, id: StateID) {
        // Whether the ID was already present is of no interest here.
        let _ = self.set.insert(id);
    }

    /// Returns true if and only if `id` was previously inserted.
    fn contains(&self, id: &StateID) -> bool {
        let Seen { set } = self;
        set.contains(id)
    }
}
2535
2536	#[cfg(not(feature = "alloc"))]
2537	impl Seen {
2538	fn new() -> Seen {
2539	Seen { set: core::marker::PhantomData }
2540	}
2541	fn insert(&mut self, _id: StateID) {}
2542	fn contains(&self, _id: &StateID) -> bool {
2543	`true`
2544	}
2545	}
2546
2547	/*
2548	/// A binary search routine specialized specifically to a sparse DFA state's
2549	/// transitions. Specifically, the transitions are defined as a set of pairs
2550	/// of input bytes that delineate an inclusive range of bytes. If the input
2551	/// byte is in the range, then the corresponding transition is a match.
2552	///
2553	/// This binary search accepts a slice of these pairs and returns the position
2554	/// of the matching pair (the ith transition), or None if no matching pair
2555	/// could be found.
2556	///
2557	/// Note that this routine is not currently used since it was observed to
2558	/// either decrease performance when searching ASCII, or did not provide enough
2559	/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
2560	/// for posterity in case we can find a way to use it.
2561	///
2562	/// In theory, we could use the standard library's search routine if we could
2563	/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
2564	/// guaranteed to be safe and is thus UB (since I don't think the in-memory
2565	/// representation of `(u8, u8)` has been nailed down). One could define a
2566	/// repr(C) type, but the casting doesn't seem justified.
2567	#[cfg_attr(feature = "perf-inline", inline(always))]
2568	fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
2569	debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
2570	debug_assert!(ranges.len() <= 512, "ranges should be short");
2571
2572	let (mut left, mut right) = (0, ranges.len() / 2);
2573	while left < right {
2574	let mid = (left + right) / 2;
2575	let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
2576	if needle < b1 {
2577	right = mid;
2578	} else if needle > b2 {
2579	left = mid + 1;
2580	} else {
2581	return Some(mid);
2582	}
2583	}
2584	None
2585	}
2586	*/
2587
#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    use crate::{
        dfa::{dense::DFA, Automaton},
        nfa::thompson,
        Input, MatchError,
    };

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    //
    // NOTE(review): this forward search test builds its NFA with
    // `reverse(true)`, just like the reverse test below. Confirm that this
    // is intentional.
    #[test]
    fn heuristic_unicode_forward() {
        let sparse = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // 'β' is encoded as the two bytes 0xCE 0xB2. The search starts at
        // offset 2, and the expected quit byte is the non-ASCII byte at
        // offset 1, just before the search range.
        let haystack = Input::new("β123").range(2..);
        let want = MatchError::quit(0xB2, 1);
        assert_eq!(Err(want), sparse.try_search_fwd(&haystack));

        // The search ends at offset 3, and the expected quit byte is the
        // non-ASCII byte at offset 3, just after the search range.
        let haystack = Input::new("123β").range(..3);
        let want = MatchError::quit(0xCE, 3);
        assert_eq!(Err(want), sparse.try_search_fwd(&haystack));
    }

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    #[test]
    fn heuristic_unicode_reverse() {
        let sparse = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // 'β' is encoded as the two bytes 0xCE 0xB2. The search starts at
        // offset 2, and the expected quit byte is the non-ASCII byte at
        // offset 1, just before the search range.
        let haystack = Input::new("β123").range(2..);
        let want = MatchError::quit(0xB2, 1);
        assert_eq!(Err(want), sparse.try_search_rev(&haystack));

        // The search ends at offset 3, and the expected quit byte is the
        // non-ASCII byte at offset 3, just after the search range.
        let haystack = Input::new("123β").range(..3);
        let want = MatchError::quit(0xCE, 3);
        assert_eq!(Err(want), sparse.try_search_rev(&haystack));
    }
}
2640