sparse.rs source code [crates/regex-automata-0.2.0/src/dfa/sparse.rs]

1	/!*
2	Types and routines specific to sparse DFAs.
3
4	This module is the home of [`sparse::DFA`](DFA).
5
6	Unlike the [`dense`](super::dense) module, this module does not contain a
7	builder or configuration specific for sparse DFAs. Instead, the intended
8	way to build a sparse DFA is either by using a default configuration with
9	its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the
10	construction of a dense DFA with [`dense::Builder`](super::dense::Builder)
11	and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For
12	example, this configures a sparse DFA to do an overlapping search:
13
14	```
15	use regex_automata::{
16	dfa::{Automaton, OverlappingState, dense},
17	HalfMatch, MatchKind,
18	};
19
20	let dense_re = dense::Builder::new()
21	.configure(dense::Config::new().match_kind(MatchKind::All))
22	.build(r"Samwise\|Sam")?;
23	let sparse_re = dense_re.to_sparse()?;
24
25	// Setup our haystack and initial start state.
26	let haystack = b"Samwise";
27	let mut state = OverlappingState::start();
28
29	// First, 'Sam' will match.
30	let end1 = sparse_re.find_overlapping_fwd_at(
31	None, None, haystack, `0`, haystack.len(), &mut state,
32	)?;
33	assert_eq!(end1, Some(HalfMatch::must(`0`, `3`)));
34
35	// And now 'Samwise' will match.
36	let end2 = sparse_re.find_overlapping_fwd_at(
37	None, None, haystack, `3`, haystack.len(), &mut state,
38	)?;
39	assert_eq!(end2, Some(HalfMatch::must(`0`, `7`)));
40	# Ok::<(), Box<dyn std::error::Error>>(())
41	```
42	*/
43
44	#[cfg(feature = "alloc")]
45	use core::iter;
46	use core::{
47	convert::{TryFrom, TryInto},
48	fmt,
49	mem::size_of,
50	};
51
52	#[cfg(feature = "alloc")]
53	use alloc::{collections::BTreeSet, vec, vec::Vec};
54
55	#[cfg(feature = "alloc")]
56	use crate::dfa::{dense, error::Error};
57	use crate::{
58	dfa::{
59	automaton::{fmt_state_indicator, Automaton},
60	special::Special,
61	DEAD,
62	},
63	util::{
64	alphabet::ByteClasses,
65	bytes::{self, DeserializeError, Endian, SerializeError},
66	id::{PatternID, StateID},
67	start::Start,
68	DebugByte,
69	},
70	};
71
72	const LABEL: &str = "rust-regex-automata-dfa-sparse";
73	const VERSION: u32 = `2`;
74
75	/// A sparse deterministic finite automaton (DFA) with variable sized states.
76	///
77	/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses
78	/// a more space efficient representation for its transitions. Consequently,
79	/// sparse DFAs may use much less memory than dense DFAs, but this comes at a
80	/// price. In particular, reading the more space efficient transitions takes
81	/// more work, and consequently, searching using a sparse DFA is typically
82	/// slower than a dense DFA.
83	///
84	/// A sparse DFA can be built using the default configuration via the
85	/// [`DFA::new`] constructor. Otherwise, one can configure various aspects
86	/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
87	/// and then convert a dense DFA to a sparse DFA using
88	/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
89	///
90	/// In general, a sparse DFA supports all the same search operations as a dense
91	/// DFA.
92	///
93	/// Making the choice between a dense and sparse DFA depends on your specific
94	/// work load. If you can sacrifice a bit of search time performance, then a
95	/// sparse DFA might be the best choice. In particular, while sparse DFAs are
96	/// probably always slower than dense DFAs, you may find that they are easily
97	/// fast enough for your purposes!
98	///
99	/// # Type parameters
100	///
101	/// A `DFA` has one type parameter, `T`, which is used to represent the parts
102	/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
103	///
104	/// # The `Automaton` trait
105	///
106	/// This type implements the [`Automaton`] trait, which means it can be used
107	/// for searching. For example:
108	///
109	/// ```
110	/// use regex_automata::{
111	/// dfa::{Automaton, sparse::DFA},
112	/// HalfMatch,
113	/// };
114	///
115	/// let dfa = DFA::new("foo[0-9]+")?;
116	/// let expected = HalfMatch::must(`0`, `8`);
117	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
118	/// # Ok::<(), Box<dyn std::error::Error>>(())
119	/// ```
120	#[derive(Clone)]
121	pub struct DFA<T> {
122	// When compared to a dense DFA, a sparse DFA looks* a lot simpler*
123	// representation-wise. In reality, it is perhaps more complicated. Namely,
124	// in a dense DFA, all information needs to be very cheaply accessible
125	// using only state IDs. In a sparse DFA however, each state uses a
126	// variable amount of space because each state encodes more information
127	// than just its transitions. Each state also includes an accelerator if
128	// one exists, along with the matching pattern IDs if the state is a match
129	// state.
130	//
131	// That is, a lot of the complexity is pushed down into how each state
132	// itself is represented.
133	trans: Transitions<T>,
134	starts: StartTable<T>,
135	special: Special,
136	}
137
138	#[cfg(feature = "alloc")]
139	impl DFA<Vec<u8>> {
140	/// Parse the given regular expression using a default configuration and
141	/// return the corresponding sparse DFA.
142	///
143	/// If you want a non-default configuration, then use
144	/// the [`dense::Builder`](crate::dfa::dense::Builder)
145	/// to set your own configuration, and then call
146	/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
147	/// a sparse DFA.
148	///
149	/// # Example
150	///
151	/// ```
152	/// use regex_automata::{
153	/// dfa::{Automaton, sparse},
154	/// HalfMatch,
155	/// };
156	///
157	/// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
158	///
159	/// let expected = HalfMatch::must(0, 11);
160	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
161	/// # Ok::<(), Box<dyn std::error::Error>>(())
162	/// ```
163	pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> {
164	dense::Builder::new()
165	.build(pattern)
166	.and_then(\|dense\| dense.to_sparse())
167	}
168
169	/// Parse the given regular expressions using a default configuration and
170	/// return the corresponding multi-DFA.
171	///
172	/// If you want a non-default configuration, then use
173	/// the [`dense::Builder`](crate::dfa::dense::Builder)
174	/// to set your own configuration, and then call
175	/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
176	/// a sparse DFA.
177	///
178	/// # Example
179	///
180	/// ```
181	/// use regex_automata::{
182	/// dfa::{Automaton, sparse},
183	/// HalfMatch,
184	/// };
185	///
186	/// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
187	/// let expected = HalfMatch::must(1, 3);
188	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
189	/// # Ok::<(), Box<dyn std::error::Error>>(())
190	/// ```
191	pub fn new_many<P: AsRef<str>>(
192	patterns: &[P],
193	) -> Result<DFA<Vec<u8>>, Error> {
194	dense::Builder::new()
195	.build_many(patterns)
196	.and_then(\|dense\| dense.to_sparse())
197	}
198	}
199
200	#[cfg(feature = "alloc")]
201	impl DFA<Vec<u8>> {
202	/// Create a new DFA that matches every input.
203	///
204	/// # Example
205	///
206	/// ```
207	/// use regex_automata::{
208	/// dfa::{Automaton, sparse},
209	/// HalfMatch,
210	/// };
211	///
212	/// let dfa = sparse::DFA::always_match()?;
213	///
214	/// let expected = HalfMatch::must(0, 0);
215	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
216	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
217	/// # Ok::<(), Box<dyn std::error::Error>>(())
218	/// ```
219	pub fn always_match() -> Result<DFA<Vec<u8>>, Error> {
220	dense::DFA::always_match()?.to_sparse()
221	}
222
223	/// Create a new sparse DFA that never matches any input.
224	///
225	/// # Example
226	///
227	/// ```
228	/// use regex_automata::dfa::{Automaton, sparse};
229	///
230	/// let dfa = sparse::DFA::never_match()?;
231	/// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
232	/// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
233	/// # Ok::<(), Box<dyn std::error::Error>>(())
234	/// ```
235	pub fn never_match() -> Result<DFA<Vec<u8>>, Error> {
236	dense::DFA::never_match()?.to_sparse()
237	}
238
239	/// The implementation for constructing a sparse DFA from a dense DFA.
240	pub(crate) fn from_dense<T: AsRef<[u32]>>(
241	dfa: &dense::DFA<T>,
242	) -> Result<DFA<Vec<u8>>, Error> {
243	// In order to build the transition table, we need to be able to write
244	// state identifiers for each of the "next" transitions in each state.
245	// Our state identifiers correspond to the byte offset in the
246	// transition table at which the state is encoded. Therefore, we do not
247	// actually know what the state identifiers are until we've allocated
248	// exactly as much space as we need for each state. Thus, construction
249	// of the transition table happens in two passes.
250	//
251	// In the first pass, we fill out the shell of each state, which
252	// includes the transition count, the input byte ranges and zero-filled
253	// space for the transitions and accelerators, if present. In this
254	// first pass, we also build up a map from the state identifier index
255	// of the dense DFA to the state identifier in this sparse DFA.
256	//
257	// In the second pass, we fill in the transitions based on the map
258	// built in the first pass.
259
260	// The capacity given here reflects a minimum. (Well, the true minimum
261	// is likely even bigger, but hopefully this saves a few reallocs.)
262	let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count());
263	// This maps state indices from the dense DFA to StateIDs in the sparse
264	// DFA. We build out this map on the first pass, and then use it in the
265	// second pass to back-fill our transitions.
266	let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()];
267	for state in dfa.states() {
268	let pos = sparse.len();
269
270	remap[dfa.to_index(state.id())] =
271	StateID::new(pos).map_err(\|_\| Error::too_many_states())?;
272	// zero-filled space for the transition count
273	sparse.push(`0`);
274	sparse.push(`0`);
275
276	let mut transition_count = `0`;
277	for (unit1, unit2, _) in state.sparse_transitions() {
278	match (unit1.as_u8(), unit2.as_u8()) {
279	(Some(b1), Some(b2)) => {
280	transition_count += `1`;
281	sparse.push(b1);
282	sparse.push(b2);
283	}
284	(None, None) => {}
285	(Some(_), None) \| (None, Some(_)) => {
286	// can never occur because sparse_transitions never
287	// groups EOI with any other transition.
288	unreachable!()
289	}
290	}
291	}
292	// Add dummy EOI transition. This is never actually read while
293	// searching, but having space equivalent to the total number
294	// of transitions is convenient. Otherwise, we'd need to track
295	// a different number of transitions for the byte ranges as for
296	// the 'next' states.
297	//
298	// N.B. The loop above is not guaranteed to yield the EOI
299	// transition, since it may point to a DEAD state. By putting
300	// it here, we always write the EOI transition, and thus
301	// guarantee that our transition count is >0. Why do we always
302	// need the EOI transition? Because in order to implement
303	// Automaton::next_eoi_state, this lets us just ask for the last
304	// transition. There are probably other/better ways to do this.
305	transition_count += `1`;
306	sparse.push(`0`);
307	sparse.push(`0`);
308
309	// Check some assumptions about transition count.
310	assert_ne!(
311	transition_count, `0`,
312	"transition count should be non-zero",
313	);
314	assert!(
315	transition_count <= `257`,
316	"expected transition count {} to be <= 257",
317	transition_count,
318	);
319
320	// Fill in the transition count.
321	// Since transition count is always <= 257, we use the most
322	// significant bit to indicate whether this is a match state or
323	// not.
324	let ntrans = if dfa.is_match_state(state.id()) {
325	transition_count \| (`1` << `15`)
326	} else {
327	transition_count
328	};
329	bytes::NE::write_u16(ntrans, &mut sparse[pos..]);
330
331	// zero-fill the actual transitions.
332	// Unwraps are OK since transition_count <= 257 and our minimum
333	// support usize size is 16-bits.
334	let zeros = usize::try_from(transition_count)
335	.unwrap()
336	.checked_mul(StateID::SIZE)
337	.unwrap();
338	sparse.extend(iter::repeat(`0`).take(zeros));
339
340	// If this is a match state, write the pattern IDs matched by this
341	// state.
342	if dfa.is_match_state(state.id()) {
343	let plen = dfa.match_pattern_len(state.id());
344	// Write the actual pattern IDs with a u32 length prefix.
345	// First, zero-fill space.
346	let mut pos = sparse.len();
347	// Unwraps are OK since it's guaranteed that plen <=
348	// PatternID::LIMIT, which is in turn guaranteed to fit into a
349	// u32.
350	let zeros = size_of::<u32>()
351	.checked_mul(plen)
352	.unwrap()
353	.checked_add(size_of::<u32>())
354	.unwrap();
355	sparse.extend(iter::repeat(`0`).take(zeros));
356
357	// Now write the length prefix.
358	bytes::NE::write_u32(
359	// Will never fail since u32::MAX is invalid pattern ID.
360	// Thus, the number of pattern IDs is representable by a
361	// u32.
362	plen.try_into().expect("pattern ID count fits in u32"),
363	&mut sparse[pos..],
364	);
365	pos += size_of::<u32>();
366
367	// Now write the pattern IDs.
368	for &pid in dfa.pattern_id_slice(state.id()) {
369	pos += bytes::write_pattern_id::<bytes::NE>(
370	pid,
371	&mut sparse[pos..],
372	);
373	}
374	}
375
376	// And now add the accelerator, if one exists. An accelerator is
377	// at most 4 bytes and at least 1 byte. The first byte is the
378	// length, N. N bytes follow the length. The set of bytes that
379	// follow correspond (exhaustively) to the bytes that must be seen
380	// to leave this state.
381	let accel = dfa.accelerator(state.id());
382	sparse.push(accel.len().try_into().unwrap());
383	sparse.extend_from_slice(accel);
384	}
385
386	let mut new = DFA {
387	trans: Transitions {
388	sparse,
389	classes: dfa.byte_classes().clone(),
390	count: dfa.state_count(),
391	patterns: dfa.pattern_count(),
392	},
393	starts: StartTable::from_dense_dfa(dfa, &remap)?,
394	special: dfa.special().remap(\|id\| remap[dfa.to_index(id)]),
395	};
396	// And here's our second pass. Iterate over all of the dense states
397	// again, and update the transitions in each of the states in the
398	// sparse DFA.
399	for old_state in dfa.states() {
400	let new_id = remap[dfa.to_index(old_state.id())];
401	let mut new_state = new.trans.state_mut(new_id);
402	let sparse = old_state.sparse_transitions();
403	for (i, (_, _, next)) in sparse.enumerate() {
404	let next = remap[dfa.to_index(next)];
405	new_state.set_next_at(i, next);
406	}
407	}
408	trace!(
409	"created sparse DFA, memory usage: {} (dense memory usage: {})",
410	new.memory_usage(),
411	dfa.memory_usage(),
412	);
413	Ok(new)
414	}
415	}
416
417	impl<T: AsRef<[u8]>> DFA<T> {
418	/// Cheaply return a borrowed version of this sparse DFA. Specifically, the
419	/// DFA returned always uses `&[u8]` for its transitions.
420	pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
421	DFA {
422	trans: self.trans.as_ref(),
423	starts: self.starts.as_ref(),
424	special: self.special,
425	}
426	}
427
428	/// Return an owned version of this sparse DFA. Specifically, the DFA
429	/// returned always uses `Vec<u8>` for its transitions.
430	///
431	/// Effectively, this returns a sparse DFA whose transitions live on the
432	/// heap.
433	#[cfg(feature = "alloc")]
434	pub fn to_owned(&self) -> DFA<Vec<u8>> {
435	DFA {
436	trans: self.trans.to_owned(),
437	starts: self.starts.to_owned(),
438	special: self.special,
439	}
440	}
441
442	/// Returns the memory usage, in bytes, of this DFA.
443	///
444	/// The memory usage is computed based on the number of bytes used to
445	/// represent this DFA.
446	///
447	/// This does not* include the stack size used up by this DFA. To*
448	/// compute that, use `std::mem::size_of::<sparse::DFA>()`.
449	pub fn memory_usage(&self) -> usize {
450	self.trans.memory_usage() + self.starts.memory_usage()
451	}
452
453	/// Returns true only if this DFA has starting states for each pattern.
454	///
455	/// When a DFA has starting states for each pattern, then a search with the
456	/// DFA can be configured to only look for anchored matches of a specific
457	/// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
458	/// can accept a non-None `pattern_id` if and only if this method returns
459	/// true. Otherwise, calling `find_earliest_fwd_at` will panic.
460	///
461	/// Note that if the DFA is empty, this always returns false.
462	pub fn has_starts_for_each_pattern(&self) -> bool {
463	self.starts.patterns > `0`
464	}
465	}
466
467	/// Routines for converting a sparse DFA to other representations, such as raw
468	/// bytes suitable for persistent storage.
469	impl<T: AsRef<[u8]>> DFA<T> {
470	/// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
471	/// format.
472	///
473	/// The written bytes are guaranteed to be deserialized correctly and
474	/// without errors in a semver compatible release of this crate by a
475	/// `DFA`'s deserialization APIs (assuming all other criteria for the
476	/// deserialization APIs has been satisfied):
477	///
478	/// [`DFA::from_bytes`]*
479	/// [`DFA::from_bytes_unchecked`]*
480	///
481	/// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
482	/// serialization methods, this does not add any initial padding to the
483	/// returned bytes. Padding isn't required for sparse DFAs since they have
484	/// no alignment requirements.
485	///
486	/// # Example
487	///
488	/// This example shows how to serialize and deserialize a DFA:
489	///
490	/// ```
491	/// use regex_automata::{
492	/// dfa::{Automaton, sparse::DFA},
493	/// HalfMatch,
494	/// };
495	///
496	/// // Compile our original DFA.
497	/// let original_dfa = DFA::new("foo[0-9]+")?;
498	///
499	/// // N.B. We use native endianness here to make the example work, but
500	/// // using to_bytes_little_endian would work on a little endian target.
501	/// let buf = original_dfa.to_bytes_native_endian();
502	/// // Even if buf has initial padding, DFA::from_bytes will automatically
503	/// // ignore it.
504	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
505	///
506	/// let expected = HalfMatch::must(0, 8);
507	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
508	/// # Ok::<(), Box<dyn std::error::Error>>(())
509	/// ```
510	#[cfg(feature = "alloc")]
511	pub fn to_bytes_little_endian(&self) -> Vec<u8> {
512	self.to_bytes::<bytes::LE>()
513	}
514
515	/// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
516	/// format.
517	///
518	/// The written bytes are guaranteed to be deserialized correctly and
519	/// without errors in a semver compatible release of this crate by a
520	/// `DFA`'s deserialization APIs (assuming all other criteria for the
521	/// deserialization APIs has been satisfied):
522	///
523	/// [`DFA::from_bytes`]*
524	/// [`DFA::from_bytes_unchecked`]*
525	///
526	/// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
527	/// serialization methods, this does not add any initial padding to the
528	/// returned bytes. Padding isn't required for sparse DFAs since they have
529	/// no alignment requirements.
530	///
531	/// # Example
532	///
533	/// This example shows how to serialize and deserialize a DFA:
534	///
535	/// ```
536	/// use regex_automata::{
537	/// dfa::{Automaton, sparse::DFA},
538	/// HalfMatch,
539	/// };
540	///
541	/// // Compile our original DFA.
542	/// let original_dfa = DFA::new("foo[0-9]+")?;
543	///
544	/// // N.B. We use native endianness here to make the example work, but
545	/// // using to_bytes_big_endian would work on a big endian target.
546	/// let buf = original_dfa.to_bytes_native_endian();
547	/// // Even if buf has initial padding, DFA::from_bytes will automatically
548	/// // ignore it.
549	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
550	///
551	/// let expected = HalfMatch::must(0, 8);
552	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
553	/// # Ok::<(), Box<dyn std::error::Error>>(())
554	/// ```
555	#[cfg(feature = "alloc")]
556	pub fn to_bytes_big_endian(&self) -> Vec<u8> {
557	self.to_bytes::<bytes::BE>()
558	}
559
560	/// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
561	/// format.
562	///
563	/// The written bytes are guaranteed to be deserialized correctly and
564	/// without errors in a semver compatible release of this crate by a
565	/// `DFA`'s deserialization APIs (assuming all other criteria for the
566	/// deserialization APIs has been satisfied):
567	///
568	/// [`DFA::from_bytes`]*
569	/// [`DFA::from_bytes_unchecked`]*
570	///
571	/// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
572	/// serialization methods, this does not add any initial padding to the
573	/// returned bytes. Padding isn't required for sparse DFAs since they have
574	/// no alignment requirements.
575	///
576	/// Generally speaking, native endian format should only be used when
577	/// you know that the target you're compiling the DFA for matches the
578	/// endianness of the target on which you're compiling DFA. For example,
579	/// if serialization and deserialization happen in the same process or on
580	/// the same machine. Otherwise, when serializing a DFA for use in a
581	/// portable environment, you'll almost certainly want to serialize _both_
582	/// a little endian and a big endian version and then load the correct one
583	/// based on the target's configuration.
584	///
585	/// # Example
586	///
587	/// This example shows how to serialize and deserialize a DFA:
588	///
589	/// ```
590	/// use regex_automata::{
591	/// dfa::{Automaton, sparse::DFA},
592	/// HalfMatch,
593	/// };
594	///
595	/// // Compile our original DFA.
596	/// let original_dfa = DFA::new("foo[0-9]+")?;
597	///
598	/// let buf = original_dfa.to_bytes_native_endian();
599	/// // Even if buf has initial padding, DFA::from_bytes will automatically
600	/// // ignore it.
601	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
602	///
603	/// let expected = HalfMatch::must(0, 8);
604	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
605	/// # Ok::<(), Box<dyn std::error::Error>>(())
606	/// ```
607	#[cfg(feature = "alloc")]
608	pub fn to_bytes_native_endian(&self) -> Vec<u8> {
609	self.to_bytes::<bytes::NE>()
610	}
611
612	/// The implementation of the public `to_bytes` serialization methods,
613	/// which is generic over endianness.
614	#[cfg(feature = "alloc")]
615	fn to_bytes<E: Endian>(&self) -> Vec<u8> {
616	let mut buf = vec![`0`; self.write_to_len()];
617	// This should always succeed since the only possible serialization
618	// error is providing a buffer that's too small, but we've ensured that
619	// `buf` is big enough here.
620	self.write_to::<E>(&mut buf).unwrap();
621	buf
622	}
623
624	/// Serialize this DFA as raw bytes to the given slice, in little endian
625	/// format. Upon success, the total number of bytes written to `dst` is
626	/// returned.
627	///
628	/// The written bytes are guaranteed to be deserialized correctly and
629	/// without errors in a semver compatible release of this crate by a
630	/// `DFA`'s deserialization APIs (assuming all other criteria for the
631	/// deserialization APIs has been satisfied):
632	///
633	/// * [`DFA::from_bytes`]
634	/// * [`DFA::from_bytes_unchecked`]
635	///
636	/// # Errors
637	///
638	/// This returns an error if the given destination slice is not big enough
639	/// to contain the full serialized DFA. If an error occurs, then nothing
640	/// is written to `dst`.
641	///
642	/// # Example
643	///
644	/// This example shows how to serialize and deserialize a DFA without
645	/// dynamic memory allocation.
646	///
647	/// ```
648	/// use regex_automata::{
649	/// dfa::{Automaton, sparse::DFA},
650	/// HalfMatch,
651	/// };
652	///
653	/// // Compile our original DFA.
654	/// let original_dfa = DFA::new("foo[0-9]+")?;
655	///
656	/// // Create a 4KB buffer on the stack to store our serialized DFA.
657	/// let mut buf = [`0u8`; `4` * (`1`<<`10`)];
658	/// // N.B. We use native endianness here to make the example work, but
659	/// // using write_to_little_endian would work on a little endian target.
660	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
661	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
662	///
663	/// let expected = HalfMatch::must(`0`, `8`);
664	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
665	/// # Ok::<(), Box<dyn std::error::Error>>(())
666	/// ```
667	pub fn write_to_little_endian(
668	&self,
669	dst: &mut [u8],
670	) -> Result<usize, SerializeError> {
671	self.write_to::<bytes::LE>(dst)
672	}
673
674	/// Serialize this DFA as raw bytes to the given slice, in big endian
675	/// format. Upon success, the total number of bytes written to `dst` is
676	/// returned.
677	///
678	/// The written bytes are guaranteed to be deserialized correctly and
679	/// without errors in a semver compatible release of this crate by a
680	/// `DFA`'s deserialization APIs (assuming all other criteria for the
681	/// deserialization APIs has been satisfied):
682	///
683	/// * [`DFA::from_bytes`]
684	/// * [`DFA::from_bytes_unchecked`]
685	///
686	/// # Errors
687	///
688	/// This returns an error if the given destination slice is not big enough
689	/// to contain the full serialized DFA. If an error occurs, then nothing
690	/// is written to `dst`.
691	///
692	/// # Example
693	///
694	/// This example shows how to serialize and deserialize a DFA without
695	/// dynamic memory allocation.
696	///
697	/// ```
698	/// use regex_automata::{
699	/// dfa::{Automaton, sparse::DFA},
700	/// HalfMatch,
701	/// };
702	///
703	/// // Compile our original DFA.
704	/// let original_dfa = DFA::new("foo[0-9]+")?;
705	///
706	/// // Create a 4KB buffer on the stack to store our serialized DFA.
707	/// let mut buf = [`0u8`; `4` * (`1`<<`10`)];
708	/// // N.B. We use native endianness here to make the example work, but
709	/// // using write_to_big_endian would work on a big endian target.
710	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
711	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
712	///
713	/// let expected = HalfMatch::must(`0`, `8`);
714	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
715	/// # Ok::<(), Box<dyn std::error::Error>>(())
716	/// ```
717	pub fn write_to_big_endian(
718	&self,
719	dst: &mut [u8],
720	) -> Result<usize, SerializeError> {
721	self.write_to::<bytes::BE>(dst)
722	}
723
724	/// Serialize this DFA as raw bytes to the given slice, in native endian
725	/// format. Upon success, the total number of bytes written to `dst` is
726	/// returned.
727	///
728	/// The written bytes are guaranteed to be deserialized correctly and
729	/// without errors in a semver compatible release of this crate by a
730	/// `DFA`'s deserialization APIs (assuming all other criteria for the
731	/// deserialization APIs has been satisfied):
732	///
733	/// * [`DFA::from_bytes`]
734	/// * [`DFA::from_bytes_unchecked`]
735	///
736	/// Generally speaking, native endian format should only be used when
737	/// you know that the target you're compiling the DFA for matches the
738	/// endianness of the target on which you're compiling DFA. For example,
739	/// if serialization and deserialization happen in the same process or on
740	/// the same machine. Otherwise, when serializing a DFA for use in a
741	/// portable environment, you'll almost certainly want to serialize _both_
742	/// a little endian and a big endian version and then load the correct one
743	/// based on the target's configuration.
744	///
745	/// # Errors
746	///
747	/// This returns an error if the given destination slice is not big enough
748	/// to contain the full serialized DFA. If an error occurs, then nothing
749	/// is written to `dst`.
750	///
751	/// # Example
752	///
753	/// This example shows how to serialize and deserialize a DFA without
754	/// dynamic memory allocation.
755	///
756	/// ```
757	/// use regex_automata::{
758	/// dfa::{Automaton, sparse::DFA},
759	/// HalfMatch,
760	/// };
761	///
762	/// // Compile our original DFA.
763	/// let original_dfa = DFA::new("foo[0-9]+")?;
764	///
765	/// // Create a 4KB buffer on the stack to store our serialized DFA.
766	/// let mut buf = [`0u8`; `4` * (`1`<<`10`)];
767	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
768	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
769	///
770	/// let expected = HalfMatch::must(`0`, `8`);
771	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
772	/// # Ok::<(), Box<dyn std::error::Error>>(())
773	/// ```
774	pub fn write_to_native_endian(
775	&self,
776	dst: &mut [u8],
777	) -> Result<usize, SerializeError> {
778	self.write_to::<bytes::NE>(dst)
779	}
780
781	/// The implementation of the public `write_to` serialization methods,
782	/// which is generic over endianness.
783	fn write_to<E: Endian>(
784	&self,
785	dst: &mut [u8],
786	) -> Result<usize, SerializeError> {
787	let mut nw = `0`;
788	nw += bytes::write_label(LABEL, &mut dst[nw..])?;
789	nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
790	nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
791	nw += {
792	// Currently unused, intended for future flexibility
793	E::write_u32(`0`, &mut dst[nw..]);
794	size_of::<u32>()
795	};
796	nw += self.trans.write_to::<E>(&mut dst[nw..])?;
797	nw += self.starts.write_to::<E>(&mut dst[nw..])?;
798	nw += self.special.write_to::<E>(&mut dst[nw..])?;
799	Ok(nw)
800	}
801
802	/// Return the total number of bytes required to serialize this DFA.
803	///
804	/// This is useful for determining the size of the buffer required to pass
805	/// to one of the serialization routines:
806	///
807	/// * [`DFA::write_to_little_endian`]
808	/// * [`DFA::write_to_big_endian`]
809	/// * [`DFA::write_to_native_endian`]
810	///
811	/// Passing a buffer smaller than the size returned by this method will
812	/// result in a serialization error.
813	///
814	/// # Example
815	///
816	/// This example shows how to dynamically allocate enough room to serialize
817	/// a sparse DFA.
818	///
819	/// ```
820	/// use regex_automata::{
821	/// dfa::{Automaton, sparse::DFA},
822	/// HalfMatch,
823	/// };
824	///
825	/// // Compile our original DFA.
826	/// let original_dfa = DFA::new("foo[0-9]+")?;
827	///
828	/// let mut buf = vec![`0`; original_dfa.write_to_len()];
829	/// let written = original_dfa.write_to_native_endian(&mut buf)?;
830	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
831	///
832	/// let expected = HalfMatch::must(`0`, `8`);
833	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
834	/// # Ok::<(), Box<dyn std::error::Error>>(())
835	/// ```
836	pub fn write_to_len(&self) -> usize {
837	bytes::write_label_len(LABEL)
838	+ bytes::write_endianness_check_len()
839	+ bytes::write_version_len()
840	+ size_of::<u32>() // unused, intended for future flexibility
841	+ self.trans.write_to_len()
842	+ self.starts.write_to_len()
843	+ self.special.write_to_len()
844	}
845	}
846
847	impl<'a> DFA<&'a [u8]> {
848	/// Safely deserialize a sparse DFA with a specific state identifier
849	/// representation. Upon success, this returns both the deserialized DFA
850	/// and the number of bytes read from the given slice. Namely, the contents
851	/// of the slice beyond the DFA are not read.
852	///
853	/// Deserializing a DFA using this routine will never allocate heap memory.
854	/// For safety purposes, the DFA's transitions will be verified such that
855	/// every transition points to a valid state. If this verification is too
856	/// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
857	/// will always execute in constant time.
858	///
859	/// The bytes given must be generated by one of the serialization APIs
860	/// of a `DFA` using a semver compatible release of this crate. Those
861	/// include:
862	///
863	/// [`DFA::to_bytes_little_endian`]*
864	/// [`DFA::to_bytes_big_endian`]*
865	/// [`DFA::to_bytes_native_endian`]*
866	/// * [`DFA::write_to_little_endian`]
867	/// * [`DFA::write_to_big_endian`]
868	/// * [`DFA::write_to_native_endian`]
869	///
870	/// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
871	/// `write_to` methods do not allocate and write to an existing slice
872	/// (which may be on the stack). Since deserialization always uses the
873	/// native endianness of the target platform, the serialization API you use
874	/// should match the endianness of the target platform. (It's often a good
875	/// idea to generate serialized DFAs for both forms of endianness and then
876	/// load the correct one based on endianness.)
877	///
878	/// # Errors
879	///
880	/// Generally speaking, it's easier to state the conditions in which an
881	/// error is _not_ returned. All of the following must be true:
882	///
883	/// The bytes given must be produced by one of the serialization APIs*
884	/// on this DFA, as mentioned above.
885	/// The endianness of the target platform matches the endianness used to*
886	/// serialized the provided DFA.
887	///
888	/// If any of the above are not true, then an error will be returned.
889	///
890	/// Note that unlike deserializing a
891	/// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
892	/// no alignment requirements. That is, an alignment of `1` is valid.
893	///
894	/// # Panics
895	///
896	/// This routine will never panic for any input.
897	///
898	/// # Example
899	///
900	/// This example shows how to serialize a DFA to raw bytes, deserialize it
901	/// and then use it for searching.
902	///
903	/// ```
904	/// use regex_automata::{
905	/// dfa::{Automaton, sparse::DFA},
906	/// HalfMatch,
907	/// };
908	///
909	/// let initial = DFA::new("foo[0-9]+")?;
910	/// let bytes = initial.to_bytes_native_endian();
911	/// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
912	///
913	/// let expected = HalfMatch::must(`0`, `8`);
914	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
915	/// # Ok::<(), Box<dyn std::error::Error>>(())
916	/// ```
917	///
918	/// # Example: loading a DFA from static memory
919	///
920	/// One use case this library supports is the ability to serialize a
921	/// DFA to disk and then use `include_bytes!` to store it in a compiled
922	/// Rust program. Those bytes can then be cheaply deserialized into a
923	/// `DFA` structure at runtime and used for searching without having to
924	/// re-compile the DFA (which can be quite costly).
925	///
926	/// We can show this in two parts. The first part is serializing the DFA to
927	/// a file:
928	///
929	/// ```no_run
930	/// use regex_automata::dfa::{Automaton, sparse::DFA};
931	///
932	/// let dfa = DFA::new("foo[0-9]+")?;
933	///
934	/// // Write a big endian serialized version of this DFA to a file.
935	/// let bytes = dfa.to_bytes_big_endian();
936	/// std::fs::write("foo.bigendian.dfa", &bytes)?;
937	///
938	/// // Do it again, but this time for little endian.
939	/// let bytes = dfa.to_bytes_little_endian();
940	/// std::fs::write("foo.littleendian.dfa", &bytes)?;
941	/// # Ok::<(), Box<dyn std::error::Error>>(())
942	/// ```
943	///
944	/// And now the second part is embedding the DFA into the compiled program
945	/// and deserializing it at runtime on first use. We use conditional
946	/// compilation to choose the correct endianness. As mentioned above, we
947	/// do not need to employ any special tricks to ensure a proper alignment,
948	/// since a sparse DFA has no alignment requirements.
949	///
950	/// ```no_run
951	/// use regex_automata::{
952	/// dfa::{Automaton, sparse},
953	/// HalfMatch,
954	/// };
955	///
956	/// type DFA = sparse::DFA<&'static [u8]>;
957	///
958	/// fn get_foo() -> &'static DFA {
959	/// use std::cell::Cell;
960	/// use std::mem::MaybeUninit;
961	/// use std::sync::Once;
962	///
963	/// # const _: &str = stringify! {
964	/// #[cfg(target_endian = "big")]
965	/// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
966	/// #[cfg(target_endian = "little")]
967	/// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
968	/// # };
969	/// # static BYTES: &[u8] = b"";
970	///
971	/// struct Lazy(Cell<MaybeUninit<DFA>>);
972	/// // SAFETY: This is safe because DFA impls Sync.
973	/// unsafe impl Sync for Lazy {}
974	///
975	/// static INIT: Once = Once::new();
976	/// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
977	///
978	/// INIT.call_once(\|\| {
979	/// let (dfa, _) = DFA::from_bytes(BYTES)
980	/// .expect("serialized DFA should be valid");
981	/// // SAFETY: This is guaranteed to only execute once, and all
982	/// // we do with the pointer is write the DFA to it.
983	/// unsafe {
984	/// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
985	/// }
986	/// });
987	/// // SAFETY: DFA is guaranteed to by initialized via INIT and is
988	/// // stored in static memory.
989	/// unsafe {
990	/// let dfa = (*DFA.0.as_ptr()).as_ptr();
991	/// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
992	/// }
993	/// }
994	///
995	/// let dfa = get_foo();
996	/// let expected = HalfMatch::must(`0`, `8`);
997	/// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
998	/// ```
999	///
1000	/// Alternatively, consider using
1001	/// [`lazy_static`](https://crates.io/crates/lazy_static)
1002	/// or
1003	/// [`once_cell`](https://crates.io/crates/once_cell),
1004	/// which will guarantee safety for you.
1005	pub fn from_bytes(
1006	slice: &'a [u8],
1007	) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1008	// SAFETY: This is safe because we validate both the sparse transitions
1009	// (by trying to decode every state) and start state ID list below. If
1010	// either validation fails, then we return an error.
1011	let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
1012	dfa.trans.validate()?;
1013	dfa.starts.validate(&dfa.trans)?;
1014	// N.B. dfa.special doesn't have a way to do unchecked deserialization,
1015	// so it has already been validated.
1016	Ok((dfa, nread))
1017	}
1018
1019	/// Deserialize a DFA with a specific state identifier representation in
1020	/// constant time by omitting the verification of the validity of the
1021	/// sparse transitions.
1022	///
1023	/// This is just like [`DFA::from_bytes`], except it can potentially return
1024	/// a DFA that exhibits undefined behavior if its transitions contains
1025	/// invalid state identifiers.
1026	///
1027	/// This routine is useful if you need to deserialize a DFA cheaply and
1028	/// cannot afford the transition validation performed by `from_bytes`.
1029	///
1030	/// # Safety
1031	///
1032	/// This routine is unsafe because it permits callers to provide
1033	/// arbitrary transitions with possibly incorrect state identifiers. While
1034	/// the various serialization routines will never return an incorrect
1035	/// DFA, there is no guarantee that the bytes provided here
1036	/// are correct. While `from_bytes_unchecked` will still do several forms
1037	/// of basic validation, this routine does not check that the transitions
1038	/// themselves are correct. Given an incorrect transition table, it is
1039	/// possible for the search routines to access out-of-bounds memory because
1040	/// of explicit bounds check elision.
1041	///
1042	/// # Example
1043	///
1044	/// ```
1045	/// use regex_automata::{
1046	/// dfa::{Automaton, sparse::DFA},
1047	/// HalfMatch,
1048	/// };
1049	///
1050	/// let initial = DFA::new("foo[0-9]+")?;
1051	/// let bytes = initial.to_bytes_native_endian();
1052	/// // SAFETY: This is guaranteed to be safe since the bytes given come
1053	/// // directly from a compatible serialization routine.
1054	/// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
1055	///
1056	/// let expected = HalfMatch::must(`0`, `8`);
1057	/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1058	/// # Ok::<(), Box<dyn std::error::Error>>(())
1059	/// ```
1060	pub unsafe fn from_bytes_unchecked(
1061	slice: &'a [u8],
1062	) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1063	let mut nr = `0`;
1064
1065	nr += bytes::read_label(&slice[nr..], LABEL)?;
1066	nr += bytes::read_endianness_check(&slice[nr..])?;
1067	nr += bytes::read_version(&slice[nr..], VERSION)?;
1068
1069	let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
1070	nr += size_of::<u32>();
1071
1072	let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
1073	nr += nread;
1074
1075	let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
1076	nr += nread;
1077
1078	let (special, nread) = Special::from_bytes(&slice[nr..])?;
1079	nr += nread;
1080	if special.max.as_usize() >= trans.sparse().len() {
1081	return Err(DeserializeError::generic(
1082	"max should not be greater than or equal to sparse bytes",
1083	));
1084	}
1085
1086	Ok((DFA { trans, starts, special }, nr))
1087	}
1088	}
1089
1090	impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
1091	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1092	writeln!(f, "sparse::DFA(")?;
1093	for state: State<'_> in self.trans.states() {
1094	fmt_state_indicator(f, self, state.id())?;
1095	writeln!(f, "{:`06`?}: {:?}", state.id(), state)?;
1096	}
1097	writeln!(f, "")?;
1098	for (i: usize, (start_id: StateID, sty: Start, pid: Option)) in self.starts.iter().enumerate() {
1099	if i % self.starts.stride == `0` {
1100	match pid {
1101	None => writeln!(f, "START-GROUP(ALL)")?,
1102	Some(pid: PatternID) => {
1103	writeln!(f, "START_GROUP(pattern: {:?})", pid)?
1104	}
1105	}
1106	}
1107	writeln!(f, " {:?} => {:`06`?}", sty, start_id.as_usize())?;
1108	}
1109	writeln!(f, "state count: {:?}", self.trans.count)?;
1110	writeln!(f, ")")?;
1111	Ok(())
1112	}
1113	}
1114
1115	unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
1116	#[inline]
1117	fn is_special_state(&self, id: StateID) -> bool {
1118	self.special.is_special_state(id)
1119	}
1120
1121	#[inline]
1122	fn is_dead_state(&self, id: StateID) -> bool {
1123	self.special.is_dead_state(id)
1124	}
1125
1126	#[inline]
1127	fn is_quit_state(&self, id: StateID) -> bool {
1128	self.special.is_quit_state(id)
1129	}
1130
1131	#[inline]
1132	fn is_match_state(&self, id: StateID) -> bool {
1133	self.special.is_match_state(id)
1134	}
1135
1136	#[inline]
1137	fn is_start_state(&self, id: StateID) -> bool {
1138	self.special.is_start_state(id)
1139	}
1140
1141	#[inline]
1142	fn is_accel_state(&self, id: StateID) -> bool {
1143	self.special.is_accel_state(id)
1144	}
1145
1146	// This is marked as inline to help dramatically boost sparse searching,
1147	// which decodes each state it enters to follow the next transition.
1148	#[inline(always)]
1149	fn next_state(&self, current: StateID, input: u8) -> StateID {
1150	let input = self.trans.classes.get(input);
1151	self.trans.state(current).next(input)
1152	}
1153
1154	#[inline]
1155	unsafe fn next_state_unchecked(
1156	&self,
1157	current: StateID,
1158	input: u8,
1159	) -> StateID {
1160	self.next_state(current, input)
1161	}
1162
1163	#[inline]
1164	fn next_eoi_state(&self, current: StateID) -> StateID {
1165	self.trans.state(current).next_eoi()
1166	}
1167
1168	#[inline]
1169	fn pattern_count(&self) -> usize {
1170	self.trans.patterns
1171	}
1172
1173	#[inline]
1174	fn match_count(&self, id: StateID) -> usize {
1175	self.trans.state(id).pattern_count()
1176	}
1177
1178	#[inline]
1179	fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
1180	// This is an optimization for the very common case of a DFA with a
1181	// single pattern. This conditional avoids a somewhat more costly path
1182	// that finds the pattern ID from the state machine, which requires
1183	// a bit of slicing/pointer-chasing. This optimization tends to only
1184	// matter when matches are frequent.
1185	if self.trans.patterns == `1` {
1186	return PatternID::ZERO;
1187	}
1188	self.trans.state(id).pattern_id(match_index)
1189	}
1190
1191	#[inline]
1192	fn start_state_forward(
1193	&self,
1194	pattern_id: Option<PatternID>,
1195	bytes: &[u8],
1196	start: usize,
1197	end: usize,
1198	) -> StateID {
1199	let index = Start::from_position_fwd(bytes, start, end);
1200	self.starts.start(index, pattern_id)
1201	}
1202
1203	#[inline]
1204	fn start_state_reverse(
1205	&self,
1206	pattern_id: Option<PatternID>,
1207	bytes: &[u8],
1208	start: usize,
1209	end: usize,
1210	) -> StateID {
1211	let index = Start::from_position_rev(bytes, start, end);
1212	self.starts.start(index, pattern_id)
1213	}
1214
1215	#[inline]
1216	fn accelerator(&self, id: StateID) -> &[u8] {
1217	self.trans.state(id).accelerator()
1218	}
1219	}
1220
1221	/// The transition table portion of a sparse DFA.
1222	///
1223	/// The transition table is the core part of the DFA in that it describes how
1224	/// to move from one state to another based on the input sequence observed.
1225	///
1226	/// Unlike a typical dense table based DFA, states in a sparse transition
1227	/// table have variable size. That is, states with more transitions use more
1228	/// space than states with fewer transitions. This means that finding the next
1229	/// transition takes more work than with a dense DFA, but also typically uses
1230	/// much less space.
1231	#[derive(Clone)]
1232	struct Transitions<T> {
1233	/// The raw encoding of each state in this DFA.
1234	///
1235	/// Each state has the following information:
1236	///
1237	/// A set of transitions to subsequent states. Transitions to the dead*
1238	/// state are omitted.
1239	/// If the state can be accelerated, then any additional accelerator*
1240	/// information.
1241	/// If the state is a match state, then the state contains all pattern*
1242	/// IDs that match when in that state.
1243	///
1244	/// To decode a state, use Transitions::state.
1245	///
1246	/// In practice, T is either Vec<u8> or &[u8].
1247	sparse: T,
1248	/// A set of equivalence classes, where a single equivalence class
1249	/// represents a set of bytes that never discriminate between a match
1250	/// and a non-match in the DFA. Each equivalence class corresponds to a
1251	/// single character in this DFA's alphabet, where the maximum number of
1252	/// characters is 257 (each possible value of a byte plus the special
1253	/// EOI transition). Consequently, the number of equivalence classes
1254	/// corresponds to the number of transitions for each DFA state. Note
1255	/// though that the space* used by each DFA state in the transition table*
1256	/// may be larger. The total space used by each DFA state is known as the
1257	/// stride and is documented above.
1258	///
1259	/// The only time the number of equivalence classes is fewer than 257 is
1260	/// if the DFA's kind uses byte classes which is the default. Equivalence
1261	/// classes should generally only be disabled when debugging, so that
1262	/// the transitions themselves aren't obscured. Disabling them has no
1263	/// other benefit, since the equivalence class map is always used while
1264	/// searching. In the vast majority of cases, the number of equivalence
1265	/// classes is substantially smaller than 257, particularly when large
1266	/// Unicode classes aren't used.
1267	///
1268	/// N.B. Equivalence classes aren't particularly useful in a sparse DFA
1269	/// in the current implementation, since equivalence classes generally tend
1270	/// to correspond to continuous ranges of bytes that map to the same
1271	/// transition. So in a sparse DFA, equivalence classes don't really lead
1272	/// to a space savings. In the future, it would be good to try and remove
1273	/// them from sparse DFAs entirely, but requires a bit of work since sparse
1274	/// DFAs are built from dense DFAs, which are in turn built on top of
1275	/// equivalence classes.
1276	classes: ByteClasses,
1277	/// The total number of states in this DFA. Note that a DFA always has at
1278	/// least one state---the dead state---even the empty DFA. In particular,
1279	/// the dead state always has ID 0 and is correspondingly always the first
1280	/// state. The dead state is never a match state.
1281	count: usize,
1282	/// The total number of unique patterns represented by these match states.
1283	patterns: usize,
1284	}
1285
1286	impl<'a> Transitions<&'a [u8]> {
1287	unsafe fn from_bytes_unchecked(
1288	mut slice: &'a [u8],
1289	) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
1290	let slice_start = slice.as_ptr() as usize;
1291
1292	let (state_count, nr) =
1293	bytes::try_read_u32_as_usize(&slice, "state count")?;
1294	slice = &slice[nr..];
1295
1296	let (pattern_count, nr) =
1297	bytes::try_read_u32_as_usize(&slice, "pattern count")?;
1298	slice = &slice[nr..];
1299
1300	let (classes, nr) = ByteClasses::from_bytes(&slice)?;
1301	slice = &slice[nr..];
1302
1303	let (len, nr) =
1304	bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
1305	slice = &slice[nr..];
1306
1307	bytes::check_slice_len(slice, len, "sparse states byte length")?;
1308	let sparse = &slice[..len];
1309	slice = &slice[len..];
1310
1311	let trans = Transitions {
1312	sparse,
1313	classes,
1314	count: state_count,
1315	patterns: pattern_count,
1316	};
1317	Ok((trans, slice.as_ptr() as usize - slice_start))
1318	}
1319	}
1320
1321	impl<T: AsRef<[u8]>> Transitions<T> {
1322	/// Writes a serialized form of this transition table to the buffer given.
1323	/// If the buffer is too small, then an error is returned. To determine
1324	/// how big the buffer must be, use `write_to_len`.
1325	fn write_to<E: Endian>(
1326	&self,
1327	mut dst: &mut [u8],
1328	) -> Result<usize, SerializeError> {
1329	let nwrite = self.write_to_len();
1330	if dst.len() < nwrite {
1331	return Err(SerializeError::buffer_too_small(
1332	"sparse transition table",
1333	));
1334	}
1335	dst = &mut dst[..nwrite];
1336
1337	// write state count
1338	E::write_u32(u32::try_from(self.count).unwrap(), dst);
1339	dst = &mut dst[size_of::<u32>()..];
1340
1341	// write pattern count
1342	E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
1343	dst = &mut dst[size_of::<u32>()..];
1344
1345	// write byte class map
1346	let n = self.classes.write_to(dst)?;
1347	dst = &mut dst[n..];
1348
1349	// write number of bytes in sparse transitions
1350	E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
1351	dst = &mut dst[size_of::<u32>()..];
1352
1353	// write actual transitions
1354	dst.copy_from_slice(self.sparse());
1355	Ok(nwrite)
1356	}
1357
1358	/// Returns the number of bytes the serialized form of this transition
1359	/// table will use.
1360	fn write_to_len(&self) -> usize {
1361	size_of::<u32>() // state count
1362	+ size_of::<u32>() // pattern count
1363	+ self.classes.write_to_len()
1364	+ size_of::<u32>() // sparse transitions length
1365	+ self.sparse().len()
1366	}
1367
1368	/// Validates that every state ID in this transition table is valid.
1369	///
1370	/// That is, every state ID can be used to correctly index a state in this
1371	/// table.
1372	fn validate(&self) -> Result<(), DeserializeError> {
1373	// In order to validate everything, we not only need to make sure we
1374	// can decode every state, but that every transition in every state
1375	// points to a valid state. There are many duplicative transitions, so
1376	// we record state IDs that we've verified so that we don't redo the
1377	// decoding work.
1378	//
1379	// Except, when in no_std mode, we don't have dynamic memory allocation
1380	// available to us, so we skip this optimization. It's not clear
1381	// whether doing something more clever is worth it just yet. If you're
1382	// profiling this code and need it to run faster, please file an issue.
1383	//
1384	// ---AG
1385	struct Seen {
1386	#[cfg(feature = "alloc")]
1387	set: BTreeSet<StateID>,
1388	#[cfg(not(feature = "alloc"))]
1389	set: core::marker::PhantomData<StateID>,
1390	}
1391
1392	#[cfg(feature = "alloc")]
1393	impl Seen {
1394	fn new() -> Seen {
1395	Seen { set: BTreeSet::new() }
1396	}
1397	fn insert(&mut self, id: StateID) {
1398	self.set.insert(id);
1399	}
1400	fn contains(&self, id: &StateID) -> bool {
1401	self.set.contains(id)
1402	}
1403	}
1404
1405	#[cfg(not(feature = "alloc"))]
1406	impl Seen {
1407	fn new() -> Seen {
1408	Seen { set: core::marker::PhantomData }
1409	}
1410	fn insert(&mut self, _id: StateID) {}
1411	fn contains(&self, _id: &StateID) -> bool {
1412	`false`
1413	}
1414	}
1415
1416	let mut verified: Seen = Seen::new();
1417	// We need to make sure that we decode the correct number of states.
1418	// Otherwise, an empty set of transitions would validate even if the
1419	// recorded state count is non-empty.
1420	let mut count = `0`;
1421	// We can't use the self.states() iterator because it assumes the state
1422	// encodings are valid. It could panic if they aren't.
1423	let mut id = DEAD;
1424	while id.as_usize() < self.sparse().len() {
1425	let state = self.try_state(id)?;
1426	verified.insert(id);
1427	// The next ID should be the offset immediately following `state`.
1428	id = StateID::new(bytes::add(
1429	id.as_usize(),
1430	state.bytes_len(),
1431	"next state ID offset",
1432	)?)
1433	.map_err(\|err\| {
1434	DeserializeError::state_id_error(err, "next state ID offset")
1435	})?;
1436	count += `1`;
1437
1438	// Now check that all transitions in this state are correct.
1439	for i in `0`..state.ntrans {
1440	let to = state.next_at(i);
1441	if verified.contains(&to) {
1442	continue;
1443	}
1444	let _ = self.try_state(to)?;
1445	verified.insert(id);
1446	}
1447	}
1448	if count != self.count {
1449	return Err(DeserializeError::generic(
1450	"mismatching sparse state count",
1451	));
1452	}
1453	Ok(())
1454	}
1455
1456	/// Converts these transitions to a borrowed value.
1457	fn as_ref(&self) -> Transitions<&'_ [u8]> {
1458	Transitions {
1459	sparse: self.sparse(),
1460	classes: self.classes.clone(),
1461	count: self.count,
1462	patterns: self.patterns,
1463	}
1464	}
1465
1466	/// Converts these transitions to an owned value.
1467	#[cfg(feature = "alloc")]
1468	fn to_owned(&self) -> Transitions<Vec<u8>> {
1469	Transitions {
1470	sparse: self.sparse().to_vec(),
1471	classes: self.classes.clone(),
1472	count: self.count,
1473	patterns: self.patterns,
1474	}
1475	}
1476
1477	/// Return a convenient representation of the given state.
1478	///
1479	/// This panics if the state is invalid.
1480	///
1481	/// This is marked as inline to help dramatically boost sparse searching,
1482	/// which decodes each state it enters to follow the next transition. Other
1483	/// functions involved are also inlined, which should hopefully eliminate
1484	/// a lot of the extraneous decoding that is never needed just to follow
1485	/// the next transition.
1486	#[inline(always)]
1487	fn state(&self, id: StateID) -> State<'_> {
1488	let mut state = &self.sparse()[id.as_usize()..];
1489	let mut ntrans = bytes::read_u16(&state) as usize;
1490	let is_match = (`1` << `15`) & ntrans != `0`;
1491	ntrans &= !(`1` << `15`);
1492	state = &state[`2`..];
1493
1494	let (input_ranges, state) = state.split_at(ntrans * `2`);
1495	let (next, state) = state.split_at(ntrans * StateID::SIZE);
1496	let (pattern_ids, state) = if is_match {
1497	let npats = bytes::read_u32(&state) as usize;
1498	state[`4`..].split_at(npats * `4`)
1499	} else {
1500	(&[][..], state)
1501	};
1502
1503	let accel_len = state[`0`] as usize;
1504	let accel = &state[`1`..accel_len + `1`];
1505	State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
1506	}
1507
1508	/// Like `state`, but will return an error if the state encoding is
1509	/// invalid. This is useful for verifying states after deserialization,
1510	/// which is required for a safe deserialization API.
1511	///
1512	/// Note that this only verifies that this state is decodable and that
1513	/// all of its data is consistent. It does not verify that its state ID
1514	/// transitions point to valid states themselves, nor does it verify that
1515	/// every pattern ID is valid.
1516	fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
1517	if id.as_usize() > self.sparse().len() {
1518	return Err(DeserializeError::generic("invalid sparse state ID"));
1519	}
1520	let mut state = &self.sparse()[id.as_usize()..];
1521	// Encoding format starts with a u16 that stores the total number of
1522	// transitions in this state.
1523	let (mut ntrans, _) =
1524	bytes::try_read_u16_as_usize(state, "state transition count")?;
1525	let is_match = ((`1` << `15`) & ntrans) != `0`;
1526	ntrans &= !(`1` << `15`);
1527	state = &state[`2`..];
1528	if ntrans > `257` \|\| ntrans == `0` {
1529	return Err(DeserializeError::generic("invalid transition count"));
1530	}
1531
1532	// Each transition has two pieces: an inclusive range of bytes on which
1533	// it is defined, and the state ID that those bytes transition to. The
1534	// pairs come first, followed by a corresponding sequence of state IDs.
1535	let input_ranges_len = ntrans.checked_mul(`2`).unwrap();
1536	bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
1537	let (input_ranges, state) = state.split_at(input_ranges_len);
1538	// Every range should be of the form A-B, where A<=B.
1539	for pair in input_ranges.chunks(`2`) {
1540	let (start, end) = (pair[`0`], pair[`1`]);
1541	if start > end {
1542	return Err(DeserializeError::generic("invalid input range"));
1543	}
1544	}
1545
1546	// And now extract the corresponding sequence of state IDs. We leave
1547	// this sequence as a &[u8] instead of a &[S] because sparse DFAs do
1548	// not have any alignment requirements.
1549	let next_len = ntrans
1550	.checked_mul(self.id_len())
1551	.expect("state size * #trans should always fit in a usize");
1552	bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
1553	let (next, state) = state.split_at(next_len);
1554	// We can at least verify that every state ID is in bounds.
1555	for idbytes in next.chunks(self.id_len()) {
1556	let (id, _) =
1557	bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
1558	bytes::check_slice_len(
1559	self.sparse(),
1560	id.as_usize(),
1561	"invalid sparse state ID",
1562	)?;
1563	}
1564
1565	// If this is a match state, then read the pattern IDs for this state.
1566	// Pattern IDs is a u32-length prefixed sequence of native endian
1567	// encoded 32-bit integers.
1568	let (pattern_ids, state) = if is_match {
1569	let (npats, nr) =
1570	bytes::try_read_u32_as_usize(state, "pattern ID count")?;
1571	let state = &state[nr..];
1572
1573	let pattern_ids_len =
1574	bytes::mul(npats, `4`, "sparse pattern ID byte length")?;
1575	bytes::check_slice_len(
1576	state,
1577	pattern_ids_len,
1578	"sparse pattern IDs",
1579	)?;
1580	let (pattern_ids, state) = state.split_at(pattern_ids_len);
1581	for patbytes in pattern_ids.chunks(PatternID::SIZE) {
1582	bytes::read_pattern_id(
1583	patbytes,
1584	"sparse pattern ID in try_state",
1585	)?;
1586	}
1587	(pattern_ids, state)
1588	} else {
1589	(&[][..], state)
1590	};
1591
1592	// Now read this state's accelerator info. The first byte is the length
1593	// of the accelerator, which is typically 0 (for no acceleration) but
1594	// is no bigger than 3. The length indicates the number of bytes that
1595	// follow, where each byte corresponds to a transition out of this
1596	// state.
1597	if state.is_empty() {
1598	return Err(DeserializeError::generic("no accelerator length"));
1599	}
1600	let (accel_len, state) = (state[`0`] as usize, &state[`1`..]);
1601
1602	if accel_len > `3` {
1603	return Err(DeserializeError::generic(
1604	"sparse invalid accelerator length",
1605	));
1606	}
1607	bytes::check_slice_len(
1608	state,
1609	accel_len,
1610	"sparse corrupt accelerator length",
1611	)?;
1612	let (accel, _) = (&state[..accel_len], &state[accel_len..]);
1613
1614	Ok(State {
1615	id,
1616	is_match,
1617	ntrans,
1618	input_ranges,
1619	next,
1620	pattern_ids,
1621	accel,
1622	})
1623	}
1624
1625	/// Return an iterator over all of the states in this DFA.
1626	///
1627	/// The iterator returned yields tuples, where the first element is the
1628	/// state ID and the second element is the state itself.
1629	fn states(&self) -> StateIter<'_, T> {
1630	StateIter { trans: self, id: DEAD.as_usize() }
1631	}
1632
1633	/// Returns the sparse transitions as raw bytes.
1634	fn sparse(&self) -> &[u8] {
1635	self.sparse.as_ref()
1636	}
1637
1638	/// Returns the number of bytes represented by a single state ID.
1639	fn id_len(&self) -> usize {
1640	StateID::SIZE
1641	}
1642
1643	/// Return the memory usage, in bytes, of these transitions.
1644	///
1645	/// This does not include the size of a `Transitions` value itself.
1646	fn memory_usage(&self) -> usize {
1647	self.sparse().len()
1648	}
1649	}
1650
1651	#[cfg(feature = "alloc")]
1652	impl<T: AsMut<[u8]>> Transitions<T> {
1653	/// Return a convenient mutable representation of the given state.
1654	/// This panics if the state is invalid.
1655	fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
1656	let mut state = &mut self.sparse_mut()[id.as_usize()..];
1657	let mut ntrans = bytes::read_u16(&state) as usize;
1658	let is_match = (`1` << `15`) & ntrans != `0`;
1659	ntrans &= !(`1` << `15`);
1660	state = &mut state[`2`..];
1661
1662	let (input_ranges, state) = state.split_at_mut(ntrans * `2`);
1663	let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
1664	let (pattern_ids, state) = if is_match {
1665	let npats = bytes::read_u32(&state) as usize;
1666	state[`4`..].split_at_mut(npats * `4`)
1667	} else {
1668	(&mut [][..], state)
1669	};
1670
1671	let accel_len = state[`0`] as usize;
1672	let accel = &mut state[`1`..accel_len + `1`];
1673	StateMut {
1674	id,
1675	is_match,
1676	ntrans,
1677	input_ranges,
1678	next,
1679	pattern_ids,
1680	accel,
1681	}
1682	}
1683
1684	/// Returns the sparse transitions as raw mutable bytes.
1685	fn sparse_mut(&mut self) -> &mut [u8] {
1686	self.sparse.as_mut()
1687	}
1688	}
1689
1690	/// The set of all possible starting states in a DFA.
1691	///
1692	/// See the eponymous type in the `dense` module for more details. This type
1693	/// is very similar to `dense::StartTable`, except that its underlying
1694	/// representation is `&[u8]` instead of `&[S]`. (The latter would require
1695	/// sparse DFAs to be aligned, which is explicitly something we do not require
1696	/// because we don't really need it.)
1697	#[derive(Clone)]
1698	struct StartTable<T> {
1699	/// The initial start state IDs as a contiguous table of native endian
1700	/// encoded integers, represented by `S`.
1701	///
1702	/// In practice, T is either Vec<u8> or &[u8] and has no alignment
1703	/// requirements.
1704	///
1705	/// The first `stride` (currently always 4) entries always correspond to
1706	/// the start states for the entire DFA. After that, there are
1707	/// `stride patterns` state IDs, where `patterns` may be zero in the*
1708	/// case of a DFA with no patterns or in the case where the DFA was built
1709	/// without enabling starting states for each pattern.
1710	table: T,
1711	/// The number of starting state IDs per pattern.
1712	stride: usize,
1713	/// The total number of patterns for which starting states are encoded.
1714	/// This may be zero for non-empty DFAs when the DFA was built without
1715	/// start states for each pattern.
1716	patterns: usize,
1717	}
1718
1719	#[cfg(feature = "alloc")]
1720	impl StartTable<Vec<u8>> {
1721	fn new(patterns: usize) -> StartTable<Vec<u8>> {
1722	let stride = Start::count();
1723	// This is OK since the only way we're here is if a dense DFA could be
1724	// constructed successfully, which uses the same space.
1725	let len = stride
1726	.checked_mul(patterns)
1727	.unwrap()
1728	.checked_add(stride)
1729	.unwrap()
1730	.checked_mul(StateID::SIZE)
1731	.unwrap();
1732	StartTable { table: vec![`0`; len], stride, patterns }
1733	}
1734
1735	fn from_dense_dfa<T: AsRef<[u32]>>(
1736	dfa: &dense::DFA<T>,
1737	remap: &[StateID],
1738	) -> Result<StartTable<Vec<u8>>, Error> {
1739	// Unless the DFA has start states compiled for each pattern, then
1740	// as far as the starting state table is concerned, there are zero
1741	// patterns to account for. It will instead only store starting states
1742	// for the entire DFA.
1743	let start_pattern_count = if dfa.has_starts_for_each_pattern() {
1744	dfa.pattern_count()
1745	} else {
1746	`0`
1747	};
1748	let mut sl = StartTable::new(start_pattern_count);
1749	for (old_start_id, sty, pid) in dfa.starts() {
1750	let new_start_id = remap[dfa.to_index(old_start_id)];
1751	sl.set_start(sty, pid, new_start_id);
1752	}
1753	Ok(sl)
1754	}
1755	}
1756
1757	impl<'a> StartTable<&'a [u8]> {
1758	unsafe fn from_bytes_unchecked(
1759	mut slice: &'a [u8],
1760	) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
1761	let slice_start = slice.as_ptr() as usize;
1762
1763	let (stride, nr) =
1764	bytes::try_read_u32_as_usize(slice, "sparse start table stride")?;
1765	slice = &slice[nr..];
1766
1767	let (patterns, nr) = bytes::try_read_u32_as_usize(
1768	slice,
1769	"sparse start table patterns",
1770	)?;
1771	slice = &slice[nr..];
1772
1773	if stride != Start::count() {
1774	return Err(DeserializeError::generic(
1775	"invalid sparse starting table stride",
1776	));
1777	}
1778	if patterns > PatternID::LIMIT {
1779	return Err(DeserializeError::generic(
1780	"sparse invalid number of patterns",
1781	));
1782	}
1783	let pattern_table_size =
1784	bytes::mul(stride, patterns, "sparse invalid pattern count")?;
1785	// Our start states always start with a single stride of start states
1786	// for the entire automaton which permit it to match any pattern. What
1787	// follows it are an optional set of start states for each pattern.
1788	let start_state_count = bytes::add(
1789	stride,
1790	pattern_table_size,
1791	"sparse invalid 'any' pattern starts size",
1792	)?;
1793	let table_bytes_len = bytes::mul(
1794	start_state_count,
1795	StateID::SIZE,
1796	"sparse pattern table bytes length",
1797	)?;
1798	bytes::check_slice_len(
1799	slice,
1800	table_bytes_len,
1801	"sparse start ID table",
1802	)?;
1803	let table_bytes = &slice[..table_bytes_len];
1804	slice = &slice[table_bytes_len..];
1805
1806	let sl = StartTable { table: table_bytes, stride, patterns };
1807	Ok((sl, slice.as_ptr() as usize - slice_start))
1808	}
1809	}
1810
1811	impl<T: AsRef<[u8]>> StartTable<T> {
1812	fn write_to<E: Endian>(
1813	&self,
1814	mut dst: &mut [u8],
1815	) -> Result<usize, SerializeError> {
1816	let nwrite = self.write_to_len();
1817	if dst.len() < nwrite {
1818	return Err(SerializeError::buffer_too_small(
1819	"sparse starting table ids",
1820	));
1821	}
1822	dst = &mut dst[..nwrite];
1823
1824	// write stride
1825	E::write_u32(u32::try_from(self.stride).unwrap(), dst);
1826	dst = &mut dst[size_of::<u32>()..];
1827	// write pattern count
1828	E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
1829	dst = &mut dst[size_of::<u32>()..];
1830	// write start IDs
1831	dst.copy_from_slice(self.table());
1832	Ok(nwrite)
1833	}
1834
1835	/// Returns the number of bytes the serialized form of this transition
1836	/// table will use.
1837	fn write_to_len(&self) -> usize {
1838	size_of::<u32>() // stride
1839	+ size_of::<u32>() // # patterns
1840	+ self.table().len()
1841	}
1842
1843	/// Validates that every starting state ID in this table is valid.
1844	///
1845	/// That is, every starting state ID can be used to correctly decode a
1846	/// state in the DFA's sparse transitions.
1847	fn validate(
1848	&self,
1849	trans: &Transitions<T>,
1850	) -> Result<(), DeserializeError> {
1851	for (id, _, _) in self.iter() {
1852	let _ = trans.try_state(id)?;
1853	}
1854	Ok(())
1855	}
1856
1857	/// Converts this start list to a borrowed value.
1858	fn as_ref(&self) -> StartTable<&'_ [u8]> {
1859	StartTable {
1860	table: self.table(),
1861	stride: self.stride,
1862	patterns: self.patterns,
1863	}
1864	}
1865
1866	/// Converts this start list to an owned value.
1867	#[cfg(feature = "alloc")]
1868	fn to_owned(&self) -> StartTable<Vec<u8>> {
1869	StartTable {
1870	table: self.table().to_vec(),
1871	stride: self.stride,
1872	patterns: self.patterns,
1873	}
1874	}
1875
1876	/// Return the start state for the given index and pattern ID. If the
1877	/// pattern ID is None, then the corresponding start state for the entire
1878	/// DFA is returned. If the pattern ID is not None, then the corresponding
1879	/// starting state for the given pattern is returned. If this start table
1880	/// does not have individual starting states for each pattern, then this
1881	/// panics.
1882	fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
1883	let start_index = index.as_usize();
1884	let index = match pattern_id {
1885	None => start_index,
1886	Some(pid) => {
1887	let pid = pid.as_usize();
1888	assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
1889	self.stride
1890	.checked_mul(pid)
1891	.unwrap()
1892	.checked_add(self.stride)
1893	.unwrap()
1894	.checked_add(start_index)
1895	.unwrap()
1896	}
1897	};
1898	let start = index * StateID::SIZE;
1899	// This OK since we're allowed to assume that the start table contains
1900	// valid StateIDs.
1901	bytes::read_state_id_unchecked(&self.table()[start..]).0
1902	}
1903
1904	/// Return an iterator over all start IDs in this table.
1905	fn iter(&self) -> StartStateIter<'_, T> {
1906	StartStateIter { st: self, i: `0` }
1907	}
1908
1909	/// Returns the total number of start state IDs in this table.
1910	fn len(&self) -> usize {
1911	self.table().len() / StateID::SIZE
1912	}
1913
1914	/// Returns the table as a raw slice of bytes.
1915	fn table(&self) -> &[u8] {
1916	self.table.as_ref()
1917	}
1918
1919	/// Return the memory usage, in bytes, of this start list.
1920	///
1921	/// This does not include the size of a `StartTable` value itself.
1922	fn memory_usage(&self) -> usize {
1923	self.table().len()
1924	}
1925	}
1926
1927	#[cfg(feature = "alloc")]
1928	impl<T: AsMut<[u8]>> StartTable<T> {
1929	/// Set the start state for the given index and pattern.
1930	///
1931	/// If the pattern ID or state ID are not valid, then this will panic.
1932	fn set_start(
1933	&mut self,
1934	index: Start,
1935	pattern_id: Option<PatternID>,
1936	id: StateID,
1937	) {
1938	let start_index = index.as_usize();
1939	let index = match pattern_id {
1940	None => start_index,
1941	Some(pid) => {
1942	let pid = pid.as_usize();
1943	assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
1944	self.stride
1945	.checked_mul(pid)
1946	.unwrap()
1947	.checked_add(self.stride)
1948	.unwrap()
1949	.checked_add(start_index)
1950	.unwrap()
1951	}
1952	};
1953	let start = index * StateID::SIZE;
1954	let end = start + StateID::SIZE;
1955	bytes::write_state_id::<bytes::NE>(
1956	id,
1957	&mut self.table.as_mut()[start..end],
1958	);
1959	}
1960	}
1961
1962	/// An iterator over all state state IDs in a sparse DFA.
1963	struct StartStateIter<'a, T> {
1964	st: &'a StartTable<T>,
1965	i: usize,
1966	}
1967
1968	impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
1969	type Item = (StateID, Start, Option<PatternID>);
1970
1971	fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
1972	let i = self.i;
1973	if i >= self.st.len() {
1974	return None;
1975	}
1976	self.i += `1`;
1977
1978	// This unwrap is okay since the stride of any DFA must always match
1979	// the number of start state types.
1980	let start_type = Start::from_usize(i % self.st.stride).unwrap();
1981	let pid = if i < self.st.stride {
1982	// This means we don't have start states for each pattern.
1983	None
1984	} else {
1985	// These unwraps are OK since we may assume our table and stride
1986	// is correct.
1987	let pid = i
1988	.checked_sub(self.st.stride)
1989	.unwrap()
1990	.checked_div(self.st.stride)
1991	.unwrap();
1992	Some(PatternID::new(pid).unwrap())
1993	};
1994	let start = i * StateID::SIZE;
1995	let end = start + StateID::SIZE;
1996	let bytes = self.st.table()[start..end].try_into().unwrap();
1997	// This is OK since we're allowed to assume that any IDs in this start
1998	// table are correct and valid for this DFA.
1999	let id = StateID::from_ne_bytes_unchecked(bytes);
2000	Some((id, start_type, pid))
2001	}
2002	}
2003
2004	impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
2005	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2006	f.debug_struct("StartStateIter").field(name:"i", &self.i).finish()
2007	}
2008	}
2009
2010	/// An iterator over all states in a sparse DFA.
2011	///
2012	/// This iterator yields tuples, where the first element is the state ID and
2013	/// the second element is the state itself.
2014	struct StateIter<'a, T> {
2015	trans: &'a Transitions<T>,
2016	id: usize,
2017	}
2018
2019	impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
2020	type Item = State<'a>;
2021
2022	fn next(&mut self) -> Option<State<'a>> {
2023	if self.id >= self.trans.sparse().len() {
2024	return None;
2025	}
2026	let state: State<'_> = self.trans.state(id:StateID::new_unchecked(self.id));
2027	self.id = self.id + state.bytes_len();
2028	Some(state)
2029	}
2030	}
2031
2032	impl<'a, T> fmt::Debug for StateIter<'a, T> {
2033	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2034	f.debug_struct("StateIter").field(name:"id", &self.id).finish()
2035	}
2036	}
2037
2038	/// A representation of a sparse DFA state that can be cheaply materialized
2039	/// from a state identifier.
2040	#[derive(Clone)]
2041	struct State<'a> {
2042	/// The identifier of this state.
2043	id: StateID,
2044	/// Whether this is a match state or not.
2045	is_match: bool,
2046	/// The number of transitions in this state.
2047	ntrans: usize,
2048	/// Pairs of input ranges, where there is one pair for each transition.
2049	/// Each pair specifies an inclusive start and end byte range for the
2050	/// corresponding transition.
2051	input_ranges: &'a [u8],
2052	/// Transitions to the next state. This slice contains native endian
2053	/// encoded state identifiers, with `S` as the representation. Thus, there
2054	/// are `ntrans size_of::<S>()` bytes in this slice.*
2055	next: &'a [u8],
2056	/// If this is a match state, then this contains the pattern IDs that match
2057	/// when the DFA is in this state.
2058	///
2059	/// This is a contiguous sequence of 32-bit native endian encoded integers.
2060	pattern_ids: &'a [u8],
2061	/// An accelerator for this state, if present. If this state has no
2062	/// accelerator, then this is an empty slice. When non-empty, this slice
2063	/// has length at most 3 and corresponds to the exhaustive set of bytes
2064	/// that must be seen in order to transition out of this state.
2065	accel: &'a [u8],
2066	}
2067
2068	impl<'a> State<'a> {
2069	/// Searches for the next transition given an input byte. If no such
2070	/// transition could be found, then a dead state is returned.
2071	///
2072	/// This is marked as inline to help dramatically boost sparse searching,
2073	/// which decodes each state it enters to follow the next transition.
2074	#[inline(always)]
2075	fn next(&self, input: u8) -> StateID {
2076	// This straight linear search was observed to be much better than
2077	// binary search on ASCII haystacks, likely because a binary search
2078	// visits the ASCII case last but a linear search sees it first. A
2079	// binary search does do a little better on non-ASCII haystacks, but
2080	// not by much. There might be a better trade off lurking here.
2081	for i in `0`..(self.ntrans - `1`) {
2082	let (start, end) = self.range(i);
2083	if start <= input && input <= end {
2084	return self.next_at(i);
2085	}
2086	// We could bail early with an extra branch: if input < b1, then
2087	// we know we'll never find a matching transition. Interestingly,
2088	// this extra branch seems to not help performance, or will even
2089	// hurt it. It's likely very dependent on the DFA itself and what
2090	// is being searched.
2091	}
2092	DEAD
2093	}
2094
2095	/// Returns the next state ID for the special EOI transition.
2096	fn next_eoi(&self) -> StateID {
2097	self.next_at(self.ntrans - `1`)
2098	}
2099
2100	/// Returns the identifier for this state.
2101	fn id(&self) -> StateID {
2102	self.id
2103	}
2104
2105	/// Returns the inclusive input byte range for the ith transition in this
2106	/// state.
2107	fn range(&self, i: usize) -> (u8, u8) {
2108	(self.input_ranges[i * `2`], self.input_ranges[i * `2` + `1`])
2109	}
2110
2111	/// Returns the next state for the ith transition in this state.
2112	fn next_at(&self, i: usize) -> StateID {
2113	let start = i * StateID::SIZE;
2114	let end = start + StateID::SIZE;
2115	let bytes = self.next[start..end].try_into().unwrap();
2116	StateID::from_ne_bytes_unchecked(bytes)
2117	}
2118
2119	/// Returns the pattern ID for the given match index. If the match index
2120	/// is invalid, then this panics.
2121	fn pattern_id(&self, match_index: usize) -> PatternID {
2122	let start = match_index * PatternID::SIZE;
2123	bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
2124	}
2125
2126	/// Returns the total number of pattern IDs for this state. This is always
2127	/// zero when `is_match` is false.
2128	fn pattern_count(&self) -> usize {
2129	assert_eq!(`0`, self.pattern_ids.len() % `4`);
2130	self.pattern_ids.len() / `4`
2131	}
2132
2133	/// Return the total number of bytes that this state consumes in its
2134	/// encoded form.
2135	fn bytes_len(&self) -> usize {
2136	let mut len = `2`
2137	+ (self.ntrans * `2`)
2138	+ (self.ntrans * StateID::SIZE)
2139	+ (`1` + self.accel.len());
2140	if self.is_match {
2141	len += size_of::<u32>() + self.pattern_ids.len();
2142	}
2143	len
2144	}
2145
2146	/// Return an accelerator for this state.
2147	fn accelerator(&self) -> &'a [u8] {
2148	self.accel
2149	}
2150	}
2151
2152	impl<'a> fmt::Debug for State<'a> {
2153	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2154	let mut printed = `false`;
2155	for i in `0`..(self.ntrans - `1`) {
2156	let next = self.next_at(i);
2157	if next == DEAD {
2158	continue;
2159	}
2160
2161	if printed {
2162	write!(f, ", ")?;
2163	}
2164	let (start, end) = self.range(i);
2165	if start == end {
2166	write!(f, "{:?} => {:?}", DebugByte(start), next)?;
2167	} else {
2168	write!(
2169	f,
2170	"{:?}-{:?} => {:?}",
2171	DebugByte(start),
2172	DebugByte(end),
2173	next,
2174	)?;
2175	}
2176	printed = `true`;
2177	}
2178	let eoi = self.next_at(self.ntrans - `1`);
2179	if eoi != DEAD {
2180	if printed {
2181	write!(f, ", ")?;
2182	}
2183	write!(f, "EOI => {:?}", eoi)?;
2184	}
2185	Ok(())
2186	}
2187	}
2188
2189	/// A representation of a mutable sparse DFA state that can be cheaply
2190	/// materialized from a state identifier.
2191	#[cfg(feature = "alloc")]
2192	struct StateMut<'a> {
2193	/// The identifier of this state.
2194	id: StateID,
2195	/// Whether this is a match state or not.
2196	is_match: bool,
2197	/// The number of transitions in this state.
2198	ntrans: usize,
2199	/// Pairs of input ranges, where there is one pair for each transition.
2200	/// Each pair specifies an inclusive start and end byte range for the
2201	/// corresponding transition.
2202	input_ranges: &'a mut [u8],
2203	/// Transitions to the next state. This slice contains native endian
2204	/// encoded state identifiers, with `S` as the representation. Thus, there
2205	/// are `ntrans size_of::<S>()` bytes in this slice.*
2206	next: &'a mut [u8],
2207	/// If this is a match state, then this contains the pattern IDs that match
2208	/// when the DFA is in this state.
2209	///
2210	/// This is a contiguous sequence of 32-bit native endian encoded integers.
2211	pattern_ids: &'a [u8],
2212	/// An accelerator for this state, if present. If this state has no
2213	/// accelerator, then this is an empty slice. When non-empty, this slice
2214	/// has length at most 3 and corresponds to the exhaustive set of bytes
2215	/// that must be seen in order to transition out of this state.
2216	accel: &'a mut [u8],
2217	}
2218
2219	#[cfg(feature = "alloc")]
2220	impl<'a> StateMut<'a> {
2221	/// Sets the ith transition to the given state.
2222	fn set_next_at(&mut self, i: usize, next: StateID) {
2223	let start = i * StateID::SIZE;
2224	let end = start + StateID::SIZE;
2225	bytes::write_state_id::<bytes::NE>(next, &mut self.next[start..end]);
2226	}
2227	}
2228
2229	#[cfg(feature = "alloc")]
2230	impl<'a> fmt::Debug for StateMut<'a> {
2231	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2232	let state = State {
2233	id: self.id,
2234	is_match: self.is_match,
2235	ntrans: self.ntrans,
2236	input_ranges: self.input_ranges,
2237	next: self.next,
2238	pattern_ids: self.pattern_ids,
2239	accel: self.accel,
2240	};
2241	fmt::Debug::fmt(&state, f)
2242	}
2243	}
2244
2245	/// A binary search routine specialized specifically to a sparse DFA state's
2246	/// transitions. Specifically, the transitions are defined as a set of pairs
2247	/// of input bytes that delineate an inclusive range of bytes. If the input
2248	/// byte is in the range, then the corresponding transition is a match.
2249	///
2250	/// This binary search accepts a slice of these pairs and returns the position
2251	/// of the matching pair (the ith transition), or None if no matching pair
2252	/// could be found.
2253	///
2254	/// Note that this routine is not currently used since it was observed to
2255	/// either decrease performance when searching ASCII, or did not provide enough
2256	/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
2257	/// for posterity in case we can find a way to use it.
2258	///
2259	/// In theory, we could use the standard library's search routine if we could
2260	/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
2261	/// guaranteed to be safe and is thus UB (since I don't think the in-memory
2262	/// representation of `(u8, u8)` has been nailed down). One could define a
2263	/// repr(C) type, but the casting doesn't seem justified.
2264	#[allow(dead_code)]
2265	#[inline(always)]
2266	fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
2267	debug_assert!(ranges.len() % `2` == `0`, "ranges must have even length");
2268	debug_assert!(ranges.len() <= `512`, "ranges should be short");
2269
2270	let (mut left: usize, mut right: usize) = (`0`, ranges.len() / `2`);
2271	while left < right {
2272	let mid: usize = (left + right) / `2`;
2273	let (b1: u8, b2: u8) = (ranges[mid * `2`], ranges[mid * `2` + `1`]);
2274	if needle < b1 {
2275	right = mid;
2276	} else if needle > b2 {
2277	left = mid + `1`;
2278	} else {
2279	return Some(mid);
2280	}
2281	}
2282	None
2283	}
2284