state.rs source code [crates/regex-automata-0.4.9/src/util/determinize/state.rs]

1	/!*
2	This module defines a DFA state representation and builders for constructing
3	DFA states.
4
5	This representation is specifically for use in implementations of NFA-to-DFA
6	conversion via powerset construction. (Also called "determinization" in this
7	crate.)
8
9	The term "DFA state" is somewhat overloaded in this crate. In some cases, it
10	refers to the set of transitions over an alphabet for a particular state. In
11	other cases, it refers to a set of NFA states. The former is really about the
12	final representation of a state in a DFA's transition table, where as the
13	latter---what this module is focused on---is closer to an intermediate form
14	that is used to help eventually build the transition table.
15
16	This module exports four types. All four types represent the same idea: an
17	ordered set of NFA states. This ordered set represents the epsilon closure of a
18	particular NFA state, where the "epsilon closure" is the set of NFA states that
19	can be transitioned to without consuming any input. i.e., Follow all of the NFA
20	state's epsilon transitions. In addition, this implementation of DFA states
21	cares about two other things: the ordered set of pattern IDs corresponding
22	to the patterns that match if the state is a match state, and the set of
23	look-behind assertions that were true when the state was created.
24
25	The first, `State`, is a frozen representation of a state that cannot be
26	modified. It may be cheaply cloned without copying the state itself and can be
27	accessed safely from multiple threads simultaneously. This type is useful for
28	when one knows that the DFA state being constructed is distinct from any other
29	previously constructed states. Namely, powerset construction, in practice,
30	requires one to keep a cache of previously created DFA states. Otherwise,
31	the number of DFA states created in memory balloons to an impractically
32	large number. For this reason, equivalent states should endeavor to have an
33	equivalent byte-level representation. (In general, "equivalency" here means,
34	"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
35	full DFA minimization be implemented here. This form of equivalency is only
36	surface deep and is more-or-less a practical necessity.)
37
38	The other three types represent different phases in the construction of a
39	DFA state. Internally, these three types (and `State`) all use the same
40	byte-oriented representation. That means one can use any of the builder types
41	to check whether the state it represents already exists or not. If it does,
42	then there is no need to freeze it into a `State` (which requires an alloc and
43	a copy). Here are the three types described succinctly:
44
45	* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
46	and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
47	`StateBuilderEmpty` can only be used to query its underlying memory capacity,
48	or to convert into a builder for recording pattern IDs and/or assertions.
49
50	* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
51	or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
52	can only be used for adding pattern IDs and recording assertions.
53
54	* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
55	more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
56	can only be used for adding NFA state IDs and recording some assertions.
57
58	The expected flow here is to use the above builders to construct a candidate
59	DFA state to check if it already exists. If it does, then there's no need to
60	freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
61	can be called to freeze the builder into an immutable `State`. In either
62	case, `clear` should be called on the builder to turn it back into a
63	`StateBuilderEmpty` that reuses the underlying memory.
64
65	The main purpose for splitting the builder into these distinct types is to
66	make it impossible to do things like adding a pattern ID after adding an NFA
67	state ID. Namely, this makes it simpler to use a space-and-time efficient
68	binary representation for the state. (The format is documented on the `Repr`
69	type below.) If we just used one type for everything, it would be possible for
70	callers to use an incorrect interleaving of calls and thus result in a corrupt
71	representation. I chose to use more type machinery to make this impossible to
72	do because 1) determinization is itself pretty complex and it wouldn't be too
73	hard to foul this up and 2) there isn't too much machinery involved and it's
74	well contained.
75
76	As an optimization, sometimes states won't have certain things set. For
77	example, if the underlying NFA has no word boundary assertions, then there is
78	no reason to set a state's look-behind assertion as to whether it was generated
79	from a word byte or not. Similarly, if a state has no NFA states corresponding
80	to look-around assertions, then there is no reason to set `look_have` to a
81	non-empty set. Finally, callers usually omit unconditional epsilon transitions
82	when adding NFA state IDs since they aren't discriminatory.
83
84	Finally, the binary representation used by these states is, thankfully, not
85	serialized anywhere. So any kind of change can be made with reckless abandon,
86	as long as everything in this module agrees.
87	*/
88
89	use core::mem;
90
91	use alloc::{sync::Arc, vec::Vec};
92
93	use crate::util::{
94	int::{I32, U32},
95	look::LookSet,
96	primitives::{PatternID, StateID},
97	wire::{self, Endian},
98	};
99
100	/// A DFA state that, at its core, is represented by an ordered set of NFA
101	/// states.
102	///
103	/// This type is intended to be used only in NFA-to-DFA conversion via powerset
104	/// construction.
105	///
106	/// It may be cheaply cloned and accessed safely from multiple threads
107	/// simultaneously.
108	#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
109	pub(crate) struct State(Arc<[u8]>);
110
111	/// This Borrow impl permits us to lookup any state in a map by its byte
112	/// representation. This is particularly convenient when one has a StateBuilder
113	/// and we want to see if a correspondingly equivalent state already exists. If
114	/// one does exist, then we can reuse the allocation required by StateBuilder
115	/// without having to convert it into a State first.
116	impl core::borrow::Borrow<[u8]> for State {
117	fn borrow(&self) -> &[u8] {
118	&*self.0
119	}
120	}
121
122	impl core::fmt::Debug for State {
123	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
124	f.debug_tuple(name:"State").field(&self.repr()).finish()
125	}
126	}
127
128	/// For docs on these routines, see the internal Repr and ReprVec types below.
129	impl State {
130	pub(crate) fn dead() -> State {
131	StateBuilderEmpty::new().into_matches().into_nfa().to_state()
132	}
133
134	pub(crate) fn is_match(&self) -> bool {
135	self.repr().is_match()
136	}
137
138	pub(crate) fn is_from_word(&self) -> bool {
139	self.repr().is_from_word()
140	}
141
142	pub(crate) fn is_half_crlf(&self) -> bool {
143	self.repr().is_half_crlf()
144	}
145
146	pub(crate) fn look_have(&self) -> LookSet {
147	self.repr().look_have()
148	}
149
150	pub(crate) fn look_need(&self) -> LookSet {
151	self.repr().look_need()
152	}
153
154	pub(crate) fn match_len(&self) -> usize {
155	self.repr().match_len()
156	}
157
158	pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
159	self.repr().match_pattern(index)
160	}
161
162	pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
163	self.repr().match_pattern_ids()
164	}
165
166	#[cfg(all(test, not(miri)))]
167	pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
168	self.repr().iter_match_pattern_ids(f)
169	}
170
171	pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) {
172	self.repr().iter_nfa_state_ids(f)
173	}
174
175	pub(crate) fn memory_usage(&self) -> usize {
176	self.0.len()
177	}
178
179	fn repr(&self) -> Repr<'_> {
180	Repr(&*self.0)
181	}
182	}
183
184	/// A state builder that represents an empty state.
185	///
186	/// This is a useful "initial condition" for state construction. It has no
187	/// NFA state IDs, no assertions set and no pattern IDs. No allocations are
188	/// made when new() is called. Its main use is for being converted into a
189	/// builder that can capture assertions and pattern IDs.
190	#[derive(Clone, Debug)]
191	pub(crate) struct StateBuilderEmpty(Vec<u8>);
192
193	/// For docs on these routines, see the internal Repr and ReprVec types below.
194	impl StateBuilderEmpty {
195	pub(crate) fn new() -> StateBuilderEmpty {
196	StateBuilderEmpty(alloc::vec![])
197	}
198
199	pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
200	self.0.extend_from_slice(&[`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`]);
201	StateBuilderMatches(self.0)
202	}
203
204	fn clear(&mut self) {
205	self.0.clear();
206	}
207
208	pub(crate) fn capacity(&self) -> usize {
209	self.0.capacity()
210	}
211	}
212
213	/// A state builder that collects assertions and pattern IDs.
214	///
215	/// When collecting pattern IDs is finished, this can be converted into a
216	/// builder that collects NFA state IDs.
217	#[derive(Clone)]
218	pub(crate) struct StateBuilderMatches(Vec<u8>);
219
220	impl core::fmt::Debug for StateBuilderMatches {
221	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
222	f.debug_tuple(name:"StateBuilderMatches").field(&self.repr()).finish()
223	}
224	}
225
226	/// For docs on these routines, see the internal Repr and ReprVec types below.
227	impl StateBuilderMatches {
228	pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
229	self.repr_vec().close_match_pattern_ids();
230	StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
231	}
232
233	pub(crate) fn set_is_from_word(&mut self) {
234	self.repr_vec().set_is_from_word()
235	}
236
237	pub(crate) fn set_is_half_crlf(&mut self) {
238	self.repr_vec().set_is_half_crlf()
239	}
240
241	pub(crate) fn look_have(&self) -> LookSet {
242	LookSet::read_repr(&self.0[`1`..])
243	}
244
245	pub(crate) fn set_look_have(
246	&mut self,
247	set: impl FnMut(LookSet) -> LookSet,
248	) {
249	self.repr_vec().set_look_have(set)
250	}
251
252	pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
253	self.repr_vec().add_match_pattern_id(pid)
254	}
255
256	fn repr(&self) -> Repr<'_> {
257	Repr(&self.0)
258	}
259
260	fn repr_vec(&mut self) -> ReprVec<'_> {
261	ReprVec(&mut self.0)
262	}
263	}
264
265	/// A state builder that collects some assertions and NFA state IDs.
266	///
267	/// When collecting NFA state IDs is finished, this can be used to build a
268	/// `State` if necessary.
269	///
270	/// When dont with building a state (regardless of whether it got kept or not),
271	/// it's usually a good idea to call `clear` to get an empty builder back so
272	/// that it can be reused to build the next state.
273	#[derive(Clone)]
274	pub(crate) struct StateBuilderNFA {
275	repr: Vec<u8>,
276	prev_nfa_state_id: StateID,
277	}
278
279	impl core::fmt::Debug for StateBuilderNFA {
280	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
281	f.debug_tuple(name:"StateBuilderNFA").field(&self.repr()).finish()
282	}
283	}
284
285	/// For docs on these routines, see the internal Repr and ReprVec types below.
286	impl StateBuilderNFA {
287	pub(crate) fn to_state(&self) -> State {
288	State(Arc::from(&*self.repr))
289	}
290
291	pub(crate) fn clear(self) -> StateBuilderEmpty {
292	let mut builder = StateBuilderEmpty(self.repr);
293	builder.clear();
294	builder
295	}
296
297	pub(crate) fn look_need(&self) -> LookSet {
298	self.repr().look_need()
299	}
300
301	pub(crate) fn set_look_have(
302	&mut self,
303	set: impl FnMut(LookSet) -> LookSet,
304	) {
305	self.repr_vec().set_look_have(set)
306	}
307
308	pub(crate) fn set_look_need(
309	&mut self,
310	set: impl FnMut(LookSet) -> LookSet,
311	) {
312	self.repr_vec().set_look_need(set)
313	}
314
315	pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
316	ReprVec(&mut self.repr)
317	.add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
318	}
319
320	pub(crate) fn as_bytes(&self) -> &[u8] {
321	&self.repr
322	}
323
324	fn repr(&self) -> Repr<'_> {
325	Repr(&self.repr)
326	}
327
328	fn repr_vec(&mut self) -> ReprVec<'_> {
329	ReprVec(&mut self.repr)
330	}
331	}
332
333	/// Repr is a read-only view into the representation of a DFA state.
334	///
335	/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
336	/// in one place, and then use a Repr to implement the various methods on the
337	/// public state types.
338	///
339	/// The format is as follows:
340	///
341	/// The first three bytes correspond to bitsets.
342	///
343	/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
344	/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
345	/// if the state has pattern IDs explicitly written to it. (This is a flag that
346	/// is not meant to be set by determinization, but rather, is used as part of
347	/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
348	/// generated by a transition over a "word" byte. (Callers may not always set
349	/// this. For example, if the NFA has no word boundary assertion, then needing
350	/// to track whether a state came from a word byte or not is superfluous and
351	/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition
352	/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is
353	/// enabled.
354	///
355	/// Bytes 1..5 correspond to the look-behind assertions that were satisfied
356	/// by the transition that created this state. (Look-ahead assertions are not
357	/// tracked as part of states. Instead, these are applied by re-computing the
358	/// epsilon closure of a state when computing the transition function. See
359	/// `next` in the parent module.)
360	///
361	/// Bytes 5..9 correspond to the set of look-around assertions (including both
362	/// look-behind and look-ahead) that appear somewhere in this state's set of
363	/// NFA state IDs. This is used to determine whether this state's epsilon
364	/// closure should be re-computed when computing the transition function.
365	/// Namely, look-around assertions are "just" conditional epsilon transitions,
366	/// so if there are new assertions available when computing the transition
367	/// function, we should only re-compute the epsilon closure if those new
368	/// assertions are relevant to this particular state.
369	///
370	/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer
371	/// corresponding to the number of patterns encoded in this state. If the state
372	/// is not a match state (byte 0 bit 0 is 0) or if it's only pattern ID is
373	/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
374	/// offset 3 is the position at which the first NFA state ID is encoded.
375	///
376	/// For a match state with at least one non-ZERO pattern ID, the next bytes
377	/// correspond to a sequence of 32-bit native endian encoded integers that
378	/// represent each pattern ID, in order, that this match state represents.
379	///
380	/// After the pattern IDs (if any), NFA state IDs are delta encoded as
381	/// varints.[1] The first NFA state ID is encoded as itself, and each
382	/// subsequent NFA state ID is encoded as the difference between itself and the
383	/// previous NFA state ID.
384	///
385	/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
386	struct Repr<'a>(&'a [u8]);
387
388	impl<'a> Repr<'a> {
389	/// Returns true if and only if this is a match state.
390	///
391	/// If callers have added pattern IDs to this state, then callers MUST set
392	/// this state as a match state explicitly. However, as a special case,
393	/// states that are marked as match states but with no pattern IDs, then
394	/// the state is treated as if it had a single pattern ID equivalent to
395	/// PatternID::ZERO.
396	fn is_match(&self) -> bool {
397	self.0[`0`] & (`1` << `0`) > `0`
398	}
399
400	/// Returns true if and only if this state has had at least one pattern
401	/// ID added to it.
402	///
403	/// This is an internal-only flag that permits the representation to save
404	/// space in the common case of an NFA with one pattern in it. In that
405	/// case, a match state can only ever have exactly one pattern ID:
406	/// PatternID::ZERO. So there's no need to represent it.
407	fn has_pattern_ids(&self) -> bool {
408	self.0[`0`] & (`1` << `1`) > `0`
409	}
410
411	/// Returns true if and only if this state is marked as having been created
412	/// from a transition over a word byte. This is useful for checking whether
413	/// a word boundary assertion is true or not, which requires look-behind
414	/// (whether the current state came from a word byte or not) and look-ahead
415	/// (whether the transition byte is a word byte or not).
416	///
417	/// Since states with this set are distinct from states that don't have
418	/// this set (even if they are otherwise equivalent), callers should not
419	/// set this assertion unless the underlying NFA has at least one word
420	/// boundary assertion somewhere. Otherwise, a superfluous number of states
421	/// may be created.
422	fn is_from_word(&self) -> bool {
423	self.0[`0`] & (`1` << `2`) > `0`
424	}
425
426	/// Returns true if and only if this state is marked as being inside of a
427	/// CRLF terminator. In the forward direction, this means the state was
428	/// created after seeing a `\r`. In the reverse direction, this means the
429	/// state was created after seeing a `\n`.
430	fn is_half_crlf(&self) -> bool {
431	self.0[`0`] & (`1` << `3`) > `0`
432	}
433
434	/// The set of look-behind assertions that were true in the transition that
435	/// created this state.
436	///
437	/// Generally, this should be empty if 'look_need' is empty, since there is
438	/// no reason to track which look-behind assertions are true if the state
439	/// has no conditional epsilon transitions.
440	///
441	/// Satisfied look-ahead assertions are not tracked in states. Instead,
442	/// these are re-computed on demand via epsilon closure when computing the
443	/// transition function.
444	fn look_have(&self) -> LookSet {
445	LookSet::read_repr(&self.0[`1`..])
446	}
447
448	/// The set of look-around (both behind and ahead) assertions that appear
449	/// at least once in this state's set of NFA states.
450	///
451	/// This is used to determine whether the epsilon closure needs to be
452	/// re-computed when computing the transition function. Namely, if the
453	/// state has no conditional epsilon transitions, then there is no need
454	/// to re-compute the epsilon closure.
455	fn look_need(&self) -> LookSet {
456	LookSet::read_repr(&self.0[`5`..])
457	}
458
459	/// Returns the total number of match pattern IDs in this state.
460	///
461	/// If this state is not a match state, then this always returns 0.
462	fn match_len(&self) -> usize {
463	if !self.is_match() {
464	return `0`;
465	} else if !self.has_pattern_ids() {
466	`1`
467	} else {
468	self.encoded_pattern_len()
469	}
470	}
471
472	/// Returns the pattern ID for this match state at the given index.
473	///
474	/// If the given index is greater than or equal to `match_len()` for this
475	/// state, then this could panic or return incorrect results.
476	fn match_pattern(&self, index: usize) -> PatternID {
477	if !self.has_pattern_ids() {
478	PatternID::ZERO
479	} else {
480	let offset = `13` + index * PatternID::SIZE;
481	// This is OK since we only ever serialize valid PatternIDs to
482	// states.
483	wire::read_pattern_id_unchecked(&self.0[offset..]).0
484	}
485	}
486
487	/// Returns a copy of all match pattern IDs in this state. If this state
488	/// is not a match state, then this returns None.
489	fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
490	if !self.is_match() {
491	return None;
492	}
493	let mut pids = alloc::vec![];
494	self.iter_match_pattern_ids(\|pid\| pids.push(pid));
495	Some(pids)
496	}
497
498	/// Calls the given function on every pattern ID in this state.
499	fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
500	if !self.is_match() {
501	return;
502	}
503	// As an optimization for a very common case, when this is a match
504	// state for an NFA with only one pattern, we don't actually write the
505	// pattern ID to the state representation. Instead, we know it must
506	// be there since it is the only possible choice.
507	if !self.has_pattern_ids() {
508	f(PatternID::ZERO);
509	return;
510	}
511	let mut pids = &self.0[`13`..self.pattern_offset_end()];
512	while !pids.is_empty() {
513	let pid = wire::read_u32(pids);
514	pids = &pids[PatternID::SIZE..];
515	// This is OK since we only ever serialize valid PatternIDs to
516	// states. And since pattern IDs can never exceed a usize, the
517	// unwrap is OK.
518	f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
519	}
520	}
521
522	/// Calls the given function on every NFA state ID in this state.
523	fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
524	let mut sids = &self.0[self.pattern_offset_end()..];
525	let mut prev = `0i32`;
526	while !sids.is_empty() {
527	let (delta, nr) = read_vari32(sids);
528	sids = &sids[nr..];
529	let sid = prev + delta;
530	prev = sid;
531	// This is OK since we only ever serialize valid StateIDs to
532	// states. And since state IDs can never exceed an isize, they must
533	// always be able to fit into a usize, and thus cast is OK.
534	f(StateID::new_unchecked(sid.as_usize()))
535	}
536	}
537
538	/// Returns the offset into this state's representation where the pattern
539	/// IDs end and the NFA state IDs begin.
540	fn pattern_offset_end(&self) -> usize {
541	let encoded = self.encoded_pattern_len();
542	if encoded == `0` {
543	return `9`;
544	}
545	// This arithmetic is OK since we were able to address this many bytes
546	// when writing to the state, thus, it must fit into a usize.
547	encoded.checked_mul(`4`).unwrap().checked_add(`13`).unwrap()
548	}
549
550	/// Returns the total number of encoded* pattern IDs in this state.*
551	///
552	/// This may return 0 even when this is a match state, since the pattern
553	/// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
554	/// the match state (the overwhelming common case).
555	fn encoded_pattern_len(&self) -> usize {
556	if !self.has_pattern_ids() {
557	return `0`;
558	}
559	// This unwrap is OK since the total number of patterns is always
560	// guaranteed to fit into a usize.
561	usize::try_from(wire::read_u32(&self.0[`9`..`13`])).unwrap()
562	}
563	}
564
565	impl<'a> core::fmt::Debug for Repr<'a> {
566	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
567	let mut nfa_ids: Vec = alloc::vec![];
568	self.iter_nfa_state_ids(\|sid: StateID\| nfa_ids.push(sid));
569	f&mut DebugStruct<'_, '_>.debug_struct("Repr")
570	.field("is_match", &self.is_match())
571	.field("is_from_word", &self.is_from_word())
572	.field("is_half_crlf", &self.is_half_crlf())
573	.field("look_have", &self.look_have())
574	.field("look_need", &self.look_need())
575	.field("match_pattern_ids", &self.match_pattern_ids())
576	.field(name:"nfa_state_ids", &nfa_ids)
577	.finish()
578	}
579	}
580
581	/// ReprVec is a write-only view into the representation of a DFA state.
582	///
583	/// See Repr for more details on the purpose of this type and also the format.
584	///
585	/// Note that not all possible combinations of methods may be called. This is
586	/// precisely what the various StateBuilder types encapsulate: they only
587	/// permit valid combinations via Rust's linear typing.
588	struct ReprVec<'a>(&'a mut Vec<u8>);
589
590	impl<'a> ReprVec<'a> {
591	/// Set this state as a match state.
592	///
593	/// This should not be exposed explicitly outside of this module. It is
594	/// set automatically when a pattern ID is added.
595	fn set_is_match(&mut self) {
596	self.0[`0`] \|= `1` << `0`;
597	}
598
599	/// Set that this state has pattern IDs explicitly written to it.
600	///
601	/// This should not be exposed explicitly outside of this module. This is
602	/// used internally as a space saving optimization. Namely, if the state
603	/// is a match state but does not have any pattern IDs written to it,
604	/// then it is automatically inferred to have a pattern ID of ZERO.
605	fn set_has_pattern_ids(&mut self) {
606	self.0[`0`] \|= `1` << `1`;
607	}
608
609	/// Set this state as being built from a transition over a word byte.
610	///
611	/// Setting this is only necessary when one needs to deal with word
612	/// boundary assertions. Therefore, if the underlying NFA has no word
613	/// boundary assertions, callers should not set this.
614	fn set_is_from_word(&mut self) {
615	self.0[`0`] \|= `1` << `2`;
616	}
617
618	/// Set this state as having seen half of a CRLF terminator.
619	///
620	/// In the forward direction, this should be set when a `\r` has been seen.
621	/// In the reverse direction, this should be set when a `\n` has been seen.
622	fn set_is_half_crlf(&mut self) {
623	self.0[`0`] \|= `1` << `3`;
624	}
625
626	/// The set of look-behind assertions that were true in the transition that
627	/// created this state.
628	fn look_have(&self) -> LookSet {
629	self.repr().look_have()
630	}
631
632	/// The set of look-around (both behind and ahead) assertions that appear
633	/// at least once in this state's set of NFA states.
634	fn look_need(&self) -> LookSet {
635	self.repr().look_need()
636	}
637
638	/// Mutate the set of look-behind assertions that were true in the
639	/// transition that created this state.
640	fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
641	set(self.look_have()).write_repr(&mut self.0[`1`..]);
642	}
643
644	/// Mutate the set of look-around (both behind and ahead) assertions that
645	/// appear at least once in this state's set of NFA states.
646	fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
647	set(self.look_need()).write_repr(&mut self.0[`5`..]);
648	}
649
650	/// Add a pattern ID to this state. All match states must have at least
651	/// one pattern ID associated with it.
652	///
653	/// Callers must never add duplicative pattern IDs.
654	///
655	/// The order in which patterns are added must correspond to the order
656	/// in which patterns are reported as matches.
657	fn add_match_pattern_id(&mut self, pid: PatternID) {
658	// As a (somewhat small) space saving optimization, in the case where
659	// a matching state has exactly one pattern ID, PatternID::ZERO, we do
660	// not write either the pattern ID or the number of patterns encoded.
661	// Instead, all we do is set the 'is_match' bit on this state. Overall,
662	// this saves 8 bytes per match state for the overwhelming majority of
663	// match states.
664	//
665	// In order to know whether pattern IDs need to be explicitly read or
666	// not, we use another internal-only bit, 'has_pattern_ids', to
667	// indicate whether they have been explicitly written or not.
668	if !self.repr().has_pattern_ids() {
669	if pid == PatternID::ZERO {
670	self.set_is_match();
671	return;
672	}
673	// Make room for 'close_match_pattern_ids' to write the total
674	// number of pattern IDs written.
675	self.0.extend(core::iter::repeat(`0`).take(PatternID::SIZE));
676	self.set_has_pattern_ids();
677	// If this was already a match state, then the only way that's
678	// possible when the state doesn't have pattern IDs is if
679	// PatternID::ZERO was added by the caller previously. In this
680	// case, we are now adding a non-ZERO pattern ID after it, in
681	// which case, we want to make sure to represent ZERO explicitly
682	// now.
683	if self.repr().is_match() {
684	write_u32(self.0, `0`)
685	} else {
686	// Otherwise, just make sure the 'is_match' bit is set.
687	self.set_is_match();
688	}
689	}
690	write_u32(self.0, pid.as_u32());
691	}
692
693	/// Indicate that no more pattern IDs will be added to this state.
694	///
695	/// Once this is called, callers must not call it or 'add_match_pattern_id'
696	/// again.
697	///
698	/// This should not be exposed explicitly outside of this module. It
699	/// should be called only when converting a StateBuilderMatches into a
700	/// StateBuilderNFA.
701	fn close_match_pattern_ids(&mut self) {
702	// If we never wrote any pattern IDs, then there's nothing to do here.
703	if !self.repr().has_pattern_ids() {
704	return;
705	}
706	let patsize = PatternID::SIZE;
707	let pattern_bytes = self.0.len() - `13`;
708	// Every pattern ID uses 4 bytes, so number of bytes should be
709	// divisible by 4.
710	assert_eq!(pattern_bytes % patsize, `0`);
711	// This unwrap is OK since we are guaranteed that the maximum number
712	// of possible patterns fits into a u32.
713	let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
714	wire::NE::write_u32(count32, &mut self.0[`9`..`13`]);
715	}
716
717	/// Add an NFA state ID to this state. The order in which NFA states are
718	/// added matters. It is the caller's responsibility to ensure that
719	/// duplicate NFA state IDs are not added.
720	fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
721	let delta = sid.as_i32() - prev.as_i32();
722	write_vari32(self.0, delta);
723	*prev = sid;
724	}
725
726	/// Return a read-only view of this state's representation.
727	fn repr(&self) -> Repr<'_> {
728	Repr(self.0.as_slice())
729	}
730	}
731
732	/// Write a signed 32-bit integer using zig-zag encoding.
733	///
734	/// https://developers.google.com/protocol-buffers/docs/encoding#varints
735	fn write_vari32(data: &mut Vec<u8>, n: i32) {
736	let mut un: u32 = n.to_bits() << `1`;
737	if n < `0` {
738	un = !un;
739	}
740	write_varu32(data, n:un)
741	}
742
743	/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
744	/// number of bytes read.
745	///
746	/// https://developers.google.com/protocol-buffers/docs/encoding#varints
747	fn read_vari32(data: &[u8]) -> (i32, usize) {
748	let (un: u32, i: usize) = read_varu32(data);
749	let mut n: i32 = i32::from_bits(un >> `1`);
750	if un & `1` != `0` {
751	n = !n;
752	}
753	(n, i)
754	}
755
756	/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
757	/// as a sequence of bytes where all bytes except for the last one have the
758	/// most significant bit set. The least significant 7 bits correspond to the
759	/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
760	/// very common cases, it uses fewer than 4.
761	///
762	/// https://developers.google.com/protocol-buffers/docs/encoding#varints
763	fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
764	while n >= `0b1000_0000` {
765	data.push(n.low_u8() \| `0b1000_0000`);
766	n >>= `7`;
767	}
768	data.push(n.low_u8());
769	}
770
771	/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
772	///
773	/// https://developers.google.com/protocol-buffers/docs/encoding#varints
774	fn read_varu32(data: &[u8]) -> (u32, usize) {
775	// N.B. We can assume correctness here since we know that all varuints are
776	// written with write_varu32. Hence, the 'as' uses and unchecked arithmetic
777	// is all okay.
778	let mut n: u32 = `0`;
779	let mut shift: u32 = `0`;
780	for (i: usize, &b: u8) in data.iter().enumerate() {
781	if b < `0b1000_0000` {
782	return (n \| (u32::from(b) << shift), i + `1`);
783	}
784	n \|= (u32::from(b) & `0b0111_1111`) << shift;
785	shift += `7`;
786	}
787	(`0`, `0`)
788	}
789
790	/// Push a native-endian encoded `n` on to `dst`.
791	fn write_u32(dst: &mut Vec<u8>, n: u32) {
792	use crate::util::wire::NE;
793
794	let start: usize = dst.len();
795	dst.extend(iter:core::iter::repeat(elt:`0`).take(mem::size_of::<u32>()));
796	NE::write_u32(n, &mut dst[start..]);
797	}
798
799	#[cfg(test)]
800	mod tests {
801	use alloc::vec;
802
803	use quickcheck::quickcheck;
804
805	use super::*;
806
807	#[cfg(not(miri))]
808	quickcheck! {
809	fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
810	// Builders states do not permit duplicate IDs.
811	let sids = dedup_state_ids(sids);
812
813	let mut b = StateBuilderEmpty::new().into_matches().into_nfa();
814	for &sid in &sids {
815	b.add_nfa_state_id(sid);
816	}
817	let s = b.to_state();
818	let mut got = vec![];
819	s.iter_nfa_state_ids(\|sid\| got.push(sid));
820	got == sids
821	}
822
823	fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
824	// Builders states do not permit duplicate IDs.
825	let pids = dedup_pattern_ids(pids);
826
827	let mut b = StateBuilderEmpty::new().into_matches();
828	for &pid in &pids {
829	b.add_match_pattern_id(pid);
830	}
831	let s = b.into_nfa().to_state();
832	let mut got = vec![];
833	s.iter_match_pattern_ids(\|pid\| got.push(pid));
834	got == pids
835	}
836
837	fn prop_state_read_write_nfa_state_and_pattern_ids(
838	sids: Vec<StateID>,
839	pids: Vec<PatternID>
840	) -> bool {
841	// Builders states do not permit duplicate IDs.
842	let sids = dedup_state_ids(sids);
843	let pids = dedup_pattern_ids(pids);
844
845	let mut b = StateBuilderEmpty::new().into_matches();
846	for &pid in &pids {
847	b.add_match_pattern_id(pid);
848	}
849
850	let mut b = b.into_nfa();
851	for &sid in &sids {
852	b.add_nfa_state_id(sid);
853	}
854
855	let s = b.to_state();
856	let mut got_pids = vec![];
857	s.iter_match_pattern_ids(\|pid\| got_pids.push(pid));
858	let mut got_sids = vec![];
859	s.iter_nfa_state_ids(\|sid\| got_sids.push(sid));
860	got_pids == pids && got_sids == sids
861	}
862	}
863
864	quickcheck! {
865	fn prop_read_write_varu32(n: u32) -> bool {
866	let mut buf = vec![];
867	write_varu32(&mut buf, n);
868	let (got, nread) = read_varu32(&buf);
869	nread == buf.len() && got == n
870	}
871
872	fn prop_read_write_vari32(n: i32) -> bool {
873	let mut buf = vec![];
874	write_vari32(&mut buf, n);
875	let (got, nread) = read_vari32(&buf);
876	nread == buf.len() && got == n
877	}
878	}
879
880	#[cfg(not(miri))]
881	fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
882	let mut set = alloc::collections::BTreeSet::new();
883	let mut deduped = vec![];
884	for sid in sids {
885	if set.contains(&sid) {
886	continue;
887	}
888	set.insert(sid);
889	deduped.push(sid);
890	}
891	deduped
892	}
893
894	#[cfg(not(miri))]
895	fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
896	let mut set = alloc::collections::BTreeSet::new();
897	let mut deduped = vec![];
898	for pid in pids {
899	if set.contains(&pid) {
900	continue;
901	}
902	set.insert(pid);
903	deduped.push(pid);
904	}
905	deduped
906	}
907	}
908