remapper.rs source code [crates/aho_corasick/src/util/remapper.rs]

1	use alloc::vec::Vec;
2
3	use crate::{nfa::noncontiguous, util::primitives::StateID};
4
5	/// Remappable is a tightly coupled abstraction that facilitates remapping
6	/// state identifiers in DFAs.
7	///
8	/// The main idea behind remapping state IDs is that DFAs often need to check
9	/// if a certain state is a "special" state of some kind (like a match state)
10	/// during a search. Since this is extremely perf critical code, we want this
11	/// check to be as fast as possible. Partitioning state IDs into, for example,
12	/// into "non-match" and "match" states means one can tell if a state is a
13	/// match state via a simple comparison of the state ID.
14	///
15	/// The issue is that during the DFA construction process, it's not
16	/// particularly easy to partition the states. Instead, the simplest thing is
17	/// to often just do a pass over all of the states and shuffle them into their
18	/// desired partitionings. To do that, we need a mechanism for swapping states.
19	/// Hence, this abstraction.
20	///
21	/// Normally, for such little code, I would just duplicate it. But this is a
22	/// key optimization and the implementation is a bit subtle. So the abstraction
23	/// is basically a ham-fisted attempt at DRY. The only place we use this is in
24	/// the dense and one-pass DFAs.
25	///
26	/// See also src/dfa/special.rs for a more detailed explanation of how dense
27	/// DFAs are partitioned.
28	pub(crate) trait Remappable: core::fmt::Debug {
29	/// Return the total number of states.
30	fn state_len(&self) -> usize;
31
32	/// Swap the states pointed to by the given IDs. The underlying finite
33	/// state machine should be mutated such that all of the transitions in
34	/// `id1` are now in the memory region where the transitions for `id2`
35	/// were, and all of the transitions in `id2` are now in the memory region
36	/// where the transitions for `id1` were.
37	///
38	/// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
39	///
40	/// It is expected that, after calling this, the underlying state machine
41	/// will be left in an inconsistent state, since any other transitions
42	/// pointing to, e.g., `id1` need to be updated to point to `id2`, since
43	/// that's where `id1` moved to.
44	///
45	/// In order to "fix" the underlying inconsistent state, a `Remapper`
46	/// should be used to guarantee that `remap` is called at the appropriate
47	/// time.
48	fn swap_states(&mut self, id1: StateID, id2: StateID);
49
50	/// This must remap every single state ID in the underlying value according
51	/// to the function given. For example, in a DFA, this should remap every
52	/// transition and every starting state ID.
53	fn remap(&mut self, map: impl Fn(StateID) -> StateID);
54	}
55
56	/// Remapper is an abstraction the manages the remapping of state IDs in a
57	/// finite state machine. This is useful when one wants to shuffle states into
58	/// different positions in the machine.
59	///
60	/// One of the key complexities this manages is the ability to correctly move
61	/// one state multiple times.
62	///
63	/// Once shuffling is complete, `remap` must be called, which will rewrite
64	/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
65	/// will almost certainly result in a corrupt machine.
66	#[derive(Debug)]
67	pub(crate) struct Remapper {
68	/// A map from the index of a state to its pre-multiplied identifier.
69	///
70	/// When a state is swapped with another, then their corresponding
71	/// locations in this map are also swapped. Thus, its new position will
72	/// still point to its old pre-multiplied StateID.
73	///
74	/// While there is a bit more to it, this then allows us to rewrite the
75	/// state IDs in a DFA's transition table in a single pass. This is done
76	/// by iterating over every ID in this map, then iterating over each
77	/// transition for the state at that ID and re-mapping the transition from
78	/// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
79	/// in this map where `old_id` started, and set it to where it ended up
80	/// after all swaps have been completed.
81	map: Vec<StateID>,
82	/// A way to map indices to state IDs (and back).
83	idx: IndexMapper,
84	}
85
86	impl Remapper {
87	/// Create a new remapper from the given remappable implementation. The
88	/// remapper can then be used to swap states. The remappable value given
89	/// here must the same one given to `swap` and `remap`.
90	///
91	/// The given stride should be the stride of the transition table expressed
92	/// as a power of 2. This stride is used to map between state IDs and state
93	/// indices. If state IDs and state indices are equivalent, then provide
94	/// a `stride2` of `0`, which acts as an identity.
95	pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper {
96	let idx = IndexMapper { stride2 };
97	let map = (`0`..r.state_len()).map(\|i\| idx.to_state_id(i)).collect();
98	Remapper { map, idx }
99	}
100
101	/// Swap two states. Once this is called, callers must follow through to
102	/// call `remap`, or else it's possible for the underlying remappable
103	/// value to be in a corrupt state.
104	pub(crate) fn swap(
105	&mut self,
106	r: &mut impl Remappable,
107	id1: StateID,
108	id2: StateID,
109	) {
110	if id1 == id2 {
111	return;
112	}
113	r.swap_states(id1, id2);
114	self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2));
115	}
116
117	/// Complete the remapping process by rewriting all state IDs in the
118	/// remappable value according to the swaps performed.
119	pub(crate) fn remap(mut self, r: &mut impl Remappable) {
120	// Update the map to account for states that have been swapped
121	// multiple times. For example, if (A, C) and (C, G) are swapped, then
122	// transitions previously pointing to A should now point to G. But if
123	// we don't update our map, they will erroneously be set to C. All we
124	// do is follow the swaps in our map until we see our original state
125	// ID.
126	//
127	// The intuition here is to think about how changes are made to the
128	// map: only through pairwise swaps. That means that starting at any
129	// given state, it is always possible to find the loop back to that
130	// state by following the swaps represented in the map (which might be
131	// 0 swaps).
132	//
133	// We are also careful to clone the map before starting in order to
134	// freeze it. We use the frozen map to find our loops, since we need to
135	// update our map as well. Without freezing it, our updates could break
136	// the loops referenced above and produce incorrect results.
137	let oldmap = self.map.clone();
138	for i in `0`..r.state_len() {
139	let cur_id = self.idx.to_state_id(i);
140	let mut new_id = oldmap[i];
141	if cur_id == new_id {
142	continue;
143	}
144	loop {
145	let id = oldmap[self.idx.to_index(new_id)];
146	if cur_id == id {
147	self.map[i] = new_id;
148	break;
149	}
150	new_id = id;
151	}
152	}
153	r.remap(\|sid\| self.map[self.idx.to_index(sid)]);
154	}
155	}
156
157	/// A simple type for mapping between state indices and state IDs.
158	///
159	/// The reason why this exists is because state IDs are "premultiplied" in a
160	/// DFA. That is, in order to get to the transitions for a particular state,
161	/// one need only use the state ID as-is, instead of having to multiply it by
162	/// transition table's stride.
163	///
164	/// The downside of this is that it's inconvenient to map between state IDs
165	/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
166	/// `0`, `stride`, `2stride`, `3stride`, etc., instead of `0`, `1`, `2`, `3`,
167	/// etc.
168	///
169	/// Since our state IDs are premultiplied, we can convert back-and-forth
170	/// between IDs and indices by simply unmultiplying the IDs and multiplying the
171	/// indices.
172	///
173	/// Note that for a sparse NFA, state IDs and indices are equivalent. In this
174	/// case, we set the stride of the index mapped to be `0`, which acts as an
175	/// identity.
176	#[derive(Debug)]
177	struct IndexMapper {
178	/// The power of 2 corresponding to the stride of the corresponding
179	/// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
180	/// stride2' pre-multiplies an index to an ID.
181	stride2: usize,
182	}
183
184	impl IndexMapper {
185	/// Convert a state ID to a state index.
186	fn to_index(&self, id: StateID) -> usize {
187	id.as_usize() >> self.stride2
188	}
189
190	/// Convert a state index to a state ID.
191	fn to_state_id(&self, index: usize) -> StateID {
192	// CORRECTNESS: If the given index is not valid, then it is not
193	// required for this to panic or return a valid state ID. We'll "just"
194	// wind up with panics or silent logic errors at some other point. But
195	// this is OK because if Remappable::state_len is correct and so is
196	// 'to_index', then all inputs to 'to_state_id' should be valid indices
197	// and thus transform into valid state IDs.
198	StateID::new_unchecked(index << self.stride2)
199	}
200	}
201
202	impl Remappable for noncontiguous::NFA {
203	fn state_len(&self) -> usize {
204	noncontiguous::NFA::states(self).len()
205	}
206
207	fn swap_states(&mut self, id1: StateID, id2: StateID) {
208	noncontiguous::NFA::swap_states(self, id1, id2)
209	}
210
211	fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
212	noncontiguous::NFA::remap(self, map)
213	}
214	}
215