1 | /*! |
2 | This module defines a sparse set data structure. Its most interesting |
3 | properties are: |
4 | |
5 | * They preserve insertion order. |
6 | * Set membership testing is done in constant time. |
7 | * Set insertion is done in constant time. |
8 | * Clearing the set is done in constant time. |
9 | |
10 | The cost for doing this is that the capacity of the set needs to be known up |
11 | front, and the elements in the set are limited to state identifiers. |
12 | |
13 | These sets are principally used when traversing an NFA state graph. This |
14 | happens at search time, for example, in the PikeVM. It also happens during DFA |
15 | determinization. |
16 | */ |
17 | |
18 | use alloc::{vec, vec::Vec}; |
19 | |
20 | use crate::util::primitives::StateID; |
21 | |
22 | /// A pairse of sparse sets. |
23 | /// |
24 | /// This is useful when one needs to compute NFA epsilon closures from a |
25 | /// previous set of states derived from an epsilon closure. One set can be the |
26 | /// starting states where as the other set can be the destination states after |
27 | /// following the transitions for a particular byte of input. |
28 | /// |
29 | /// There is no significance to 'set1' or 'set2'. They are both sparse sets of |
30 | /// the same size. |
31 | /// |
32 | /// The members of this struct are exposed so that callers may borrow 'set1' |
33 | /// and 'set2' individually without being force to borrow both at the same |
34 | /// time. |
35 | #[derive(Clone, Debug)] |
36 | pub(crate) struct SparseSets { |
37 | pub(crate) set1: SparseSet, |
38 | pub(crate) set2: SparseSet, |
39 | } |
40 | |
41 | impl SparseSets { |
42 | /// Create a new pair of sparse sets where each set has the given capacity. |
43 | /// |
44 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
45 | pub(crate) fn new(capacity: usize) -> SparseSets { |
46 | SparseSets { |
47 | set1: SparseSet::new(capacity), |
48 | set2: SparseSet::new(capacity), |
49 | } |
50 | } |
51 | |
52 | /// Resizes these sparse sets to have the new capacity given. |
53 | /// |
54 | /// The sets are automatically cleared. |
55 | /// |
56 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
57 | #[inline ] |
58 | pub(crate) fn resize(&mut self, new_capacity: usize) { |
59 | self.set1.resize(new_capacity); |
60 | self.set2.resize(new_capacity); |
61 | } |
62 | |
63 | /// Clear both sparse sets. |
64 | pub(crate) fn clear(&mut self) { |
65 | self.set1.clear(); |
66 | self.set2.clear(); |
67 | } |
68 | |
69 | /// Swap set1 with set2. |
70 | pub(crate) fn swap(&mut self) { |
71 | core::mem::swap(&mut self.set1, &mut self.set2); |
72 | } |
73 | |
74 | /// Returns the memory usage, in bytes, used by this pair of sparse sets. |
75 | pub(crate) fn memory_usage(&self) -> usize { |
76 | self.set1.memory_usage() + self.set2.memory_usage() |
77 | } |
78 | } |
79 | |
80 | /// A sparse set used for representing ordered NFA states. |
81 | /// |
82 | /// This supports constant time addition and membership testing. Clearing an |
83 | /// entire set can also be done in constant time. Iteration yields elements |
84 | /// in the order in which they were inserted. |
85 | /// |
86 | /// The data structure is based on: https://research.swtch.com/sparse |
87 | /// Note though that we don't actually use uninitialized memory. We generally |
88 | /// reuse sparse sets, so the initial allocation cost is bareable. However, its |
89 | /// other properties listed above are extremely useful. |
90 | #[derive(Clone)] |
91 | pub(crate) struct SparseSet { |
92 | /// The number of elements currently in this set. |
93 | len: usize, |
94 | /// Dense contains the ids in the order in which they were inserted. |
95 | dense: Vec<StateID>, |
96 | /// Sparse maps ids to their location in dense. |
97 | /// |
98 | /// A state ID is in the set if and only if |
99 | /// sparse[id] < len && id == dense[sparse[id]]. |
100 | /// |
101 | /// Note that these are indices into 'dense'. It's a little weird to use |
102 | /// StateID here, but we know our length can never exceed the bounds of |
103 | /// StateID (enforced by 'resize') and StateID will be at most 4 bytes |
104 | /// where as a usize is likely double that in most cases. |
105 | sparse: Vec<StateID>, |
106 | } |
107 | |
108 | impl SparseSet { |
109 | /// Create a new sparse set with the given capacity. |
110 | /// |
111 | /// Sparse sets have a fixed size and they cannot grow. Attempting to |
112 | /// insert more distinct elements than the total capacity of the set will |
113 | /// result in a panic. |
114 | /// |
115 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
116 | #[inline ] |
117 | pub(crate) fn new(capacity: usize) -> SparseSet { |
118 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |
119 | set.resize(capacity); |
120 | set |
121 | } |
122 | |
123 | /// Resizes this sparse set to have the new capacity given. |
124 | /// |
125 | /// This set is automatically cleared. |
126 | /// |
127 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
128 | #[inline ] |
129 | pub(crate) fn resize(&mut self, new_capacity: usize) { |
130 | assert!( |
131 | new_capacity <= StateID::LIMIT, |
132 | "sparse set capacity cannot excced {:?}" , |
133 | StateID::LIMIT |
134 | ); |
135 | self.clear(); |
136 | self.dense.resize(new_capacity, StateID::ZERO); |
137 | self.sparse.resize(new_capacity, StateID::ZERO); |
138 | } |
139 | |
140 | /// Returns the capacity of this set. |
141 | /// |
142 | /// The capacity represents a fixed limit on the number of distinct |
143 | /// elements that are allowed in this set. The capacity cannot be changed. |
144 | #[inline ] |
145 | pub(crate) fn capacity(&self) -> usize { |
146 | self.dense.len() |
147 | } |
148 | |
149 | /// Returns the number of elements in this set. |
150 | #[inline ] |
151 | pub(crate) fn len(&self) -> usize { |
152 | self.len |
153 | } |
154 | |
155 | /// Returns true if and only if this set is empty. |
156 | #[inline ] |
157 | pub(crate) fn is_empty(&self) -> bool { |
158 | self.len() == 0 |
159 | } |
160 | |
161 | /// Insert the state ID value into this set and return true if the given |
162 | /// state ID was not previously in this set. |
163 | /// |
164 | /// This operation is idempotent. If the given value is already in this |
165 | /// set, then this is a no-op. |
166 | /// |
167 | /// If more than `capacity` ids are inserted, then this panics. |
168 | /// |
169 | /// This is marked as inline(always) since the compiler won't inline it |
170 | /// otherwise, and it's a fairly hot piece of code in DFA determinization. |
171 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
172 | pub(crate) fn insert(&mut self, id: StateID) -> bool { |
173 | if self.contains(id) { |
174 | return false; |
175 | } |
176 | |
177 | let i = self.len(); |
178 | assert!( |
179 | i < self.capacity(), |
180 | "{:?} exceeds capacity of {:?} when inserting {:?}" , |
181 | i, |
182 | self.capacity(), |
183 | id, |
184 | ); |
185 | // OK since i < self.capacity() and self.capacity() is guaranteed to |
186 | // be <= StateID::LIMIT. |
187 | let index = StateID::new_unchecked(i); |
188 | self.dense[index] = id; |
189 | self.sparse[id] = index; |
190 | self.len += 1; |
191 | true |
192 | } |
193 | |
194 | /// Returns true if and only if this set contains the given value. |
195 | #[inline ] |
196 | pub(crate) fn contains(&self, id: StateID) -> bool { |
197 | let index = self.sparse[id]; |
198 | index.as_usize() < self.len() && self.dense[index] == id |
199 | } |
200 | |
201 | /// Clear this set such that it has no members. |
202 | #[inline ] |
203 | pub(crate) fn clear(&mut self) { |
204 | self.len = 0; |
205 | } |
206 | |
207 | #[inline ] |
208 | pub(crate) fn iter(&self) -> SparseSetIter<'_> { |
209 | SparseSetIter(self.dense[..self.len()].iter()) |
210 | } |
211 | |
212 | /// Returns the heap memory usage, in bytes, used by this sparse set. |
213 | #[inline ] |
214 | pub(crate) fn memory_usage(&self) -> usize { |
215 | self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE |
216 | } |
217 | } |
218 | |
219 | impl core::fmt::Debug for SparseSet { |
220 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
221 | let elements: Vec<StateID> = self.iter().collect(); |
222 | f.debug_tuple("SparseSet" ).field(&elements).finish() |
223 | } |
224 | } |
225 | |
226 | /// An iterator over all elements in a sparse set. |
227 | /// |
228 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. |
229 | #[derive(Debug)] |
230 | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); |
231 | |
232 | impl<'a> Iterator for SparseSetIter<'a> { |
233 | type Item = StateID; |
234 | |
235 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
236 | fn next(&mut self) -> Option<StateID> { |
237 | self.0.next().map(|&id| id) |
238 | } |
239 | } |
240 | |