| 1 | /*! |
| 2 | This module defines a sparse set data structure. Its most interesting |
| 3 | properties are: |
| 4 | |
| 5 | * They preserve insertion order. |
| 6 | * Set membership testing is done in constant time. |
| 7 | * Set insertion is done in constant time. |
| 8 | * Clearing the set is done in constant time. |
| 9 | |
| 10 | The cost for doing this is that the capacity of the set needs to be known up |
| 11 | front, and the elements in the set are limited to state identifiers. |
| 12 | |
| 13 | These sets are principally used when traversing an NFA state graph. This |
| 14 | happens at search time, for example, in the PikeVM. It also happens during DFA |
| 15 | determinization. |
| 16 | */ |
| 17 | |
| 18 | use alloc::{vec, vec::Vec}; |
| 19 | |
| 20 | use crate::util::primitives::StateID; |
| 21 | |
| 22 | /// A pairse of sparse sets. |
| 23 | /// |
| 24 | /// This is useful when one needs to compute NFA epsilon closures from a |
| 25 | /// previous set of states derived from an epsilon closure. One set can be the |
| 26 | /// starting states where as the other set can be the destination states after |
| 27 | /// following the transitions for a particular byte of input. |
| 28 | /// |
| 29 | /// There is no significance to 'set1' or 'set2'. They are both sparse sets of |
| 30 | /// the same size. |
| 31 | /// |
| 32 | /// The members of this struct are exposed so that callers may borrow 'set1' |
| 33 | /// and 'set2' individually without being force to borrow both at the same |
| 34 | /// time. |
| 35 | #[derive (Clone, Debug)] |
| 36 | pub(crate) struct SparseSets { |
| 37 | pub(crate) set1: SparseSet, |
| 38 | pub(crate) set2: SparseSet, |
| 39 | } |
| 40 | |
| 41 | impl SparseSets { |
| 42 | /// Create a new pair of sparse sets where each set has the given capacity. |
| 43 | /// |
| 44 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| 45 | pub(crate) fn new(capacity: usize) -> SparseSets { |
| 46 | SparseSets { |
| 47 | set1: SparseSet::new(capacity), |
| 48 | set2: SparseSet::new(capacity), |
| 49 | } |
| 50 | } |
| 51 | |
| 52 | /// Resizes these sparse sets to have the new capacity given. |
| 53 | /// |
| 54 | /// The sets are automatically cleared. |
| 55 | /// |
| 56 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| 57 | #[inline ] |
| 58 | pub(crate) fn resize(&mut self, new_capacity: usize) { |
| 59 | self.set1.resize(new_capacity); |
| 60 | self.set2.resize(new_capacity); |
| 61 | } |
| 62 | |
| 63 | /// Clear both sparse sets. |
| 64 | pub(crate) fn clear(&mut self) { |
| 65 | self.set1.clear(); |
| 66 | self.set2.clear(); |
| 67 | } |
| 68 | |
| 69 | /// Swap set1 with set2. |
| 70 | pub(crate) fn swap(&mut self) { |
| 71 | core::mem::swap(&mut self.set1, &mut self.set2); |
| 72 | } |
| 73 | |
| 74 | /// Returns the memory usage, in bytes, used by this pair of sparse sets. |
| 75 | pub(crate) fn memory_usage(&self) -> usize { |
| 76 | self.set1.memory_usage() + self.set2.memory_usage() |
| 77 | } |
| 78 | } |
| 79 | |
| 80 | /// A sparse set used for representing ordered NFA states. |
| 81 | /// |
| 82 | /// This supports constant time addition and membership testing. Clearing an |
| 83 | /// entire set can also be done in constant time. Iteration yields elements |
| 84 | /// in the order in which they were inserted. |
| 85 | /// |
| 86 | /// The data structure is based on: https://research.swtch.com/sparse |
| 87 | /// Note though that we don't actually use uninitialized memory. We generally |
| 88 | /// reuse sparse sets, so the initial allocation cost is bareable. However, its |
| 89 | /// other properties listed above are extremely useful. |
| 90 | #[derive (Clone)] |
| 91 | pub(crate) struct SparseSet { |
| 92 | /// The number of elements currently in this set. |
| 93 | len: usize, |
| 94 | /// Dense contains the ids in the order in which they were inserted. |
| 95 | dense: Vec<StateID>, |
| 96 | /// Sparse maps ids to their location in dense. |
| 97 | /// |
| 98 | /// A state ID is in the set if and only if |
| 99 | /// sparse[id] < len && id == dense[sparse[id]]. |
| 100 | /// |
| 101 | /// Note that these are indices into 'dense'. It's a little weird to use |
| 102 | /// StateID here, but we know our length can never exceed the bounds of |
| 103 | /// StateID (enforced by 'resize') and StateID will be at most 4 bytes |
| 104 | /// where as a usize is likely double that in most cases. |
| 105 | sparse: Vec<StateID>, |
| 106 | } |
| 107 | |
| 108 | impl SparseSet { |
| 109 | /// Create a new sparse set with the given capacity. |
| 110 | /// |
| 111 | /// Sparse sets have a fixed size and they cannot grow. Attempting to |
| 112 | /// insert more distinct elements than the total capacity of the set will |
| 113 | /// result in a panic. |
| 114 | /// |
| 115 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| 116 | #[inline ] |
| 117 | pub(crate) fn new(capacity: usize) -> SparseSet { |
| 118 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |
| 119 | set.resize(capacity); |
| 120 | set |
| 121 | } |
| 122 | |
| 123 | /// Resizes this sparse set to have the new capacity given. |
| 124 | /// |
| 125 | /// This set is automatically cleared. |
| 126 | /// |
| 127 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |
| 128 | #[inline ] |
| 129 | pub(crate) fn resize(&mut self, new_capacity: usize) { |
| 130 | assert!( |
| 131 | new_capacity <= StateID::LIMIT, |
| 132 | "sparse set capacity cannot excced {:?}" , |
| 133 | StateID::LIMIT |
| 134 | ); |
| 135 | self.clear(); |
| 136 | self.dense.resize(new_capacity, StateID::ZERO); |
| 137 | self.sparse.resize(new_capacity, StateID::ZERO); |
| 138 | } |
| 139 | |
| 140 | /// Returns the capacity of this set. |
| 141 | /// |
| 142 | /// The capacity represents a fixed limit on the number of distinct |
| 143 | /// elements that are allowed in this set. The capacity cannot be changed. |
| 144 | #[inline ] |
| 145 | pub(crate) fn capacity(&self) -> usize { |
| 146 | self.dense.len() |
| 147 | } |
| 148 | |
| 149 | /// Returns the number of elements in this set. |
| 150 | #[inline ] |
| 151 | pub(crate) fn len(&self) -> usize { |
| 152 | self.len |
| 153 | } |
| 154 | |
| 155 | /// Returns true if and only if this set is empty. |
| 156 | #[inline ] |
| 157 | pub(crate) fn is_empty(&self) -> bool { |
| 158 | self.len() == 0 |
| 159 | } |
| 160 | |
| 161 | /// Insert the state ID value into this set and return true if the given |
| 162 | /// state ID was not previously in this set. |
| 163 | /// |
| 164 | /// This operation is idempotent. If the given value is already in this |
| 165 | /// set, then this is a no-op. |
| 166 | /// |
| 167 | /// If more than `capacity` ids are inserted, then this panics. |
| 168 | /// |
| 169 | /// This is marked as inline(always) since the compiler won't inline it |
| 170 | /// otherwise, and it's a fairly hot piece of code in DFA determinization. |
| 171 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 172 | pub(crate) fn insert(&mut self, id: StateID) -> bool { |
| 173 | if self.contains(id) { |
| 174 | return false; |
| 175 | } |
| 176 | |
| 177 | let i = self.len(); |
| 178 | assert!( |
| 179 | i < self.capacity(), |
| 180 | " {:?} exceeds capacity of {:?} when inserting {:?}" , |
| 181 | i, |
| 182 | self.capacity(), |
| 183 | id, |
| 184 | ); |
| 185 | // OK since i < self.capacity() and self.capacity() is guaranteed to |
| 186 | // be <= StateID::LIMIT. |
| 187 | let index = StateID::new_unchecked(i); |
| 188 | self.dense[index] = id; |
| 189 | self.sparse[id] = index; |
| 190 | self.len += 1; |
| 191 | true |
| 192 | } |
| 193 | |
| 194 | /// Returns true if and only if this set contains the given value. |
| 195 | #[inline ] |
| 196 | pub(crate) fn contains(&self, id: StateID) -> bool { |
| 197 | let index = self.sparse[id]; |
| 198 | index.as_usize() < self.len() && self.dense[index] == id |
| 199 | } |
| 200 | |
| 201 | /// Clear this set such that it has no members. |
| 202 | #[inline ] |
| 203 | pub(crate) fn clear(&mut self) { |
| 204 | self.len = 0; |
| 205 | } |
| 206 | |
| 207 | #[inline ] |
| 208 | pub(crate) fn iter(&self) -> SparseSetIter<'_> { |
| 209 | SparseSetIter(self.dense[..self.len()].iter()) |
| 210 | } |
| 211 | |
| 212 | /// Returns the heap memory usage, in bytes, used by this sparse set. |
| 213 | #[inline ] |
| 214 | pub(crate) fn memory_usage(&self) -> usize { |
| 215 | self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE |
| 216 | } |
| 217 | } |
| 218 | |
| 219 | impl core::fmt::Debug for SparseSet { |
| 220 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 221 | let elements: Vec<StateID> = self.iter().collect(); |
| 222 | f.debug_tuple(name:"SparseSet" ).field(&elements).finish() |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | /// An iterator over all elements in a sparse set. |
| 227 | /// |
| 228 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. |
| 229 | #[derive (Debug)] |
| 230 | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); |
| 231 | |
| 232 | impl<'a> Iterator for SparseSetIter<'a> { |
| 233 | type Item = StateID; |
| 234 | |
| 235 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 236 | fn next(&mut self) -> Option<StateID> { |
| 237 | self.0.next().map(|&id: StateID| id) |
| 238 | } |
| 239 | } |
| 240 | |