1 | /*! |
---|---|

2 | This module defines a sparse set data structure. Its most interesting |

3 | properties are: |

4 | |

5 | * They preserve insertion order. |

6 | * Set membership testing is done in constant time. |

7 | * Set insertion is done in constant time. |

8 | * Clearing the set is done in constant time. |

9 | |

10 | The cost for doing this is that the capacity of the set needs to be known up |

11 | front, and the elements in the set are limited to state identifiers. |

12 | |

13 | These sets are principally used when traversing an NFA state graph. This |

14 | happens at search time, for example, in the PikeVM. It also happens during DFA |

15 | determinization. |

16 | */ |

17 | |

18 | use alloc::{vec, vec::Vec}; |

19 | |

20 | use crate::util::primitives::StateID; |

21 | |

22 | /// A pairse of sparse sets. |

23 | /// |

24 | /// This is useful when one needs to compute NFA epsilon closures from a |

25 | /// previous set of states derived from an epsilon closure. One set can be the |

26 | /// starting states where as the other set can be the destination states after |

27 | /// following the transitions for a particular byte of input. |

28 | /// |

29 | /// There is no significance to 'set1' or 'set2'. They are both sparse sets of |

30 | /// the same size. |

31 | /// |

32 | /// The members of this struct are exposed so that callers may borrow 'set1' |

33 | /// and 'set2' individually without being force to borrow both at the same |

34 | /// time. |

35 | #[derive(Clone, Debug)] |

36 | pub(crate) struct SparseSets { |

37 | pub(crate) set1: SparseSet, |

38 | pub(crate) set2: SparseSet, |

39 | } |

40 | |

41 | impl SparseSets { |

42 | /// Create a new pair of sparse sets where each set has the given capacity. |

43 | /// |

44 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |

45 | pub(crate) fn new(capacity: usize) -> SparseSets { |

46 | SparseSets { |

47 | set1: SparseSet::new(capacity), |

48 | set2: SparseSet::new(capacity), |

49 | } |

50 | } |

51 | |

52 | /// Resizes these sparse sets to have the new capacity given. |

53 | /// |

54 | /// The sets are automatically cleared. |

55 | /// |

56 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |

57 | #[inline] |

58 | pub(crate) fn resize(&mut self, new_capacity: usize) { |

59 | self.set1.resize(new_capacity); |

60 | self.set2.resize(new_capacity); |

61 | } |

62 | |

63 | /// Clear both sparse sets. |

64 | pub(crate) fn clear(&mut self) { |

65 | self.set1.clear(); |

66 | self.set2.clear(); |

67 | } |

68 | |

69 | /// Swap set1 with set2. |

70 | pub(crate) fn swap(&mut self) { |

71 | core::mem::swap(&mut self.set1, &mut self.set2); |

72 | } |

73 | |

74 | /// Returns the memory usage, in bytes, used by this pair of sparse sets. |

75 | pub(crate) fn memory_usage(&self) -> usize { |

76 | self.set1.memory_usage() + self.set2.memory_usage() |

77 | } |

78 | } |

79 | |

80 | /// A sparse set used for representing ordered NFA states. |

81 | /// |

82 | /// This supports constant time addition and membership testing. Clearing an |

83 | /// entire set can also be done in constant time. Iteration yields elements |

84 | /// in the order in which they were inserted. |

85 | /// |

86 | /// The data structure is based on: https://research.swtch.com/sparse |

87 | /// Note though that we don't actually use uninitialized memory. We generally |

88 | /// reuse sparse sets, so the initial allocation cost is bareable. However, its |

89 | /// other properties listed above are extremely useful. |

90 | #[derive(Clone)] |

91 | pub(crate) struct SparseSet { |

92 | /// The number of elements currently in this set. |

93 | len: usize, |

94 | /// Dense contains the ids in the order in which they were inserted. |

95 | dense: Vec<StateID>, |

96 | /// Sparse maps ids to their location in dense. |

97 | /// |

98 | /// A state ID is in the set if and only if |

99 | /// sparse[id] < len && id == dense[sparse[id]]. |

100 | /// |

101 | /// Note that these are indices into 'dense'. It's a little weird to use |

102 | /// StateID here, but we know our length can never exceed the bounds of |

103 | /// StateID (enforced by 'resize') and StateID will be at most 4 bytes |

104 | /// where as a usize is likely double that in most cases. |

105 | sparse: Vec<StateID>, |

106 | } |

107 | |

108 | impl SparseSet { |

109 | /// Create a new sparse set with the given capacity. |

110 | /// |

111 | /// Sparse sets have a fixed size and they cannot grow. Attempting to |

112 | /// insert more distinct elements than the total capacity of the set will |

113 | /// result in a panic. |

114 | /// |

115 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |

116 | #[inline] |

117 | pub(crate) fn new(capacity: usize) -> SparseSet { |

118 | let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; |

119 | set.resize(capacity); |

120 | set |

121 | } |

122 | |

123 | /// Resizes this sparse set to have the new capacity given. |

124 | /// |

125 | /// This set is automatically cleared. |

126 | /// |

127 | /// This panics if the capacity given is bigger than `StateID::LIMIT`. |

128 | #[inline] |

129 | pub(crate) fn resize(&mut self, new_capacity: usize) { |

130 | assert!( |

131 | new_capacity <= StateID::LIMIT, |

132 | "sparse set capacity cannot excced {:?}", |

133 | StateID::LIMIT |

134 | ); |

135 | self.clear(); |

136 | self.dense.resize(new_capacity, StateID::ZERO); |

137 | self.sparse.resize(new_capacity, StateID::ZERO); |

138 | } |

139 | |

140 | /// Returns the capacity of this set. |

141 | /// |

142 | /// The capacity represents a fixed limit on the number of distinct |

143 | /// elements that are allowed in this set. The capacity cannot be changed. |

144 | #[inline] |

145 | pub(crate) fn capacity(&self) -> usize { |

146 | self.dense.len() |

147 | } |

148 | |

149 | /// Returns the number of elements in this set. |

150 | #[inline] |

151 | pub(crate) fn len(&self) -> usize { |

152 | self.len |

153 | } |

154 | |

155 | /// Returns true if and only if this set is empty. |

156 | #[inline] |

157 | pub(crate) fn is_empty(&self) -> bool { |

158 | self.len() == 0 |

159 | } |

160 | |

161 | /// Insert the state ID value into this set and return true if the given |

162 | /// state ID was not previously in this set. |

163 | /// |

164 | /// This operation is idempotent. If the given value is already in this |

165 | /// set, then this is a no-op. |

166 | /// |

167 | /// If more than `capacity` ids are inserted, then this panics. |

168 | /// |

169 | /// This is marked as inline(always) since the compiler won't inline it |

170 | /// otherwise, and it's a fairly hot piece of code in DFA determinization. |

171 | #[cfg_attr(feature = "perf-inline", inline(always))] |

172 | pub(crate) fn insert(&mut self, id: StateID) -> bool { |

173 | if self.contains(id) { |

174 | return false; |

175 | } |

176 | |

177 | let i = self.len(); |

178 | assert!( |

179 | i < self.capacity(), |

180 | "{:?} exceeds capacity of {:?} when inserting {:?}", |

181 | i, |

182 | self.capacity(), |

183 | id, |

184 | ); |

185 | // OK since i < self.capacity() and self.capacity() is guaranteed to |

186 | // be <= StateID::LIMIT. |

187 | let index = StateID::new_unchecked(i); |

188 | self.dense[index] = id; |

189 | self.sparse[id] = index; |

190 | self.len += 1; |

191 | true |

192 | } |

193 | |

194 | /// Returns true if and only if this set contains the given value. |

195 | #[inline] |

196 | pub(crate) fn contains(&self, id: StateID) -> bool { |

197 | let index = self.sparse[id]; |

198 | index.as_usize() < self.len() && self.dense[index] == id |

199 | } |

200 | |

201 | /// Clear this set such that it has no members. |

202 | #[inline] |

203 | pub(crate) fn clear(&mut self) { |

204 | self.len = 0; |

205 | } |

206 | |

207 | #[inline] |

208 | pub(crate) fn iter(&self) -> SparseSetIter<'_> { |

209 | SparseSetIter(self.dense[..self.len()].iter()) |

210 | } |

211 | |

212 | /// Returns the heap memory usage, in bytes, used by this sparse set. |

213 | #[inline] |

214 | pub(crate) fn memory_usage(&self) -> usize { |

215 | self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE |

216 | } |

217 | } |

218 | |

219 | impl core::fmt::Debug for SparseSet { |

220 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |

221 | let elements: Vec<StateID> = self.iter().collect(); |

222 | f.debug_tuple("SparseSet").field(&elements).finish() |

223 | } |

224 | } |

225 | |

226 | /// An iterator over all elements in a sparse set. |

227 | /// |

228 | /// The lifetime `'a` refers to the lifetime of the set being iterated over. |

229 | #[derive(Debug)] |

230 | pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); |

231 | |

232 | impl<'a> Iterator for SparseSetIter<'a> { |

233 | type Item = StateID; |

234 | |

235 | #[cfg_attr(feature = "perf-inline", inline(always))] |

236 | fn next(&mut self) -> Option<StateID> { |

237 | self.0.next().map(|&id| id) |

238 | } |

239 | } |

240 |