1use core::fmt;
2
3/// A representation of byte oriented equivalence classes.
4///
5/// This is used in a DFA to reduce the size of the transition table. This can
6/// have a particularly large impact not only on the total size of a dense DFA,
7/// but also on compile times.
8#[derive(Clone, Copy)]
9pub struct ByteClasses([u8; 256]);
10
11impl ByteClasses {
12 /// Creates a new set of equivalence classes where all bytes are mapped to
13 /// the same class.
14 pub fn empty() -> ByteClasses {
15 ByteClasses([0; 256])
16 }
17
18 /// Creates a new set of equivalence classes where each byte belongs to
19 /// its own equivalence class.
20 pub fn singletons() -> ByteClasses {
21 let mut classes = ByteClasses::empty();
22 for i in 0..256 {
23 classes.set(i as u8, i as u8);
24 }
25 classes
26 }
27
28 /// Copies the byte classes given. The given slice must have length 0 or
29 /// length 256. Slices of length 0 are treated as singletons (every byte
30 /// is its own class).
31 pub fn from_slice(slice: &[u8]) -> ByteClasses {
32 assert!(slice.is_empty() || slice.len() == 256);
33
34 if slice.is_empty() {
35 ByteClasses::singletons()
36 } else {
37 let mut classes = ByteClasses::empty();
38 for (b, &class) in slice.iter().enumerate() {
39 classes.set(b as u8, class);
40 }
41 classes
42 }
43 }
44
45 /// Set the equivalence class for the given byte.
46 #[inline]
47 pub fn set(&mut self, byte: u8, class: u8) {
48 self.0[byte as usize] = class;
49 }
50
51 /// Get the equivalence class for the given byte.
52 #[inline]
53 pub fn get(&self, byte: u8) -> u8 {
54 self.0[byte as usize]
55 }
56
57 /// Get the equivalence class for the given byte while forcefully
58 /// eliding bounds checks.
59 #[inline]
60 pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
61 *self.0.get_unchecked(byte as usize)
62 }
63
64 /// Return the total number of elements in the alphabet represented by
65 /// these equivalence classes. Equivalently, this returns the total number
66 /// of equivalence classes.
67 #[inline]
68 pub fn alphabet_len(&self) -> usize {
69 self.0[255] as usize + 1
70 }
71
72 /// Returns true if and only if every byte in this class maps to its own
73 /// equivalence class. Equivalently, there are 256 equivalence classes
74 /// and each class contains exactly one byte.
75 #[inline]
76 pub fn is_singleton(&self) -> bool {
77 self.alphabet_len() == 256
78 }
79
80 /// Returns an iterator over a sequence of representative bytes from each
81 /// equivalence class. Namely, this yields exactly N items, where N is
82 /// equivalent to the number of equivalence classes. Each item is an
83 /// arbitrary byte drawn from each equivalence class.
84 ///
85 /// This is useful when one is determinizing an NFA and the NFA's alphabet
86 /// hasn't been converted to equivalence classes yet. Picking an arbitrary
87 /// byte from each equivalence class then permits a full exploration of
88 /// the NFA instead of using every possible byte value.
89 #[cfg(feature = "std")]
90 pub fn representatives(&self) -> ByteClassRepresentatives {
91 ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
92 }
93
94 /// Returns all of the bytes in the given equivalence class.
95 ///
96 /// The second element in the tuple indicates the number of elements in
97 /// the array.
98 fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
99 let (mut array, mut len) = ([0; 256], 0);
100 for b in 0..256 {
101 if self.get(b as u8) == equiv {
102 array[len] = b as u8;
103 len += 1;
104 }
105 }
106 (array, len)
107 }
108}
109
110impl fmt::Debug for ByteClasses {
111 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
112 if self.is_singleton() {
113 write!(f, "ByteClasses({{singletons}})")
114 } else {
115 write!(f, "ByteClasses(")?;
116 for equiv in 0..self.alphabet_len() {
117 let (members, len) = self.elements(equiv as u8);
118 write!(f, "{} => {:?}", equiv, &members[..len])?;
119 }
120 write!(f, ")")
121 }
122 }
123}
124
125/// An iterator over representative bytes from each equivalence class.
126#[cfg(feature = "std")]
127#[derive(Debug)]
128pub struct ByteClassRepresentatives<'a> {
129 classes: &'a ByteClasses,
130 byte: usize,
131 last_class: Option<u8>,
132}
133
134#[cfg(feature = "std")]
135impl<'a> Iterator for ByteClassRepresentatives<'a> {
136 type Item = u8;
137
138 fn next(&mut self) -> Option<u8> {
139 while self.byte < 256 {
140 let byte = self.byte as u8;
141 let class = self.classes.get(byte);
142 self.byte += 1;
143
144 if self.last_class != Some(class) {
145 self.last_class = Some(class);
146 return Some(byte);
147 }
148 }
149 None
150 }
151}
152
153/// A byte class set keeps track of an *approximation* of equivalence classes
154/// of bytes during NFA construction. That is, every byte in an equivalence
155/// class cannot discriminate between a match and a non-match.
156///
157/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
158/// same equivalence class because it never matters whether an `a` or a `b` is
159/// seen, and no combination of `a`s and `b`s in the text can discriminate
160/// a match.
161///
162/// Note though that this does not compute the minimal set of equivalence
163/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
164/// same equivalence class for the same reason that `a` and `b` are in the
165/// same equivalence class in the aforementioned regex. However, in this
166/// implementation, `a` and `c` are put into distinct equivalence classes.
167/// The reason for this is implementation complexity. In the future, we should
168/// endeavor to compute the minimal equivalence classes since they can have a
169/// rather large impact on the size of the DFA.
170///
171/// The representation here is 256 booleans, all initially set to false. Each
172/// boolean maps to its corresponding byte based on position. A `true` value
173/// indicates the end of an equivalence class, where its corresponding byte
174/// and all of the bytes corresponding to all previous contiguous `false`
175/// values are in the same equivalence class.
176///
177/// This particular representation only permits contiguous ranges of bytes to
178/// be in the same equivalence class, which means that we can never discover
179/// the true minimal set of equivalence classes.
180#[cfg(feature = "std")]
181#[derive(Debug)]
182pub struct ByteClassSet(Vec<bool>);
183
184#[cfg(feature = "std")]
185impl ByteClassSet {
186 /// Create a new set of byte classes where all bytes are part of the same
187 /// equivalence class.
188 pub fn new() -> Self {
189 ByteClassSet(vec![false; 256])
190 }
191
192 /// Indicate the the range of byte given (inclusive) can discriminate a
193 /// match between it and all other bytes outside of the range.
194 pub fn set_range(&mut self, start: u8, end: u8) {
195 debug_assert!(start <= end);
196 if start > 0 {
197 self.0[start as usize - 1] = true;
198 }
199 self.0[end as usize] = true;
200 }
201
202 /// Convert this boolean set to a map that maps all byte values to their
203 /// corresponding equivalence class. The last mapping indicates the largest
204 /// equivalence class identifier (which is never bigger than 255).
205 pub fn byte_classes(&self) -> ByteClasses {
206 let mut classes = ByteClasses::empty();
207 let mut class = 0u8;
208 let mut i = 0;
209 loop {
210 classes.set(i as u8, class as u8);
211 if i >= 255 {
212 break;
213 }
214 if self.0[i] {
215 class = class.checked_add(1).unwrap();
216 }
217 i += 1;
218 }
219 classes
220 }
221}
222
223#[cfg(test)]
224mod tests {
225 #[cfg(feature = "std")]
226 #[test]
227 fn byte_classes() {
228 use super::ByteClassSet;
229
230 let mut set = ByteClassSet::new();
231 set.set_range(b'a', b'z');
232
233 let classes = set.byte_classes();
234 assert_eq!(classes.get(0), 0);
235 assert_eq!(classes.get(1), 0);
236 assert_eq!(classes.get(2), 0);
237 assert_eq!(classes.get(b'a' - 1), 0);
238 assert_eq!(classes.get(b'a'), 1);
239 assert_eq!(classes.get(b'm'), 1);
240 assert_eq!(classes.get(b'z'), 1);
241 assert_eq!(classes.get(b'z' + 1), 2);
242 assert_eq!(classes.get(254), 2);
243 assert_eq!(classes.get(255), 2);
244
245 let mut set = ByteClassSet::new();
246 set.set_range(0, 2);
247 set.set_range(4, 6);
248 let classes = set.byte_classes();
249 assert_eq!(classes.get(0), 0);
250 assert_eq!(classes.get(1), 0);
251 assert_eq!(classes.get(2), 0);
252 assert_eq!(classes.get(3), 1);
253 assert_eq!(classes.get(4), 2);
254 assert_eq!(classes.get(5), 2);
255 assert_eq!(classes.get(6), 2);
256 assert_eq!(classes.get(7), 3);
257 assert_eq!(classes.get(255), 3);
258 }
259
260 #[cfg(feature = "std")]
261 #[test]
262 fn full_byte_classes() {
263 use super::ByteClassSet;
264
265 let mut set = ByteClassSet::new();
266 for i in 0..256u16 {
267 set.set_range(i as u8, i as u8);
268 }
269 assert_eq!(set.byte_classes().alphabet_len(), 256);
270 }
271}
272