1 | use std::fmt; |
2 | |
3 | /// A representation of byte oriented equivalence classes. |
4 | /// |
5 | /// This is used in an FSM to reduce the size of the transition table. This can |
6 | /// have a particularly large impact not only on the total size of an FSM, but |
7 | /// also on compile times. |
8 | #[derive (Clone, Copy)] |
9 | pub struct ByteClasses([u8; 256]); |
10 | |
11 | impl ByteClasses { |
12 | /// Creates a new set of equivalence classes where all bytes are mapped to |
13 | /// the same class. |
14 | pub fn empty() -> ByteClasses { |
15 | ByteClasses([0; 256]) |
16 | } |
17 | |
18 | /// Creates a new set of equivalence classes where each byte belongs to |
19 | /// its own equivalence class. |
20 | pub fn singletons() -> ByteClasses { |
21 | let mut classes = ByteClasses::empty(); |
22 | for i in 0..256 { |
23 | classes.set(i as u8, i as u8); |
24 | } |
25 | classes |
26 | } |
27 | |
28 | /// Set the equivalence class for the given byte. |
29 | #[inline ] |
30 | pub fn set(&mut self, byte: u8, class: u8) { |
31 | self.0[byte as usize] = class; |
32 | } |
33 | |
34 | /// Get the equivalence class for the given byte. |
35 | #[inline ] |
36 | pub fn get(&self, byte: u8) -> u8 { |
37 | // SAFETY: This is safe because all dense transitions have |
38 | // exactly 256 elements, so all u8 values are valid indices. |
39 | self.0[byte as usize] |
40 | } |
41 | |
42 | /// Return the total number of elements in the alphabet represented by |
43 | /// these equivalence classes. Equivalently, this returns the total number |
44 | /// of equivalence classes. |
45 | #[inline ] |
46 | pub fn alphabet_len(&self) -> usize { |
47 | self.0[255] as usize + 1 |
48 | } |
49 | |
50 | /// Returns true if and only if every byte in this class maps to its own |
51 | /// equivalence class. Equivalently, there are 256 equivalence classes |
52 | /// and each class contains exactly one byte. |
53 | #[inline ] |
54 | pub fn is_singleton(&self) -> bool { |
55 | self.alphabet_len() == 256 |
56 | } |
57 | |
58 | /// Returns an iterator over a sequence of representative bytes from each |
59 | /// equivalence class. Namely, this yields exactly N items, where N is |
60 | /// equivalent to the number of equivalence classes. Each item is an |
61 | /// arbitrary byte drawn from each equivalence class. |
62 | /// |
63 | /// This is useful when one is determinizing an NFA and the NFA's alphabet |
64 | /// hasn't been converted to equivalence classes yet. Picking an arbitrary |
65 | /// byte from each equivalence class then permits a full exploration of |
66 | /// the NFA instead of using every possible byte value. |
67 | pub fn representatives(&self) -> ByteClassRepresentatives<'_> { |
68 | ByteClassRepresentatives { classes: self, byte: 0, last_class: None } |
69 | } |
70 | |
71 | /// Returns all of the bytes in the given equivalence class. |
72 | /// |
73 | /// The second element in the tuple indicates the number of elements in |
74 | /// the array. |
75 | fn elements(&self, equiv: u8) -> ([u8; 256], usize) { |
76 | let (mut array, mut len) = ([0; 256], 0); |
77 | for b in 0..256 { |
78 | if self.get(b as u8) == equiv { |
79 | array[len] = b as u8; |
80 | len += 1; |
81 | } |
82 | } |
83 | (array, len) |
84 | } |
85 | } |
86 | |
87 | impl fmt::Debug for ByteClasses { |
88 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
89 | if self.is_singleton() { |
90 | write!(f, "ByteClasses( {{singletons }})" ) |
91 | } else { |
92 | write!(f, "ByteClasses(" )?; |
93 | for equiv: usize in 0..self.alphabet_len() { |
94 | let (members: [u8; 256], len: usize) = self.elements(equiv as u8); |
95 | write!(f, " {} => {:?}" , equiv, &members[..len])?; |
96 | } |
97 | write!(f, ")" ) |
98 | } |
99 | } |
100 | } |
101 | |
102 | /// An iterator over representative bytes from each equivalence class. |
103 | #[derive (Debug)] |
104 | pub struct ByteClassRepresentatives<'a> { |
105 | classes: &'a ByteClasses, |
106 | byte: usize, |
107 | last_class: Option<u8>, |
108 | } |
109 | |
110 | impl<'a> Iterator for ByteClassRepresentatives<'a> { |
111 | type Item = u8; |
112 | |
113 | fn next(&mut self) -> Option<u8> { |
114 | while self.byte < 256 { |
115 | let byte: u8 = self.byte as u8; |
116 | let class: u8 = self.classes.get(byte); |
117 | self.byte += 1; |
118 | |
119 | if self.last_class != Some(class) { |
120 | self.last_class = Some(class); |
121 | return Some(byte); |
122 | } |
123 | } |
124 | None |
125 | } |
126 | } |
127 | |
128 | /// A byte class builder keeps track of an *approximation* of equivalence |
129 | /// classes of bytes during NFA construction. That is, every byte in an |
130 | /// equivalence class cannot discriminate between a match and a non-match. |
131 | /// |
132 | /// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w] |
133 | /// and [{-\xFF] never discriminate between a match and a non-match, precisely |
134 | /// because they never occur in the literals anywhere. |
135 | /// |
136 | /// Note though that this does not necessarily compute the minimal set of |
137 | /// equivalence classes. For example, in the literals above, the byte ranges |
138 | /// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence |
139 | /// classes even though they could be treated a single class. The reason for |
140 | /// this is implementation complexity. In the future, we should endeavor to |
141 | /// compute the minimal equivalence classes since they can have a rather large |
142 | /// impact on the size of the DFA. |
143 | /// |
144 | /// The representation here is 256 booleans, all initially set to false. Each |
145 | /// boolean maps to its corresponding byte based on position. A `true` value |
146 | /// indicates the end of an equivalence class, where its corresponding byte |
147 | /// and all of the bytes corresponding to all previous contiguous `false` |
148 | /// values are in the same equivalence class. |
149 | /// |
150 | /// This particular representation only permits contiguous ranges of bytes to |
151 | /// be in the same equivalence class, which means that we can never discover |
152 | /// the true minimal set of equivalence classes. |
153 | #[derive (Debug)] |
154 | pub struct ByteClassBuilder(Vec<bool>); |
155 | |
156 | impl ByteClassBuilder { |
157 | /// Create a new builder of byte classes where all bytes are part of the |
158 | /// same equivalence class. |
159 | pub fn new() -> ByteClassBuilder { |
160 | ByteClassBuilder(vec![false; 256]) |
161 | } |
162 | |
163 | /// Indicate the the range of byte given (inclusive) can discriminate a |
164 | /// match between it and all other bytes outside of the range. |
165 | pub fn set_range(&mut self, start: u8, end: u8) { |
166 | debug_assert!(start <= end); |
167 | if start > 0 { |
168 | self.0[start as usize - 1] = true; |
169 | } |
170 | self.0[end as usize] = true; |
171 | } |
172 | |
173 | /// Build byte classes that map all byte values to their corresponding |
174 | /// equivalence class. The last mapping indicates the largest equivalence |
175 | /// class identifier (which is never bigger than 255). |
176 | pub fn build(&self) -> ByteClasses { |
177 | let mut classes = ByteClasses::empty(); |
178 | let mut class = 0u8; |
179 | let mut i = 0; |
180 | loop { |
181 | classes.set(i as u8, class as u8); |
182 | if i >= 255 { |
183 | break; |
184 | } |
185 | if self.0[i] { |
186 | class = class.checked_add(1).unwrap(); |
187 | } |
188 | i += 1; |
189 | } |
190 | classes |
191 | } |
192 | } |
193 | |
194 | #[cfg (test)] |
195 | mod tests { |
196 | use super::*; |
197 | |
198 | #[test ] |
199 | fn byte_classes() { |
200 | let mut set = ByteClassBuilder::new(); |
201 | set.set_range(b'a' , b'z' ); |
202 | |
203 | let classes = set.build(); |
204 | assert_eq!(classes.get(0), 0); |
205 | assert_eq!(classes.get(1), 0); |
206 | assert_eq!(classes.get(2), 0); |
207 | assert_eq!(classes.get(b'a' - 1), 0); |
208 | assert_eq!(classes.get(b'a' ), 1); |
209 | assert_eq!(classes.get(b'm' ), 1); |
210 | assert_eq!(classes.get(b'z' ), 1); |
211 | assert_eq!(classes.get(b'z' + 1), 2); |
212 | assert_eq!(classes.get(254), 2); |
213 | assert_eq!(classes.get(255), 2); |
214 | |
215 | let mut set = ByteClassBuilder::new(); |
216 | set.set_range(0, 2); |
217 | set.set_range(4, 6); |
218 | let classes = set.build(); |
219 | assert_eq!(classes.get(0), 0); |
220 | assert_eq!(classes.get(1), 0); |
221 | assert_eq!(classes.get(2), 0); |
222 | assert_eq!(classes.get(3), 1); |
223 | assert_eq!(classes.get(4), 2); |
224 | assert_eq!(classes.get(5), 2); |
225 | assert_eq!(classes.get(6), 2); |
226 | assert_eq!(classes.get(7), 3); |
227 | assert_eq!(classes.get(255), 3); |
228 | } |
229 | |
230 | #[test ] |
231 | fn full_byte_classes() { |
232 | let mut set = ByteClassBuilder::new(); |
233 | for i in 0..256u16 { |
234 | set.set_range(i as u8, i as u8); |
235 | } |
236 | assert_eq!(set.build().alphabet_len(), 256); |
237 | } |
238 | } |
239 | |