classes.rs source code [crates/regex-automata-0.1.10/src/classes.rs]

1	use core::fmt;
2
3	/// A representation of byte oriented equivalence classes.
4	///
5	/// This is used in a DFA to reduce the size of the transition table. This can
6	/// have a particularly large impact not only on the total size of a dense DFA,
7	/// but also on compile times.
8	#[derive(Clone, Copy)]
9	pub struct ByteClasses([u8; `256`]);
10
11	impl ByteClasses {
12	/// Creates a new set of equivalence classes where all bytes are mapped to
13	/// the same class.
14	pub fn empty() -> ByteClasses {
15	ByteClasses([`0`; `256`])
16	}
17
18	/// Creates a new set of equivalence classes where each byte belongs to
19	/// its own equivalence class.
20	pub fn singletons() -> ByteClasses {
21	let mut classes = ByteClasses::empty();
22	for i in `0`..`256` {
23	classes.set(i as u8, i as u8);
24	}
25	classes
26	}
27
28	/// Copies the byte classes given. The given slice must have length 0 or
29	/// length 256. Slices of length 0 are treated as singletons (every byte
30	/// is its own class).
31	pub fn from_slice(slice: &[u8]) -> ByteClasses {
32	assert!(slice.is_empty() \|\| slice.len() == `256`);
33
34	if slice.is_empty() {
35	ByteClasses::singletons()
36	} else {
37	let mut classes = ByteClasses::empty();
38	for (b, &class) in slice.iter().enumerate() {
39	classes.set(b as u8, class);
40	}
41	classes
42	}
43	}
44
45	/// Set the equivalence class for the given byte.
46	#[inline]
47	pub fn set(&mut self, byte: u8, class: u8) {
48	self.0[byte as usize] = class;
49	}
50
51	/// Get the equivalence class for the given byte.
52	#[inline]
53	pub fn get(&self, byte: u8) -> u8 {
54	self.0[byte as usize]
55	}
56
57	/// Get the equivalence class for the given byte while forcefully
58	/// eliding bounds checks.
59	#[inline]
60	pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
61	self.0.get_unchecked(byte as usize*)
62	}
63
64	/// Return the total number of elements in the alphabet represented by
65	/// these equivalence classes. Equivalently, this returns the total number
66	/// of equivalence classes.
67	#[inline]
68	pub fn alphabet_len(&self) -> usize {
69	self.0[`255`] as usize + `1`
70	}
71
72	/// Returns true if and only if every byte in this class maps to its own
73	/// equivalence class. Equivalently, there are 256 equivalence classes
74	/// and each class contains exactly one byte.
75	#[inline]
76	pub fn is_singleton(&self) -> bool {
77	self.alphabet_len() == `256`
78	}
79
80	/// Returns an iterator over a sequence of representative bytes from each
81	/// equivalence class. Namely, this yields exactly N items, where N is
82	/// equivalent to the number of equivalence classes. Each item is an
83	/// arbitrary byte drawn from each equivalence class.
84	///
85	/// This is useful when one is determinizing an NFA and the NFA's alphabet
86	/// hasn't been converted to equivalence classes yet. Picking an arbitrary
87	/// byte from each equivalence class then permits a full exploration of
88	/// the NFA instead of using every possible byte value.
89	#[cfg(feature = "std")]
90	pub fn representatives(&self) -> ByteClassRepresentatives {
91	ByteClassRepresentatives { classes: self, byte: `0`, last_class: None }
92	}
93
94	/// Returns all of the bytes in the given equivalence class.
95	///
96	/// The second element in the tuple indicates the number of elements in
97	/// the array.
98	fn elements(&self, equiv: u8) -> ([u8; `256`], usize) {
99	let (mut array, mut len) = ([`0`; `256`], `0`);
100	for b in `0`..`256` {
101	if self.get(b as u8) == equiv {
102	array[len] = b as u8;
103	len += `1`;
104	}
105	}
106	(array, len)
107	}
108	}
109
110	impl fmt::Debug for ByteClasses {
111	fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
112	if self.is_singleton() {
113	write!(f, "ByteClasses(`{{`singletons`}}`)")
114	} else {
115	write!(f, "ByteClasses(")?;
116	for equiv: usize in `0`..self.alphabet_len() {
117	let (members: [u8; 256], len: usize) = self.elements(equiv as u8);
118	write!(f, "{} => {:?}", equiv, &members[..len])?;
119	}
120	write!(f, ")")
121	}
122	}
123	}
124
125	/// An iterator over representative bytes from each equivalence class.
126	#[cfg(feature = "std")]
127	#[derive(Debug)]
128	pub struct ByteClassRepresentatives<'a> {
129	classes: &'a ByteClasses,
130	byte: usize,
131	last_class: Option<u8>,
132	}
133
134	#[cfg(feature = "std")]
135	impl<'a> Iterator for ByteClassRepresentatives<'a> {
136	type Item = u8;
137
138	fn next(&mut self) -> Option<u8> {
139	while self.byte < `256` {
140	let byte: u8 = self.byte as u8;
141	let class: u8 = self.classes.get(byte);
142	self.byte += `1`;
143
144	if self.last_class != Some(class) {
145	self.last_class = Some(class);
146	return Some(byte);
147	}
148	}
149	None
150	}
151	}
152
153	/// A byte class set keeps track of an approximation* of equivalence classes*
154	/// of bytes during NFA construction. That is, every byte in an equivalence
155	/// class cannot discriminate between a match and a non-match.
156	///
157	/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
158	/// same equivalence class because it never matters whether an `a` or a `b` is
159	/// seen, and no combination of `a`s and `b`s in the text can discriminate
160	/// a match.
161	///
162	/// Note though that this does not compute the minimal set of equivalence
163	/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
164	/// same equivalence class for the same reason that `a` and `b` are in the
165	/// same equivalence class in the aforementioned regex. However, in this
166	/// implementation, `a` and `c` are put into distinct equivalence classes.
167	/// The reason for this is implementation complexity. In the future, we should
168	/// endeavor to compute the minimal equivalence classes since they can have a
169	/// rather large impact on the size of the DFA.
170	///
171	/// The representation here is 256 booleans, all initially set to false. Each
172	/// boolean maps to its corresponding byte based on position. A `true` value
173	/// indicates the end of an equivalence class, where its corresponding byte
174	/// and all of the bytes corresponding to all previous contiguous `false`
175	/// values are in the same equivalence class.
176	///
177	/// This particular representation only permits contiguous ranges of bytes to
178	/// be in the same equivalence class, which means that we can never discover
179	/// the true minimal set of equivalence classes.
180	#[cfg(feature = "std")]
181	#[derive(Debug)]
182	pub struct ByteClassSet(Vec<bool>);
183
184	#[cfg(feature = "std")]
185	impl ByteClassSet {
186	/// Create a new set of byte classes where all bytes are part of the same
187	/// equivalence class.
188	pub fn new() -> Self {
189	ByteClassSet(vec![`false`; `256`])
190	}
191
192	/// Indicate the the range of byte given (inclusive) can discriminate a
193	/// match between it and all other bytes outside of the range.
194	pub fn set_range(&mut self, start: u8, end: u8) {
195	debug_assert!(start <= end);
196	if start > `0` {
197	self.0[start as usize - `1`] = `true`;
198	}
199	self.0[end as usize] = `true`;
200	}
201
202	/// Convert this boolean set to a map that maps all byte values to their
203	/// corresponding equivalence class. The last mapping indicates the largest
204	/// equivalence class identifier (which is never bigger than 255).
205	pub fn byte_classes(&self) -> ByteClasses {
206	let mut classes = ByteClasses::empty();
207	let mut class = `0u8`;
208	let mut i = `0`;
209	loop {
210	classes.set(i as u8, class as u8);
211	if i >= `255` {
212	break;
213	}
214	if self.0[i] {
215	class = class.checked_add(`1`).unwrap();
216	}
217	i += `1`;
218	}
219	classes
220	}
221	}
222
223	#[cfg(test)]
224	mod tests {
225	#[cfg(feature = "std")]
226	#[test]
227	fn byte_classes() {
228	use super::ByteClassSet;
229
230	let mut set = ByteClassSet::new();
231	set.set_range(b'a', b'z');
232
233	let classes = set.byte_classes();
234	assert_eq!(classes.get(`0`), `0`);
235	assert_eq!(classes.get(`1`), `0`);
236	assert_eq!(classes.get(`2`), `0`);
237	assert_eq!(classes.get(b'a' - `1`), `0`);
238	assert_eq!(classes.get(b'a'), `1`);
239	assert_eq!(classes.get(b'm'), `1`);
240	assert_eq!(classes.get(b'z'), `1`);
241	assert_eq!(classes.get(b'z' + `1`), `2`);
242	assert_eq!(classes.get(`254`), `2`);
243	assert_eq!(classes.get(`255`), `2`);
244
245	let mut set = ByteClassSet::new();
246	set.set_range(`0`, `2`);
247	set.set_range(`4`, `6`);
248	let classes = set.byte_classes();
249	assert_eq!(classes.get(`0`), `0`);
250	assert_eq!(classes.get(`1`), `0`);
251	assert_eq!(classes.get(`2`), `0`);
252	assert_eq!(classes.get(`3`), `1`);
253	assert_eq!(classes.get(`4`), `2`);
254	assert_eq!(classes.get(`5`), `2`);
255	assert_eq!(classes.get(`6`), `2`);
256	assert_eq!(classes.get(`7`), `3`);
257	assert_eq!(classes.get(`255`), `3`);
258	}
259
260	#[cfg(feature = "std")]
261	#[test]
262	fn full_byte_classes() {
263	use super::ByteClassSet;
264
265	let mut set = ByteClassSet::new();
266	for i in `0`..`256u16` {
267	set.set_range(i as u8, i as u8);
268	}
269	assert_eq!(set.byte_classes().alphabet_len(), `256`);
270	}
271	}
272