1 | #[cfg (feature = "std" )] |
2 | use core::fmt; |
3 | #[cfg (feature = "std" )] |
4 | use core::iter; |
5 | use core::marker::PhantomData; |
6 | use core::mem::size_of; |
7 | #[cfg (feature = "std" )] |
8 | use std::collections::HashMap; |
9 | |
10 | #[cfg (feature = "std" )] |
11 | use byteorder::{BigEndian, LittleEndian}; |
12 | use byteorder::{ByteOrder, NativeEndian}; |
13 | |
14 | use classes::ByteClasses; |
15 | use dense; |
16 | use dfa::DFA; |
17 | #[cfg (feature = "std" )] |
18 | use error::{Error, Result}; |
19 | #[cfg (feature = "std" )] |
20 | use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID}; |
21 | #[cfg (not(feature = "std" ))] |
22 | use state_id::{dead_id, StateID}; |
23 | |
24 | /// A sparse table-based deterministic finite automaton (DFA). |
25 | /// |
26 | /// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a |
27 | /// more space efficient representation for its transition table. Consequently, |
28 | /// sparse DFAs can use much less memory than dense DFAs, but this comes at a |
29 | /// price. In particular, reading the more space efficient transitions takes |
30 | /// more work, and consequently, searching using a sparse DFA is typically |
31 | /// slower than a dense DFA. |
32 | /// |
33 | /// A sparse DFA can be built using the default configuration via the |
34 | /// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise, |
35 | /// one can configure various aspects of a dense DFA via |
36 | /// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense |
37 | /// DFA to a sparse DFA using |
38 | /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse). |
39 | /// |
40 | /// In general, a sparse DFA supports all the same operations as a dense DFA. |
41 | /// |
42 | /// Making the choice between a dense and sparse DFA depends on your specific |
43 | /// work load. If you can sacrifice a bit of search time performance, then a |
44 | /// sparse DFA might be the best choice. In particular, while sparse DFAs are |
45 | /// probably always slower than dense DFAs, you may find that they are easily |
46 | /// fast enough for your purposes! |
47 | /// |
48 | /// # State size |
49 | /// |
50 | /// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to |
51 | /// the type of the DFA's transition table while `S` corresponds to the |
52 | /// representation used for the DFA's state identifiers as described by the |
53 | /// [`StateID`](trait.StateID.html) trait. This type parameter is typically |
54 | /// `usize`, but other valid choices provided by this crate include `u8`, |
55 | /// `u16`, `u32` and `u64`. The primary reason for choosing a different state |
56 | /// identifier representation than the default is to reduce the amount of |
57 | /// memory used by a DFA. Note though, that if the chosen representation cannot |
58 | /// accommodate the size of your DFA, then building the DFA will fail and |
59 | /// return an error. |
60 | /// |
61 | /// While the reduction in heap memory used by a DFA is one reason for choosing |
62 | /// a smaller state identifier representation, another possible reason is for |
63 | /// decreasing the serialization size of a DFA, as returned by |
64 | /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian), |
65 | /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) |
66 | /// or |
67 | /// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian). |
68 | /// |
69 | /// The type of the transition table is typically either `Vec<u8>` or `&[u8]`, |
70 | /// depending on where the transition table is stored. Note that this is |
71 | /// different than a dense DFA, whose transition table is typically |
72 | /// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads |
73 | /// its transition table from raw bytes because the table is compactly packed. |
74 | /// |
75 | /// # Variants |
76 | /// |
77 | /// This DFA is defined as a non-exhaustive enumeration of different types of |
78 | /// dense DFAs. All of the variants use the same internal representation |
79 | /// for the transition table, but they vary in how the transition table is |
80 | /// read. A DFA's specific variant depends on the configuration options set via |
81 | /// [`dense::Builder`](dense/struct.Builder.html). The default variant is |
82 | /// `ByteClass`. |
83 | /// |
84 | /// # The `DFA` trait |
85 | /// |
86 | /// This type implements the [`DFA`](trait.DFA.html) trait, which means it |
87 | /// can be used for searching. For example: |
88 | /// |
89 | /// ``` |
90 | /// use regex_automata::{DFA, SparseDFA}; |
91 | /// |
92 | /// # fn example() -> Result<(), regex_automata::Error> { |
93 | /// let dfa = SparseDFA::new("foo[0-9]+" )?; |
94 | /// assert_eq!(Some(8), dfa.find(b"foo12345" )); |
95 | /// # Ok(()) }; example().unwrap() |
96 | /// ``` |
97 | /// |
98 | /// The `DFA` trait also provides an assortment of other lower level methods |
99 | /// for DFAs, such as `start_state` and `next_state`. While these are correctly |
100 | /// implemented, it is an anti-pattern to use them in performance sensitive |
101 | /// code on the `SparseDFA` type directly. Namely, each implementation requires |
102 | /// a branch to determine which type of sparse DFA is being used. Instead, |
103 | /// this branch should be pushed up a layer in the code since walking the |
104 | /// transitions of a DFA is usually a hot path. If you do need to use these |
105 | /// lower level methods in performance critical code, then you should match on |
106 | /// the variants of this DFA and use each variant's implementation of the `DFA` |
107 | /// trait directly. |
108 | #[derive(Clone, Debug)] |
109 | pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> { |
110 | /// A standard DFA that does not use byte classes. |
111 | Standard(Standard<T, S>), |
112 | /// A DFA that shrinks its alphabet to a set of equivalence classes instead |
113 | /// of using all possible byte values. Any two bytes belong to the same |
114 | /// equivalence class if and only if they can be used interchangeably |
115 | /// anywhere in the DFA while never discriminating between a match and a |
116 | /// non-match. |
117 | /// |
118 | /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much |
119 | /// from using byte classes. In some cases, using byte classes can even |
120 | /// marginally increase the size of a sparse DFA's transition table. The |
121 | /// reason for this is that a sparse DFA already compacts each state's |
122 | /// transitions separate from whether byte classes are used. |
123 | ByteClass(ByteClass<T, S>), |
124 | /// Hints that destructuring should not be exhaustive. |
125 | /// |
126 | /// This enum may grow additional variants, so this makes sure clients |
127 | /// don't count on exhaustive matching. (Otherwise, adding a new variant |
128 | /// could break existing code.) |
129 | #[doc (hidden)] |
130 | __Nonexhaustive, |
131 | } |
132 | |
133 | #[cfg (feature = "std" )] |
134 | impl SparseDFA<Vec<u8>, usize> { |
135 | /// Parse the given regular expression using a default configuration and |
136 | /// return the corresponding sparse DFA. |
137 | /// |
138 | /// The default configuration uses `usize` for state IDs and reduces the |
139 | /// alphabet size by splitting bytes into equivalence classes. The |
140 | /// resulting DFA is *not* minimized. |
141 | /// |
142 | /// If you want a non-default configuration, then use the |
143 | /// [`dense::Builder`](dense/struct.Builder.html) |
144 | /// to set your own configuration, and then call |
145 | /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse) |
146 | /// to create a sparse DFA. |
147 | /// |
148 | /// # Example |
149 | /// |
150 | /// ``` |
151 | /// use regex_automata::{DFA, SparseDFA}; |
152 | /// |
153 | /// # fn example() -> Result<(), regex_automata::Error> { |
154 | /// let dfa = SparseDFA::new("foo[0-9]+bar" )?; |
155 | /// assert_eq!(Some(11), dfa.find(b"foo12345bar" )); |
156 | /// # Ok(()) }; example().unwrap() |
157 | /// ``` |
158 | pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> { |
159 | dense::Builder::new() |
160 | .build(pattern) |
161 | .and_then(|dense| dense.to_sparse()) |
162 | } |
163 | } |
164 | |
165 | #[cfg (feature = "std" )] |
166 | impl<S: StateID> SparseDFA<Vec<u8>, S> { |
167 | /// Create a new empty sparse DFA that never matches any input. |
168 | /// |
169 | /// # Example |
170 | /// |
171 | /// In order to build an empty DFA, callers must provide a type hint |
172 | /// indicating their choice of state identifier representation. |
173 | /// |
174 | /// ``` |
175 | /// use regex_automata::{DFA, SparseDFA}; |
176 | /// |
177 | /// # fn example() -> Result<(), regex_automata::Error> { |
178 | /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty(); |
179 | /// assert_eq!(None, dfa.find(b"" )); |
180 | /// assert_eq!(None, dfa.find(b"foo" )); |
181 | /// # Ok(()) }; example().unwrap() |
182 | /// ``` |
183 | pub fn empty() -> SparseDFA<Vec<u8>, S> { |
184 | dense::DenseDFA::empty().to_sparse().unwrap() |
185 | } |
186 | |
187 | pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>( |
188 | dfa: &dense::Repr<T, S>, |
189 | ) -> Result<SparseDFA<Vec<u8>, A>> { |
190 | Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa()) |
191 | } |
192 | } |
193 | |
194 | impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { |
195 | /// Cheaply return a borrowed version of this sparse DFA. Specifically, the |
196 | /// DFA returned always uses `&[u8]` for its transition table while keeping |
197 | /// the same state identifier representation. |
198 | pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> { |
199 | match *self { |
200 | SparseDFA::Standard(Standard(ref r)) => { |
201 | SparseDFA::Standard(Standard(r.as_ref())) |
202 | } |
203 | SparseDFA::ByteClass(ByteClass(ref r)) => { |
204 | SparseDFA::ByteClass(ByteClass(r.as_ref())) |
205 | } |
206 | SparseDFA::__Nonexhaustive => unreachable!(), |
207 | } |
208 | } |
209 | |
210 | /// Return an owned version of this sparse DFA. Specifically, the DFA |
211 | /// returned always uses `Vec<u8>` for its transition table while keeping |
212 | /// the same state identifier representation. |
213 | /// |
214 | /// Effectively, this returns a sparse DFA whose transition table lives |
215 | /// on the heap. |
216 | #[cfg (feature = "std" )] |
217 | pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> { |
218 | match *self { |
219 | SparseDFA::Standard(Standard(ref r)) => { |
220 | SparseDFA::Standard(Standard(r.to_owned())) |
221 | } |
222 | SparseDFA::ByteClass(ByteClass(ref r)) => { |
223 | SparseDFA::ByteClass(ByteClass(r.to_owned())) |
224 | } |
225 | SparseDFA::__Nonexhaustive => unreachable!(), |
226 | } |
227 | } |
228 | |
229 | /// Returns the memory usage, in bytes, of this DFA. |
230 | /// |
231 | /// The memory usage is computed based on the number of bytes used to |
232 | /// represent this DFA's transition table. This typically corresponds to |
233 | /// heap memory usage. |
234 | /// |
235 | /// This does **not** include the stack size used up by this DFA. To |
236 | /// compute that, used `std::mem::size_of::<SparseDFA>()`. |
237 | pub fn memory_usage(&self) -> usize { |
238 | self.repr().memory_usage() |
239 | } |
240 | |
241 | fn repr(&self) -> &Repr<T, S> { |
242 | match *self { |
243 | SparseDFA::Standard(ref r) => &r.0, |
244 | SparseDFA::ByteClass(ref r) => &r.0, |
245 | SparseDFA::__Nonexhaustive => unreachable!(), |
246 | } |
247 | } |
248 | } |
249 | |
250 | /// Routines for converting a sparse DFA to other representations, such as |
251 | /// smaller state identifiers or raw bytes suitable for persistent storage. |
252 | #[cfg (feature = "std" )] |
253 | impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { |
254 | /// Create a new sparse DFA whose match semantics are equivalent to |
255 | /// this DFA, but attempt to use `u8` for the representation of state |
256 | /// identifiers. If `u8` is insufficient to represent all state identifiers |
257 | /// in this DFA, then this returns an error. |
258 | /// |
259 | /// This is a convenience routine for `to_sized::<u8>()`. |
260 | pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> { |
261 | self.to_sized() |
262 | } |
263 | |
264 | /// Create a new sparse DFA whose match semantics are equivalent to |
265 | /// this DFA, but attempt to use `u16` for the representation of state |
266 | /// identifiers. If `u16` is insufficient to represent all state |
267 | /// identifiers in this DFA, then this returns an error. |
268 | /// |
269 | /// This is a convenience routine for `to_sized::<u16>()`. |
270 | pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> { |
271 | self.to_sized() |
272 | } |
273 | |
274 | /// Create a new sparse DFA whose match semantics are equivalent to |
275 | /// this DFA, but attempt to use `u32` for the representation of state |
276 | /// identifiers. If `u32` is insufficient to represent all state |
277 | /// identifiers in this DFA, then this returns an error. |
278 | /// |
279 | /// This is a convenience routine for `to_sized::<u32>()`. |
280 | #[cfg (any(target_pointer_width = "32" , target_pointer_width = "64" ))] |
281 | pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> { |
282 | self.to_sized() |
283 | } |
284 | |
285 | /// Create a new sparse DFA whose match semantics are equivalent to |
286 | /// this DFA, but attempt to use `u64` for the representation of state |
287 | /// identifiers. If `u64` is insufficient to represent all state |
288 | /// identifiers in this DFA, then this returns an error. |
289 | /// |
290 | /// This is a convenience routine for `to_sized::<u64>()`. |
291 | #[cfg (target_pointer_width = "64" )] |
292 | pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> { |
293 | self.to_sized() |
294 | } |
295 | |
296 | /// Create a new sparse DFA whose match semantics are equivalent to |
297 | /// this DFA, but attempt to use `A` for the representation of state |
298 | /// identifiers. If `A` is insufficient to represent all state identifiers |
299 | /// in this DFA, then this returns an error. |
300 | /// |
301 | /// An alternative way to construct such a DFA is to use |
302 | /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized). |
303 | /// In general, picking the appropriate size upon initial construction of |
304 | /// a sparse DFA is preferred, since it will do the conversion in one |
305 | /// step instead of two. |
306 | pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> { |
307 | self.repr().to_sized().map(|r| r.into_sparse_dfa()) |
308 | } |
309 | |
310 | /// Serialize a sparse DFA to raw bytes in little endian format. |
311 | /// |
312 | /// If the state identifier representation of this DFA has a size different |
313 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |
314 | /// implementations of `StateID` provided by this crate satisfy this |
315 | /// requirement. |
316 | pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> { |
317 | self.repr().to_bytes::<LittleEndian>() |
318 | } |
319 | |
320 | /// Serialize a sparse DFA to raw bytes in big endian format. |
321 | /// |
322 | /// If the state identifier representation of this DFA has a size different |
323 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |
324 | /// implementations of `StateID` provided by this crate satisfy this |
325 | /// requirement. |
326 | pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> { |
327 | self.repr().to_bytes::<BigEndian>() |
328 | } |
329 | |
330 | /// Serialize a sparse DFA to raw bytes in native endian format. |
331 | /// Generally, it is better to pick an explicit endianness using either |
332 | /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is |
333 | /// useful in tests where the DFA is serialized and deserialized on the |
334 | /// same platform. |
335 | /// |
336 | /// If the state identifier representation of this DFA has a size different |
337 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |
338 | /// implementations of `StateID` provided by this crate satisfy this |
339 | /// requirement. |
340 | pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> { |
341 | self.repr().to_bytes::<NativeEndian>() |
342 | } |
343 | } |
344 | |
345 | impl<'a, S: StateID> SparseDFA<&'a [u8], S> { |
346 | /// Deserialize a sparse DFA with a specific state identifier |
347 | /// representation. |
348 | /// |
349 | /// Deserializing a DFA using this routine will never allocate heap memory. |
350 | /// This is also guaranteed to be a constant time operation that does not |
351 | /// vary with the size of the DFA. |
352 | /// |
353 | /// The bytes given should be generated by the serialization of a DFA with |
354 | /// either the |
355 | /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian) |
356 | /// method or the |
357 | /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian) |
358 | /// endian, depending on the endianness of the machine you are |
359 | /// deserializing this DFA from. |
360 | /// |
361 | /// If the state identifier representation is `usize`, then deserialization |
362 | /// is dependent on the pointer size. For this reason, it is best to |
363 | /// serialize DFAs using a fixed size representation for your state |
364 | /// identifiers, such as `u8`, `u16`, `u32` or `u64`. |
365 | /// |
366 | /// # Panics |
367 | /// |
368 | /// The bytes given should be *trusted*. In particular, if the bytes |
369 | /// are not a valid serialization of a DFA, or if the endianness of the |
370 | /// serialized bytes is different than the endianness of the machine that |
371 | /// is deserializing the DFA, then this routine will panic. Moreover, it |
372 | /// is possible for this deserialization routine to succeed even if the |
373 | /// given bytes do not represent a valid serialized sparse DFA. |
374 | /// |
375 | /// # Safety |
376 | /// |
377 | /// This routine is unsafe because it permits callers to provide an |
378 | /// arbitrary transition table with possibly incorrect transitions. While |
379 | /// the various serialization routines will never return an incorrect |
380 | /// transition table, there is no guarantee that the bytes provided here |
381 | /// are correct. While deserialization does many checks (as documented |
382 | /// above in the panic conditions), this routine does not check that the |
383 | /// transition table is correct. Given an incorrect transition table, it is |
384 | /// possible for the search routines to access out-of-bounds memory because |
385 | /// of explicit bounds check elision. |
386 | /// |
387 | /// # Example |
388 | /// |
389 | /// This example shows how to serialize a DFA to raw bytes, deserialize it |
390 | /// and then use it for searching. Note that we first convert the DFA to |
391 | /// using `u16` for its state identifier representation before serializing |
392 | /// it. While this isn't strictly necessary, it's good practice in order to |
393 | /// decrease the size of the DFA and to avoid platform specific pitfalls |
394 | /// such as differing pointer sizes. |
395 | /// |
396 | /// ``` |
397 | /// use regex_automata::{DFA, DenseDFA, SparseDFA}; |
398 | /// |
399 | /// # fn example() -> Result<(), regex_automata::Error> { |
400 | /// let sparse = SparseDFA::new("foo[0-9]+" )?; |
401 | /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?; |
402 | /// |
403 | /// let dfa: SparseDFA<&[u8], u16> = unsafe { |
404 | /// SparseDFA::from_bytes(&bytes) |
405 | /// }; |
406 | /// |
407 | /// assert_eq!(Some(8), dfa.find(b"foo12345" )); |
408 | /// # Ok(()) }; example().unwrap() |
409 | /// ``` |
410 | pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> { |
411 | Repr::from_bytes(buf).into_sparse_dfa() |
412 | } |
413 | } |
414 | |
415 | impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> { |
416 | type ID = S; |
417 | |
418 | #[inline ] |
419 | fn start_state(&self) -> S { |
420 | self.repr().start_state() |
421 | } |
422 | |
423 | #[inline ] |
424 | fn is_match_state(&self, id: S) -> bool { |
425 | self.repr().is_match_state(id) |
426 | } |
427 | |
428 | #[inline ] |
429 | fn is_dead_state(&self, id: S) -> bool { |
430 | self.repr().is_dead_state(id) |
431 | } |
432 | |
433 | #[inline ] |
434 | fn is_match_or_dead_state(&self, id: S) -> bool { |
435 | self.repr().is_match_or_dead_state(id) |
436 | } |
437 | |
438 | #[inline ] |
439 | fn is_anchored(&self) -> bool { |
440 | self.repr().is_anchored() |
441 | } |
442 | |
443 | #[inline ] |
444 | fn next_state(&self, current: S, input: u8) -> S { |
445 | match *self { |
446 | SparseDFA::Standard(ref r) => r.next_state(current, input), |
447 | SparseDFA::ByteClass(ref r) => r.next_state(current, input), |
448 | SparseDFA::__Nonexhaustive => unreachable!(), |
449 | } |
450 | } |
451 | |
452 | #[inline ] |
453 | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { |
454 | self.next_state(current, input) |
455 | } |
456 | |
457 | // We specialize the following methods because it lets us lift the |
458 | // case analysis between the different types of sparse DFAs. Instead of |
459 | // doing the case analysis for every transition, we do it once before |
460 | // searching. For sparse DFAs, this doesn't seem to benefit performance as |
461 | // much as it does for the dense DFAs, but it's easy to do so we might as |
462 | // well do it. |
463 | |
464 | #[inline ] |
465 | fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { |
466 | match *self { |
467 | SparseDFA::Standard(ref r) => r.is_match_at(bytes, start), |
468 | SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start), |
469 | SparseDFA::__Nonexhaustive => unreachable!(), |
470 | } |
471 | } |
472 | |
473 | #[inline ] |
474 | fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
475 | match *self { |
476 | SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start), |
477 | SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start), |
478 | SparseDFA::__Nonexhaustive => unreachable!(), |
479 | } |
480 | } |
481 | |
482 | #[inline ] |
483 | fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
484 | match *self { |
485 | SparseDFA::Standard(ref r) => r.find_at(bytes, start), |
486 | SparseDFA::ByteClass(ref r) => r.find_at(bytes, start), |
487 | SparseDFA::__Nonexhaustive => unreachable!(), |
488 | } |
489 | } |
490 | |
491 | #[inline ] |
492 | fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { |
493 | match *self { |
494 | SparseDFA::Standard(ref r) => r.rfind_at(bytes, start), |
495 | SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start), |
496 | SparseDFA::__Nonexhaustive => unreachable!(), |
497 | } |
498 | } |
499 | } |
500 | |
501 | /// A standard sparse DFA that does not use premultiplication or byte classes. |
502 | /// |
503 | /// Generally, it isn't necessary to use this type directly, since a |
504 | /// `SparseDFA` can be used for searching directly. One possible reason why |
505 | /// one might want to use this type directly is if you are implementing your |
506 | /// own search routines by walking a DFA's transitions directly. In that case, |
507 | /// you'll want to use this type (or any of the other DFA variant types) |
508 | /// directly, since they implement `next_state` more efficiently. |
509 | #[derive(Clone, Debug)] |
510 | pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); |
511 | |
512 | impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> { |
513 | type ID = S; |
514 | |
515 | #[inline ] |
516 | fn start_state(&self) -> S { |
517 | self.0.start_state() |
518 | } |
519 | |
520 | #[inline ] |
521 | fn is_match_state(&self, id: S) -> bool { |
522 | self.0.is_match_state(id) |
523 | } |
524 | |
525 | #[inline ] |
526 | fn is_dead_state(&self, id: S) -> bool { |
527 | self.0.is_dead_state(id) |
528 | } |
529 | |
530 | #[inline ] |
531 | fn is_match_or_dead_state(&self, id: S) -> bool { |
532 | self.0.is_match_or_dead_state(id) |
533 | } |
534 | |
535 | #[inline ] |
536 | fn is_anchored(&self) -> bool { |
537 | self.0.is_anchored() |
538 | } |
539 | |
540 | #[inline ] |
541 | fn next_state(&self, current: S, input: u8) -> S { |
542 | self.0.state(current).next(input) |
543 | } |
544 | |
545 | #[inline ] |
546 | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { |
547 | self.next_state(current, input) |
548 | } |
549 | } |
550 | |
551 | /// A sparse DFA that shrinks its alphabet. |
552 | /// |
553 | /// Alphabet shrinking is achieved by using a set of equivalence classes |
554 | /// instead of using all possible byte values. Any two bytes belong to the same |
555 | /// equivalence class if and only if they can be used interchangeably anywhere |
556 | /// in the DFA while never discriminating between a match and a non-match. |
557 | /// |
558 | /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from |
559 | /// using byte classes. In some cases, using byte classes can even marginally |
560 | /// increase the size of a sparse DFA's transition table. The reason for this |
561 | /// is that a sparse DFA already compacts each state's transitions separate |
562 | /// from whether byte classes are used. |
563 | /// |
564 | /// Generally, it isn't necessary to use this type directly, since a |
565 | /// `SparseDFA` can be used for searching directly. One possible reason why |
566 | /// one might want to use this type directly is if you are implementing your |
567 | /// own search routines by walking a DFA's transitions directly. In that case, |
568 | /// you'll want to use this type (or any of the other DFA variant types) |
569 | /// directly, since they implement `next_state` more efficiently. |
570 | #[derive(Clone, Debug)] |
571 | pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>); |
572 | |
573 | impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> { |
574 | type ID = S; |
575 | |
576 | #[inline ] |
577 | fn start_state(&self) -> S { |
578 | self.0.start_state() |
579 | } |
580 | |
581 | #[inline ] |
582 | fn is_match_state(&self, id: S) -> bool { |
583 | self.0.is_match_state(id) |
584 | } |
585 | |
586 | #[inline ] |
587 | fn is_dead_state(&self, id: S) -> bool { |
588 | self.0.is_dead_state(id) |
589 | } |
590 | |
591 | #[inline ] |
592 | fn is_match_or_dead_state(&self, id: S) -> bool { |
593 | self.0.is_match_or_dead_state(id) |
594 | } |
595 | |
596 | #[inline ] |
597 | fn is_anchored(&self) -> bool { |
598 | self.0.is_anchored() |
599 | } |
600 | |
601 | #[inline ] |
602 | fn next_state(&self, current: S, input: u8) -> S { |
603 | let input = self.0.byte_classes.get(input); |
604 | self.0.state(current).next(input) |
605 | } |
606 | |
607 | #[inline ] |
608 | unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { |
609 | self.next_state(current, input) |
610 | } |
611 | } |
612 | |
613 | /// The underlying representation of a sparse DFA. This is shared by all of |
614 | /// the different variants of a sparse DFA. |
615 | #[derive(Clone)] |
616 | #[cfg_attr (not(feature = "std" ), derive(Debug))] |
617 | struct Repr<T: AsRef<[u8]>, S: StateID = usize> { |
618 | anchored: bool, |
619 | start: S, |
620 | state_count: usize, |
621 | max_match: S, |
622 | byte_classes: ByteClasses, |
623 | trans: T, |
624 | } |
625 | |
626 | impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> { |
627 | fn into_sparse_dfa(self) -> SparseDFA<T, S> { |
628 | if self.byte_classes.is_singleton() { |
629 | SparseDFA::Standard(Standard(self)) |
630 | } else { |
631 | SparseDFA::ByteClass(ByteClass(self)) |
632 | } |
633 | } |
634 | |
635 | fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> { |
636 | Repr { |
637 | anchored: self.anchored, |
638 | start: self.start, |
639 | state_count: self.state_count, |
640 | max_match: self.max_match, |
641 | byte_classes: self.byte_classes.clone(), |
642 | trans: self.trans(), |
643 | } |
644 | } |
645 | |
646 | #[cfg (feature = "std" )] |
647 | fn to_owned(&self) -> Repr<Vec<u8>, S> { |
648 | Repr { |
649 | anchored: self.anchored, |
650 | start: self.start, |
651 | state_count: self.state_count, |
652 | max_match: self.max_match, |
653 | byte_classes: self.byte_classes.clone(), |
654 | trans: self.trans().to_vec(), |
655 | } |
656 | } |
657 | |
658 | /// Return a convenient representation of the given state. |
659 | /// |
660 | /// This is marked as inline because it doesn't seem to get inlined |
661 | /// otherwise, which leads to a fairly significant performance loss (~25%). |
662 | #[inline ] |
663 | fn state<'a>(&'a self, id: S) -> State<'a, S> { |
664 | let mut pos = id.to_usize(); |
665 | let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize; |
666 | pos += 2; |
667 | let input_ranges = &self.trans()[pos..pos + (ntrans * 2)]; |
668 | pos += 2 * ntrans; |
669 | let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())]; |
670 | State { _state_id_repr: PhantomData, ntrans, input_ranges, next } |
671 | } |
672 | |
673 | /// Return an iterator over all of the states in this DFA. |
674 | /// |
675 | /// The iterator returned yields tuples, where the first element is the |
676 | /// state ID and the second element is the state itself. |
677 | #[cfg (feature = "std" )] |
678 | fn states<'a>(&'a self) -> StateIter<'a, T, S> { |
679 | StateIter { dfa: self, id: dead_id() } |
680 | } |
681 | |
682 | fn memory_usage(&self) -> usize { |
683 | self.trans().len() |
684 | } |
685 | |
686 | fn start_state(&self) -> S { |
687 | self.start |
688 | } |
689 | |
690 | fn is_match_state(&self, id: S) -> bool { |
691 | self.is_match_or_dead_state(id) && !self.is_dead_state(id) |
692 | } |
693 | |
694 | fn is_dead_state(&self, id: S) -> bool { |
695 | id == dead_id() |
696 | } |
697 | |
698 | fn is_match_or_dead_state(&self, id: S) -> bool { |
699 | id <= self.max_match |
700 | } |
701 | |
702 | fn is_anchored(&self) -> bool { |
703 | self.anchored |
704 | } |
705 | |
706 | fn trans(&self) -> &[u8] { |
707 | self.trans.as_ref() |
708 | } |
709 | |
710 | /// Create a new sparse DFA whose match semantics are equivalent to this |
711 | /// DFA, but attempt to use `A` for the representation of state |
712 | /// identifiers. If `A` is insufficient to represent all state identifiers |
713 | /// in this DFA, then this returns an error. |
714 | #[cfg (feature = "std" )] |
715 | fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> { |
716 | // To build the new DFA, we proceed much like the initial construction |
717 | // of the sparse DFA. Namely, since the state ID size is changing, |
718 | // we don't actually know all of our state IDs until we've allocated |
719 | // all necessary space. So we do one pass that allocates all of the |
720 | // storage we need, and then another pass to fill in the transitions. |
721 | |
722 | let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count); |
723 | let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count); |
724 | for (old_id, state) in self.states() { |
725 | let pos = trans.len(); |
726 | map.insert(old_id, usize_to_state_id(pos)?); |
727 | |
728 | let n = state.ntrans; |
729 | let zeros = 2 + (n * 2) + (n * size_of::<A>()); |
730 | trans.extend(iter::repeat(0).take(zeros)); |
731 | |
732 | NativeEndian::write_u16(&mut trans[pos..], n as u16); |
733 | let (s, e) = (pos + 2, pos + 2 + (n * 2)); |
734 | trans[s..e].copy_from_slice(state.input_ranges); |
735 | } |
736 | |
737 | let mut new = Repr { |
738 | anchored: self.anchored, |
739 | start: map[&self.start], |
740 | state_count: self.state_count, |
741 | max_match: map[&self.max_match], |
742 | byte_classes: self.byte_classes.clone(), |
743 | trans, |
744 | }; |
745 | for (&old_id, &new_id) in map.iter() { |
746 | let old_state = self.state(old_id); |
747 | let mut new_state = new.state_mut(new_id); |
748 | for i in 0..new_state.ntrans { |
749 | let next = map[&old_state.next_at(i)]; |
750 | new_state.set_next_at(i, usize_to_state_id(next.to_usize())?); |
751 | } |
752 | } |
753 | new.start = map[&self.start]; |
754 | new.max_match = map[&self.max_match]; |
755 | Ok(new) |
756 | } |
757 | |
758 | /// Serialize a sparse DFA to raw bytes using the provided endianness. |
759 | /// |
760 | /// If the state identifier representation of this DFA has a size different |
761 | /// than 1, 2, 4 or 8 bytes, then this returns an error. All |
762 | /// implementations of `StateID` provided by this crate satisfy this |
763 | /// requirement. |
764 | /// |
765 | /// Unlike dense DFAs, the result is not necessarily aligned since a |
766 | /// sparse DFA's transition table is always read as a sequence of bytes. |
767 | #[cfg (feature = "std" )] |
768 | fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> { |
769 | let label = b"rust-regex-automata-sparse-dfa \x00" ; |
770 | let size = |
771 | // For human readable label. |
772 | label.len() |
773 | // endiannes check, must be equal to 0xFEFF for native endian |
774 | + 2 |
775 | // For version number. |
776 | + 2 |
777 | // Size of state ID representation, in bytes. |
778 | // Must be 1, 2, 4 or 8. |
779 | + 2 |
780 | // For DFA misc options. (Currently unused.) |
781 | + 2 |
782 | // For start state. |
783 | + 8 |
784 | // For state count. |
785 | + 8 |
786 | // For max match state. |
787 | + 8 |
788 | // For byte class map. |
789 | + 256 |
790 | // For transition table. |
791 | + self.trans().len(); |
792 | |
793 | let mut i = 0; |
794 | let mut buf = vec![0; size]; |
795 | |
796 | // write label |
797 | for &b in label { |
798 | buf[i] = b; |
799 | i += 1; |
800 | } |
801 | // endianness check |
802 | A::write_u16(&mut buf[i..], 0xFEFF); |
803 | i += 2; |
804 | // version number |
805 | A::write_u16(&mut buf[i..], 1); |
806 | i += 2; |
807 | // size of state ID |
808 | let state_size = size_of::<S>(); |
809 | if ![1, 2, 4, 8].contains(&state_size) { |
810 | return Err(Error::serialize(&format!( |
811 | "state size of {} not supported, must be 1, 2, 4 or 8" , |
812 | state_size |
813 | ))); |
814 | } |
815 | A::write_u16(&mut buf[i..], state_size as u16); |
816 | i += 2; |
817 | // DFA misc options |
818 | let mut options = 0u16; |
819 | if self.anchored { |
820 | options |= dense::MASK_ANCHORED; |
821 | } |
822 | A::write_u16(&mut buf[i..], options); |
823 | i += 2; |
824 | // start state |
825 | A::write_u64(&mut buf[i..], self.start.to_usize() as u64); |
826 | i += 8; |
827 | // state count |
828 | A::write_u64(&mut buf[i..], self.state_count as u64); |
829 | i += 8; |
830 | // max match state |
831 | A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64); |
832 | i += 8; |
833 | // byte class map |
834 | for b in (0..256).map(|b| b as u8) { |
835 | buf[i] = self.byte_classes.get(b); |
836 | i += 1; |
837 | } |
838 | // transition table |
839 | for (_, state) in self.states() { |
840 | A::write_u16(&mut buf[i..], state.ntrans as u16); |
841 | i += 2; |
842 | buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges); |
843 | i += state.ntrans * 2; |
844 | for j in 0..state.ntrans { |
845 | write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j)); |
846 | i += size_of::<S>(); |
847 | } |
848 | } |
849 | |
850 | assert_eq!(size, i, "expected to consume entire buffer" ); |
851 | |
852 | Ok(buf) |
853 | } |
854 | } |
855 | |
856 | impl<'a, S: StateID> Repr<&'a [u8], S> { |
857 | /// The implementation for deserializing a sparse DFA from raw bytes. |
858 | unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> { |
859 | // skip over label |
860 | match buf.iter().position(|&b| b == b' \x00' ) { |
861 | None => panic!("could not find label" ), |
862 | Some(i) => buf = &buf[i + 1..], |
863 | } |
864 | |
865 | // check that current endianness is same as endianness of DFA |
866 | let endian_check = NativeEndian::read_u16(buf); |
867 | buf = &buf[2..]; |
868 | if endian_check != 0xFEFF { |
869 | panic!( |
870 | "endianness mismatch, expected 0xFEFF but got 0x{:X}. \ |
871 | are you trying to load a SparseDFA serialized with a \ |
872 | different endianness?" , |
873 | endian_check, |
874 | ); |
875 | } |
876 | |
877 | // check that the version number is supported |
878 | let version = NativeEndian::read_u16(buf); |
879 | buf = &buf[2..]; |
880 | if version != 1 { |
881 | panic!( |
882 | "expected version 1, but found unsupported version {}" , |
883 | version, |
884 | ); |
885 | } |
886 | |
887 | // read size of state |
888 | let state_size = NativeEndian::read_u16(buf) as usize; |
889 | if state_size != size_of::<S>() { |
890 | panic!( |
891 | "state size of SparseDFA ({}) does not match \ |
892 | requested state size ({})" , |
893 | state_size, |
894 | size_of::<S>(), |
895 | ); |
896 | } |
897 | buf = &buf[2..]; |
898 | |
899 | // read miscellaneous options |
900 | let opts = NativeEndian::read_u16(buf); |
901 | buf = &buf[2..]; |
902 | |
903 | // read start state |
904 | let start = S::from_usize(NativeEndian::read_u64(buf) as usize); |
905 | buf = &buf[8..]; |
906 | |
907 | // read state count |
908 | let state_count = NativeEndian::read_u64(buf) as usize; |
909 | buf = &buf[8..]; |
910 | |
911 | // read max match state |
912 | let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); |
913 | buf = &buf[8..]; |
914 | |
915 | // read byte classes |
916 | let byte_classes = ByteClasses::from_slice(&buf[..256]); |
917 | buf = &buf[256..]; |
918 | |
919 | Repr { |
920 | anchored: opts & dense::MASK_ANCHORED > 0, |
921 | start, |
922 | state_count, |
923 | max_match, |
924 | byte_classes, |
925 | trans: buf, |
926 | } |
927 | } |
928 | } |
929 | |
930 | #[cfg (feature = "std" )] |
931 | impl<S: StateID> Repr<Vec<u8>, S> { |
932 | /// The implementation for constructing a sparse DFA from a dense DFA. |
933 | fn from_dense_sized<T: AsRef<[S]>, A: StateID>( |
934 | dfa: &dense::Repr<T, S>, |
935 | ) -> Result<Repr<Vec<u8>, A>> { |
936 | // In order to build the transition table, we need to be able to write |
937 | // state identifiers for each of the "next" transitions in each state. |
938 | // Our state identifiers correspond to the byte offset in the |
939 | // transition table at which the state is encoded. Therefore, we do not |
940 | // actually know what the state identifiers are until we've allocated |
941 | // exactly as much space as we need for each state. Thus, construction |
942 | // of the transition table happens in two passes. |
943 | // |
944 | // In the first pass, we fill out the shell of each state, which |
945 | // includes the transition count, the input byte ranges and zero-filled |
946 | // space for the transitions. In this first pass, we also build up a |
947 | // map from the state identifier index of the dense DFA to the state |
948 | // identifier in this sparse DFA. |
949 | // |
950 | // In the second pass, we fill in the transitions based on the map |
951 | // built in the first pass. |
952 | |
953 | let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count()); |
954 | let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()]; |
955 | for (old_id, state) in dfa.states() { |
956 | let pos = trans.len(); |
957 | |
958 | remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?; |
959 | // zero-filled space for the transition count |
960 | trans.push(0); |
961 | trans.push(0); |
962 | |
963 | let mut trans_count = 0; |
964 | for (b1, b2, _) in state.sparse_transitions() { |
965 | trans_count += 1; |
966 | trans.push(b1); |
967 | trans.push(b2); |
968 | } |
969 | // fill in the transition count |
970 | NativeEndian::write_u16(&mut trans[pos..], trans_count); |
971 | |
972 | // zero-fill the actual transitions |
973 | let zeros = trans_count as usize * size_of::<A>(); |
974 | trans.extend(iter::repeat(0).take(zeros)); |
975 | } |
976 | |
977 | let mut new = Repr { |
978 | anchored: dfa.is_anchored(), |
979 | start: remap[dfa.state_id_to_index(dfa.start_state())], |
980 | state_count: dfa.state_count(), |
981 | max_match: remap[dfa.state_id_to_index(dfa.max_match_state())], |
982 | byte_classes: dfa.byte_classes().clone(), |
983 | trans, |
984 | }; |
985 | for (old_id, old_state) in dfa.states() { |
986 | let new_id = remap[dfa.state_id_to_index(old_id)]; |
987 | let mut new_state = new.state_mut(new_id); |
988 | let sparse = old_state.sparse_transitions(); |
989 | for (i, (_, _, next)) in sparse.enumerate() { |
990 | let next = remap[dfa.state_id_to_index(next)]; |
991 | new_state.set_next_at(i, next); |
992 | } |
993 | } |
994 | Ok(new) |
995 | } |
996 | |
997 | /// Return a convenient mutable representation of the given state. |
998 | fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> { |
999 | let mut pos = id.to_usize(); |
1000 | let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize; |
1001 | pos += 2; |
1002 | |
1003 | let size = (ntrans * 2) + (ntrans * size_of::<S>()); |
1004 | let ranges_and_next = &mut self.trans[pos..pos + size]; |
1005 | let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2); |
1006 | StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next } |
1007 | } |
1008 | } |
1009 | |
1010 | #[cfg (feature = "std" )] |
1011 | impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> { |
1012 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1013 | fn state_status<T: AsRef<[u8]>, S: StateID>( |
1014 | dfa: &Repr<T, S>, |
1015 | id: S, |
1016 | ) -> &'static str { |
1017 | if id == dead_id() { |
1018 | if dfa.is_match_state(id) { |
1019 | "D*" |
1020 | } else { |
1021 | "D " |
1022 | } |
1023 | } else if id == dfa.start_state() { |
1024 | if dfa.is_match_state(id) { |
1025 | ">*" |
1026 | } else { |
1027 | "> " |
1028 | } |
1029 | } else { |
1030 | if dfa.is_match_state(id) { |
1031 | " *" |
1032 | } else { |
1033 | " " |
1034 | } |
1035 | } |
1036 | } |
1037 | |
1038 | writeln!(f, "SparseDFA(" )?; |
1039 | for (id, state) in self.states() { |
1040 | let status = state_status(self, id); |
1041 | writeln!(f, "{}{:06}: {:?}" , status, id.to_usize(), state)?; |
1042 | } |
1043 | writeln!(f, ")" )?; |
1044 | Ok(()) |
1045 | } |
1046 | } |
1047 | |
1048 | /// An iterator over all states in a sparse DFA. |
1049 | /// |
1050 | /// This iterator yields tuples, where the first element is the state ID and |
1051 | /// the second element is the state itself. |
1052 | #[cfg (feature = "std" )] |
1053 | #[derive(Debug)] |
1054 | struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> { |
1055 | dfa: &'a Repr<T, S>, |
1056 | id: S, |
1057 | } |
1058 | |
1059 | #[cfg (feature = "std" )] |
1060 | impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> { |
1061 | type Item = (S, State<'a, S>); |
1062 | |
1063 | fn next(&mut self) -> Option<(S, State<'a, S>)> { |
1064 | if self.id.to_usize() >= self.dfa.trans().len() { |
1065 | return None; |
1066 | } |
1067 | let id = self.id; |
1068 | let state = self.dfa.state(id); |
1069 | self.id = S::from_usize(self.id.to_usize() + state.bytes()); |
1070 | Some((id, state)) |
1071 | } |
1072 | } |
1073 | |
1074 | /// A representation of a sparse DFA state that can be cheaply materialized |
1075 | /// from a state identifier. |
1076 | #[derive(Clone)] |
1077 | struct State<'a, S: StateID = usize> { |
1078 | /// The state identifier representation used by the DFA from which this |
1079 | /// state was extracted. Since our transition table is compacted in a |
1080 | /// &[u8], we don't actually use the state ID type parameter explicitly |
1081 | /// anywhere, so we fake it. This prevents callers from using an incorrect |
1082 | /// state ID representation to read from this state. |
1083 | _state_id_repr: PhantomData<S>, |
1084 | /// The number of transitions in this state. |
1085 | ntrans: usize, |
1086 | /// Pairs of input ranges, where there is one pair for each transition. |
1087 | /// Each pair specifies an inclusive start and end byte range for the |
1088 | /// corresponding transition. |
1089 | input_ranges: &'a [u8], |
1090 | /// Transitions to the next state. This slice contains native endian |
1091 | /// encoded state identifiers, with `S` as the representation. Thus, there |
1092 | /// are `ntrans * size_of::<S>()` bytes in this slice. |
1093 | next: &'a [u8], |
1094 | } |
1095 | |
1096 | impl<'a, S: StateID> State<'a, S> { |
1097 | /// Searches for the next transition given an input byte. If no such |
1098 | /// transition could be found, then a dead state is returned. |
1099 | fn next(&self, input: u8) -> S { |
1100 | // This straight linear search was observed to be much better than |
1101 | // binary search on ASCII haystacks, likely because a binary search |
1102 | // visits the ASCII case last but a linear search sees it first. A |
1103 | // binary search does do a little better on non-ASCII haystacks, but |
1104 | // not by much. There might be a better trade off lurking here. |
1105 | for i in 0..self.ntrans { |
1106 | let (start, end) = self.range(i); |
1107 | if start <= input && input <= end { |
1108 | return self.next_at(i); |
1109 | } |
1110 | // We could bail early with an extra branch: if input < b1, then |
1111 | // we know we'll never find a matching transition. Interestingly, |
1112 | // this extra branch seems to not help performance, or will even |
1113 | // hurt it. It's likely very dependent on the DFA itself and what |
1114 | // is being searched. |
1115 | } |
1116 | dead_id() |
1117 | } |
1118 | |
1119 | /// Returns the inclusive input byte range for the ith transition in this |
1120 | /// state. |
1121 | fn range(&self, i: usize) -> (u8, u8) { |
1122 | (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) |
1123 | } |
1124 | |
1125 | /// Returns the next state for the ith transition in this state. |
1126 | fn next_at(&self, i: usize) -> S { |
1127 | S::read_bytes(&self.next[i * size_of::<S>()..]) |
1128 | } |
1129 | |
1130 | /// Return the total number of bytes that this state consumes in its |
1131 | /// encoded form. |
1132 | #[cfg (feature = "std" )] |
1133 | fn bytes(&self) -> usize { |
1134 | 2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>()) |
1135 | } |
1136 | } |
1137 | |
1138 | #[cfg (feature = "std" )] |
1139 | impl<'a, S: StateID> fmt::Debug for State<'a, S> { |
1140 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1141 | let mut transitions = vec![]; |
1142 | for i in 0..self.ntrans { |
1143 | let next = self.next_at(i); |
1144 | if next == dead_id() { |
1145 | continue; |
1146 | } |
1147 | |
1148 | let (start, end) = self.range(i); |
1149 | if start == end { |
1150 | transitions.push(format!( |
1151 | "{} => {}" , |
1152 | escape(start), |
1153 | next.to_usize() |
1154 | )); |
1155 | } else { |
1156 | transitions.push(format!( |
1157 | "{}-{} => {}" , |
1158 | escape(start), |
1159 | escape(end), |
1160 | next.to_usize(), |
1161 | )); |
1162 | } |
1163 | } |
1164 | write!(f, "{}" , transitions.join(", " )) |
1165 | } |
1166 | } |
1167 | |
1168 | /// A representation of a mutable sparse DFA state that can be cheaply |
1169 | /// materialized from a state identifier. |
1170 | #[cfg (feature = "std" )] |
1171 | struct StateMut<'a, S: StateID = usize> { |
1172 | /// The state identifier representation used by the DFA from which this |
1173 | /// state was extracted. Since our transition table is compacted in a |
1174 | /// &[u8], we don't actually use the state ID type parameter explicitly |
1175 | /// anywhere, so we fake it. This prevents callers from using an incorrect |
1176 | /// state ID representation to read from this state. |
1177 | _state_id_repr: PhantomData<S>, |
1178 | /// The number of transitions in this state. |
1179 | ntrans: usize, |
1180 | /// Pairs of input ranges, where there is one pair for each transition. |
1181 | /// Each pair specifies an inclusive start and end byte range for the |
1182 | /// corresponding transition. |
1183 | input_ranges: &'a mut [u8], |
1184 | /// Transitions to the next state. This slice contains native endian |
1185 | /// encoded state identifiers, with `S` as the representation. Thus, there |
1186 | /// are `ntrans * size_of::<S>()` bytes in this slice. |
1187 | next: &'a mut [u8], |
1188 | } |
1189 | |
1190 | #[cfg (feature = "std" )] |
1191 | impl<'a, S: StateID> StateMut<'a, S> { |
1192 | /// Sets the ith transition to the given state. |
1193 | fn set_next_at(&mut self, i: usize, next: S) { |
1194 | next.write_bytes(&mut self.next[i * size_of::<S>()..]); |
1195 | } |
1196 | } |
1197 | |
1198 | #[cfg (feature = "std" )] |
1199 | impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> { |
1200 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
1201 | let state = State { |
1202 | _state_id_repr: self._state_id_repr, |
1203 | ntrans: self.ntrans, |
1204 | input_ranges: self.input_ranges, |
1205 | next: self.next, |
1206 | }; |
1207 | fmt::Debug::fmt(&state, f) |
1208 | } |
1209 | } |
1210 | |
1211 | /// Return the given byte as its escaped string form. |
1212 | #[cfg (feature = "std" )] |
1213 | fn escape(b: u8) -> String { |
1214 | use std::ascii; |
1215 | |
1216 | String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap() |
1217 | } |
1218 | |
1219 | /// A binary search routine specialized specifically to a sparse DFA state's |
1220 | /// transitions. Specifically, the transitions are defined as a set of pairs |
1221 | /// of input bytes that delineate an inclusive range of bytes. If the input |
1222 | /// byte is in the range, then the corresponding transition is a match. |
1223 | /// |
1224 | /// This binary search accepts a slice of these pairs and returns the position |
1225 | /// of the matching pair (the ith transition), or None if no matching pair |
1226 | /// could be found. |
1227 | /// |
1228 | /// Note that this routine is not currently used since it was observed to |
1229 | /// either decrease performance when searching ASCII, or did not provide enough |
1230 | /// of a boost on non-ASCII haystacks to be worth it. However, we leave it here |
1231 | /// for posterity in case we can find a way to use it. |
1232 | /// |
1233 | /// In theory, we could use the standard library's search routine if we could |
1234 | /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently |
1235 | /// guaranteed to be safe and is thus UB (since I don't think the in-memory |
1236 | /// representation of `(u8, u8)` has been nailed down). |
1237 | #[inline (always)] |
1238 | #[allow (dead_code)] |
1239 | fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { |
1240 | debug_assert!(ranges.len() % 2 == 0, "ranges must have even length" ); |
1241 | debug_assert!(ranges.len() <= 512, "ranges should be short" ); |
1242 | |
1243 | let (mut left, mut right) = (0, ranges.len() / 2); |
1244 | while left < right { |
1245 | let mid = (left + right) / 2; |
1246 | let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); |
1247 | if needle < b1 { |
1248 | right = mid; |
1249 | } else if needle > b2 { |
1250 | left = mid + 1; |
1251 | } else { |
1252 | return Some(mid); |
1253 | } |
1254 | } |
1255 | None |
1256 | } |
1257 | |