1#[cfg(feature = "std")]
2use core::fmt;
3#[cfg(feature = "std")]
4use core::iter;
5use core::marker::PhantomData;
6use core::mem::size_of;
7#[cfg(feature = "std")]
8use std::collections::HashMap;
9
10#[cfg(feature = "std")]
11use byteorder::{BigEndian, LittleEndian};
12use byteorder::{ByteOrder, NativeEndian};
13
14use classes::ByteClasses;
15use dense;
16use dfa::DFA;
17#[cfg(feature = "std")]
18use error::{Error, Result};
19#[cfg(feature = "std")]
20use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID};
21#[cfg(not(feature = "std"))]
22use state_id::{dead_id, StateID};
23
24/// A sparse table-based deterministic finite automaton (DFA).
25///
26/// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a
27/// more space efficient representation for its transition table. Consequently,
28/// sparse DFAs can use much less memory than dense DFAs, but this comes at a
29/// price. In particular, reading the more space efficient transitions takes
30/// more work, and consequently, searching using a sparse DFA is typically
31/// slower than a dense DFA.
32///
33/// A sparse DFA can be built using the default configuration via the
34/// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise,
35/// one can configure various aspects of a dense DFA via
36/// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense
37/// DFA to a sparse DFA using
38/// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse).
39///
40/// In general, a sparse DFA supports all the same operations as a dense DFA.
41///
42/// Making the choice between a dense and sparse DFA depends on your specific
43/// work load. If you can sacrifice a bit of search time performance, then a
44/// sparse DFA might be the best choice. In particular, while sparse DFAs are
45/// probably always slower than dense DFAs, you may find that they are easily
46/// fast enough for your purposes!
47///
48/// # State size
49///
50/// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to
51/// the type of the DFA's transition table while `S` corresponds to the
52/// representation used for the DFA's state identifiers as described by the
53/// [`StateID`](trait.StateID.html) trait. This type parameter is typically
54/// `usize`, but other valid choices provided by this crate include `u8`,
55/// `u16`, `u32` and `u64`. The primary reason for choosing a different state
56/// identifier representation than the default is to reduce the amount of
57/// memory used by a DFA. Note though, that if the chosen representation cannot
58/// accommodate the size of your DFA, then building the DFA will fail and
59/// return an error.
60///
61/// While the reduction in heap memory used by a DFA is one reason for choosing
62/// a smaller state identifier representation, another possible reason is for
63/// decreasing the serialization size of a DFA, as returned by
64/// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian),
65/// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
66/// or
67/// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian).
68///
69/// The type of the transition table is typically either `Vec<u8>` or `&[u8]`,
70/// depending on where the transition table is stored. Note that this is
71/// different than a dense DFA, whose transition table is typically
72/// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads
73/// its transition table from raw bytes because the table is compactly packed.
74///
75/// # Variants
76///
77/// This DFA is defined as a non-exhaustive enumeration of different types of
78/// dense DFAs. All of the variants use the same internal representation
79/// for the transition table, but they vary in how the transition table is
80/// read. A DFA's specific variant depends on the configuration options set via
81/// [`dense::Builder`](dense/struct.Builder.html). The default variant is
82/// `ByteClass`.
83///
84/// # The `DFA` trait
85///
86/// This type implements the [`DFA`](trait.DFA.html) trait, which means it
87/// can be used for searching. For example:
88///
89/// ```
90/// use regex_automata::{DFA, SparseDFA};
91///
92/// # fn example() -> Result<(), regex_automata::Error> {
93/// let dfa = SparseDFA::new("foo[0-9]+")?;
94/// assert_eq!(Some(8), dfa.find(b"foo12345"));
95/// # Ok(()) }; example().unwrap()
96/// ```
97///
98/// The `DFA` trait also provides an assortment of other lower level methods
99/// for DFAs, such as `start_state` and `next_state`. While these are correctly
100/// implemented, it is an anti-pattern to use them in performance sensitive
101/// code on the `SparseDFA` type directly. Namely, each implementation requires
102/// a branch to determine which type of sparse DFA is being used. Instead,
103/// this branch should be pushed up a layer in the code since walking the
104/// transitions of a DFA is usually a hot path. If you do need to use these
105/// lower level methods in performance critical code, then you should match on
106/// the variants of this DFA and use each variant's implementation of the `DFA`
107/// trait directly.
108#[derive(Clone, Debug)]
109pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> {
110 /// A standard DFA that does not use byte classes.
111 Standard(Standard<T, S>),
112 /// A DFA that shrinks its alphabet to a set of equivalence classes instead
113 /// of using all possible byte values. Any two bytes belong to the same
114 /// equivalence class if and only if they can be used interchangeably
115 /// anywhere in the DFA while never discriminating between a match and a
116 /// non-match.
117 ///
118 /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much
119 /// from using byte classes. In some cases, using byte classes can even
120 /// marginally increase the size of a sparse DFA's transition table. The
121 /// reason for this is that a sparse DFA already compacts each state's
122 /// transitions separate from whether byte classes are used.
123 ByteClass(ByteClass<T, S>),
124 /// Hints that destructuring should not be exhaustive.
125 ///
126 /// This enum may grow additional variants, so this makes sure clients
127 /// don't count on exhaustive matching. (Otherwise, adding a new variant
128 /// could break existing code.)
129 #[doc(hidden)]
130 __Nonexhaustive,
131}
132
133#[cfg(feature = "std")]
134impl SparseDFA<Vec<u8>, usize> {
135 /// Parse the given regular expression using a default configuration and
136 /// return the corresponding sparse DFA.
137 ///
138 /// The default configuration uses `usize` for state IDs and reduces the
139 /// alphabet size by splitting bytes into equivalence classes. The
140 /// resulting DFA is *not* minimized.
141 ///
142 /// If you want a non-default configuration, then use the
143 /// [`dense::Builder`](dense/struct.Builder.html)
144 /// to set your own configuration, and then call
145 /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse)
146 /// to create a sparse DFA.
147 ///
148 /// # Example
149 ///
150 /// ```
151 /// use regex_automata::{DFA, SparseDFA};
152 ///
153 /// # fn example() -> Result<(), regex_automata::Error> {
154 /// let dfa = SparseDFA::new("foo[0-9]+bar")?;
155 /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
156 /// # Ok(()) }; example().unwrap()
157 /// ```
158 pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> {
159 dense::Builder::new()
160 .build(pattern)
161 .and_then(|dense| dense.to_sparse())
162 }
163}
164
165#[cfg(feature = "std")]
166impl<S: StateID> SparseDFA<Vec<u8>, S> {
167 /// Create a new empty sparse DFA that never matches any input.
168 ///
169 /// # Example
170 ///
171 /// In order to build an empty DFA, callers must provide a type hint
172 /// indicating their choice of state identifier representation.
173 ///
174 /// ```
175 /// use regex_automata::{DFA, SparseDFA};
176 ///
177 /// # fn example() -> Result<(), regex_automata::Error> {
178 /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty();
179 /// assert_eq!(None, dfa.find(b""));
180 /// assert_eq!(None, dfa.find(b"foo"));
181 /// # Ok(()) }; example().unwrap()
182 /// ```
183 pub fn empty() -> SparseDFA<Vec<u8>, S> {
184 dense::DenseDFA::empty().to_sparse().unwrap()
185 }
186
187 pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
188 dfa: &dense::Repr<T, S>,
189 ) -> Result<SparseDFA<Vec<u8>, A>> {
190 Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa())
191 }
192}
193
194impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
195 /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
196 /// DFA returned always uses `&[u8]` for its transition table while keeping
197 /// the same state identifier representation.
198 pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> {
199 match *self {
200 SparseDFA::Standard(Standard(ref r)) => {
201 SparseDFA::Standard(Standard(r.as_ref()))
202 }
203 SparseDFA::ByteClass(ByteClass(ref r)) => {
204 SparseDFA::ByteClass(ByteClass(r.as_ref()))
205 }
206 SparseDFA::__Nonexhaustive => unreachable!(),
207 }
208 }
209
210 /// Return an owned version of this sparse DFA. Specifically, the DFA
211 /// returned always uses `Vec<u8>` for its transition table while keeping
212 /// the same state identifier representation.
213 ///
214 /// Effectively, this returns a sparse DFA whose transition table lives
215 /// on the heap.
216 #[cfg(feature = "std")]
217 pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> {
218 match *self {
219 SparseDFA::Standard(Standard(ref r)) => {
220 SparseDFA::Standard(Standard(r.to_owned()))
221 }
222 SparseDFA::ByteClass(ByteClass(ref r)) => {
223 SparseDFA::ByteClass(ByteClass(r.to_owned()))
224 }
225 SparseDFA::__Nonexhaustive => unreachable!(),
226 }
227 }
228
229 /// Returns the memory usage, in bytes, of this DFA.
230 ///
231 /// The memory usage is computed based on the number of bytes used to
232 /// represent this DFA's transition table. This typically corresponds to
233 /// heap memory usage.
234 ///
235 /// This does **not** include the stack size used up by this DFA. To
236 /// compute that, used `std::mem::size_of::<SparseDFA>()`.
237 pub fn memory_usage(&self) -> usize {
238 self.repr().memory_usage()
239 }
240
241 fn repr(&self) -> &Repr<T, S> {
242 match *self {
243 SparseDFA::Standard(ref r) => &r.0,
244 SparseDFA::ByteClass(ref r) => &r.0,
245 SparseDFA::__Nonexhaustive => unreachable!(),
246 }
247 }
248}
249
250/// Routines for converting a sparse DFA to other representations, such as
251/// smaller state identifiers or raw bytes suitable for persistent storage.
252#[cfg(feature = "std")]
253impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
254 /// Create a new sparse DFA whose match semantics are equivalent to
255 /// this DFA, but attempt to use `u8` for the representation of state
256 /// identifiers. If `u8` is insufficient to represent all state identifiers
257 /// in this DFA, then this returns an error.
258 ///
259 /// This is a convenience routine for `to_sized::<u8>()`.
260 pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> {
261 self.to_sized()
262 }
263
264 /// Create a new sparse DFA whose match semantics are equivalent to
265 /// this DFA, but attempt to use `u16` for the representation of state
266 /// identifiers. If `u16` is insufficient to represent all state
267 /// identifiers in this DFA, then this returns an error.
268 ///
269 /// This is a convenience routine for `to_sized::<u16>()`.
270 pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> {
271 self.to_sized()
272 }
273
274 /// Create a new sparse DFA whose match semantics are equivalent to
275 /// this DFA, but attempt to use `u32` for the representation of state
276 /// identifiers. If `u32` is insufficient to represent all state
277 /// identifiers in this DFA, then this returns an error.
278 ///
279 /// This is a convenience routine for `to_sized::<u32>()`.
280 #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
281 pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> {
282 self.to_sized()
283 }
284
285 /// Create a new sparse DFA whose match semantics are equivalent to
286 /// this DFA, but attempt to use `u64` for the representation of state
287 /// identifiers. If `u64` is insufficient to represent all state
288 /// identifiers in this DFA, then this returns an error.
289 ///
290 /// This is a convenience routine for `to_sized::<u64>()`.
291 #[cfg(target_pointer_width = "64")]
292 pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> {
293 self.to_sized()
294 }
295
296 /// Create a new sparse DFA whose match semantics are equivalent to
297 /// this DFA, but attempt to use `A` for the representation of state
298 /// identifiers. If `A` is insufficient to represent all state identifiers
299 /// in this DFA, then this returns an error.
300 ///
301 /// An alternative way to construct such a DFA is to use
302 /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized).
303 /// In general, picking the appropriate size upon initial construction of
304 /// a sparse DFA is preferred, since it will do the conversion in one
305 /// step instead of two.
306 pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> {
307 self.repr().to_sized().map(|r| r.into_sparse_dfa())
308 }
309
310 /// Serialize a sparse DFA to raw bytes in little endian format.
311 ///
312 /// If the state identifier representation of this DFA has a size different
313 /// than 1, 2, 4 or 8 bytes, then this returns an error. All
314 /// implementations of `StateID` provided by this crate satisfy this
315 /// requirement.
316 pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
317 self.repr().to_bytes::<LittleEndian>()
318 }
319
320 /// Serialize a sparse DFA to raw bytes in big endian format.
321 ///
322 /// If the state identifier representation of this DFA has a size different
323 /// than 1, 2, 4 or 8 bytes, then this returns an error. All
324 /// implementations of `StateID` provided by this crate satisfy this
325 /// requirement.
326 pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
327 self.repr().to_bytes::<BigEndian>()
328 }
329
330 /// Serialize a sparse DFA to raw bytes in native endian format.
331 /// Generally, it is better to pick an explicit endianness using either
332 /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is
333 /// useful in tests where the DFA is serialized and deserialized on the
334 /// same platform.
335 ///
336 /// If the state identifier representation of this DFA has a size different
337 /// than 1, 2, 4 or 8 bytes, then this returns an error. All
338 /// implementations of `StateID` provided by this crate satisfy this
339 /// requirement.
340 pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
341 self.repr().to_bytes::<NativeEndian>()
342 }
343}
344
345impl<'a, S: StateID> SparseDFA<&'a [u8], S> {
346 /// Deserialize a sparse DFA with a specific state identifier
347 /// representation.
348 ///
349 /// Deserializing a DFA using this routine will never allocate heap memory.
350 /// This is also guaranteed to be a constant time operation that does not
351 /// vary with the size of the DFA.
352 ///
353 /// The bytes given should be generated by the serialization of a DFA with
354 /// either the
355 /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
356 /// method or the
357 /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
358 /// endian, depending on the endianness of the machine you are
359 /// deserializing this DFA from.
360 ///
361 /// If the state identifier representation is `usize`, then deserialization
362 /// is dependent on the pointer size. For this reason, it is best to
363 /// serialize DFAs using a fixed size representation for your state
364 /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
365 ///
366 /// # Panics
367 ///
368 /// The bytes given should be *trusted*. In particular, if the bytes
369 /// are not a valid serialization of a DFA, or if the endianness of the
370 /// serialized bytes is different than the endianness of the machine that
371 /// is deserializing the DFA, then this routine will panic. Moreover, it
372 /// is possible for this deserialization routine to succeed even if the
373 /// given bytes do not represent a valid serialized sparse DFA.
374 ///
375 /// # Safety
376 ///
377 /// This routine is unsafe because it permits callers to provide an
378 /// arbitrary transition table with possibly incorrect transitions. While
379 /// the various serialization routines will never return an incorrect
380 /// transition table, there is no guarantee that the bytes provided here
381 /// are correct. While deserialization does many checks (as documented
382 /// above in the panic conditions), this routine does not check that the
383 /// transition table is correct. Given an incorrect transition table, it is
384 /// possible for the search routines to access out-of-bounds memory because
385 /// of explicit bounds check elision.
386 ///
387 /// # Example
388 ///
389 /// This example shows how to serialize a DFA to raw bytes, deserialize it
390 /// and then use it for searching. Note that we first convert the DFA to
391 /// using `u16` for its state identifier representation before serializing
392 /// it. While this isn't strictly necessary, it's good practice in order to
393 /// decrease the size of the DFA and to avoid platform specific pitfalls
394 /// such as differing pointer sizes.
395 ///
396 /// ```
397 /// use regex_automata::{DFA, DenseDFA, SparseDFA};
398 ///
399 /// # fn example() -> Result<(), regex_automata::Error> {
400 /// let sparse = SparseDFA::new("foo[0-9]+")?;
401 /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?;
402 ///
403 /// let dfa: SparseDFA<&[u8], u16> = unsafe {
404 /// SparseDFA::from_bytes(&bytes)
405 /// };
406 ///
407 /// assert_eq!(Some(8), dfa.find(b"foo12345"));
408 /// # Ok(()) }; example().unwrap()
409 /// ```
410 pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> {
411 Repr::from_bytes(buf).into_sparse_dfa()
412 }
413}
414
415impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> {
416 type ID = S;
417
418 #[inline]
419 fn start_state(&self) -> S {
420 self.repr().start_state()
421 }
422
423 #[inline]
424 fn is_match_state(&self, id: S) -> bool {
425 self.repr().is_match_state(id)
426 }
427
428 #[inline]
429 fn is_dead_state(&self, id: S) -> bool {
430 self.repr().is_dead_state(id)
431 }
432
433 #[inline]
434 fn is_match_or_dead_state(&self, id: S) -> bool {
435 self.repr().is_match_or_dead_state(id)
436 }
437
438 #[inline]
439 fn is_anchored(&self) -> bool {
440 self.repr().is_anchored()
441 }
442
443 #[inline]
444 fn next_state(&self, current: S, input: u8) -> S {
445 match *self {
446 SparseDFA::Standard(ref r) => r.next_state(current, input),
447 SparseDFA::ByteClass(ref r) => r.next_state(current, input),
448 SparseDFA::__Nonexhaustive => unreachable!(),
449 }
450 }
451
452 #[inline]
453 unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
454 self.next_state(current, input)
455 }
456
457 // We specialize the following methods because it lets us lift the
458 // case analysis between the different types of sparse DFAs. Instead of
459 // doing the case analysis for every transition, we do it once before
460 // searching. For sparse DFAs, this doesn't seem to benefit performance as
461 // much as it does for the dense DFAs, but it's easy to do so we might as
462 // well do it.
463
464 #[inline]
465 fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
466 match *self {
467 SparseDFA::Standard(ref r) => r.is_match_at(bytes, start),
468 SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
469 SparseDFA::__Nonexhaustive => unreachable!(),
470 }
471 }
472
473 #[inline]
474 fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
475 match *self {
476 SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
477 SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
478 SparseDFA::__Nonexhaustive => unreachable!(),
479 }
480 }
481
482 #[inline]
483 fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
484 match *self {
485 SparseDFA::Standard(ref r) => r.find_at(bytes, start),
486 SparseDFA::ByteClass(ref r) => r.find_at(bytes, start),
487 SparseDFA::__Nonexhaustive => unreachable!(),
488 }
489 }
490
491 #[inline]
492 fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
493 match *self {
494 SparseDFA::Standard(ref r) => r.rfind_at(bytes, start),
495 SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
496 SparseDFA::__Nonexhaustive => unreachable!(),
497 }
498 }
499}
500
501/// A standard sparse DFA that does not use premultiplication or byte classes.
502///
503/// Generally, it isn't necessary to use this type directly, since a
504/// `SparseDFA` can be used for searching directly. One possible reason why
505/// one might want to use this type directly is if you are implementing your
506/// own search routines by walking a DFA's transitions directly. In that case,
507/// you'll want to use this type (or any of the other DFA variant types)
508/// directly, since they implement `next_state` more efficiently.
509#[derive(Clone, Debug)]
510pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
511
512impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> {
513 type ID = S;
514
515 #[inline]
516 fn start_state(&self) -> S {
517 self.0.start_state()
518 }
519
520 #[inline]
521 fn is_match_state(&self, id: S) -> bool {
522 self.0.is_match_state(id)
523 }
524
525 #[inline]
526 fn is_dead_state(&self, id: S) -> bool {
527 self.0.is_dead_state(id)
528 }
529
530 #[inline]
531 fn is_match_or_dead_state(&self, id: S) -> bool {
532 self.0.is_match_or_dead_state(id)
533 }
534
535 #[inline]
536 fn is_anchored(&self) -> bool {
537 self.0.is_anchored()
538 }
539
540 #[inline]
541 fn next_state(&self, current: S, input: u8) -> S {
542 self.0.state(current).next(input)
543 }
544
545 #[inline]
546 unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
547 self.next_state(current, input)
548 }
549}
550
551/// A sparse DFA that shrinks its alphabet.
552///
553/// Alphabet shrinking is achieved by using a set of equivalence classes
554/// instead of using all possible byte values. Any two bytes belong to the same
555/// equivalence class if and only if they can be used interchangeably anywhere
556/// in the DFA while never discriminating between a match and a non-match.
557///
558/// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from
559/// using byte classes. In some cases, using byte classes can even marginally
560/// increase the size of a sparse DFA's transition table. The reason for this
561/// is that a sparse DFA already compacts each state's transitions separate
562/// from whether byte classes are used.
563///
564/// Generally, it isn't necessary to use this type directly, since a
565/// `SparseDFA` can be used for searching directly. One possible reason why
566/// one might want to use this type directly is if you are implementing your
567/// own search routines by walking a DFA's transitions directly. In that case,
568/// you'll want to use this type (or any of the other DFA variant types)
569/// directly, since they implement `next_state` more efficiently.
570#[derive(Clone, Debug)]
571pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
572
573impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> {
574 type ID = S;
575
576 #[inline]
577 fn start_state(&self) -> S {
578 self.0.start_state()
579 }
580
581 #[inline]
582 fn is_match_state(&self, id: S) -> bool {
583 self.0.is_match_state(id)
584 }
585
586 #[inline]
587 fn is_dead_state(&self, id: S) -> bool {
588 self.0.is_dead_state(id)
589 }
590
591 #[inline]
592 fn is_match_or_dead_state(&self, id: S) -> bool {
593 self.0.is_match_or_dead_state(id)
594 }
595
596 #[inline]
597 fn is_anchored(&self) -> bool {
598 self.0.is_anchored()
599 }
600
601 #[inline]
602 fn next_state(&self, current: S, input: u8) -> S {
603 let input = self.0.byte_classes.get(input);
604 self.0.state(current).next(input)
605 }
606
607 #[inline]
608 unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
609 self.next_state(current, input)
610 }
611}
612
613/// The underlying representation of a sparse DFA. This is shared by all of
614/// the different variants of a sparse DFA.
615#[derive(Clone)]
616#[cfg_attr(not(feature = "std"), derive(Debug))]
617struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
618 anchored: bool,
619 start: S,
620 state_count: usize,
621 max_match: S,
622 byte_classes: ByteClasses,
623 trans: T,
624}
625
626impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
627 fn into_sparse_dfa(self) -> SparseDFA<T, S> {
628 if self.byte_classes.is_singleton() {
629 SparseDFA::Standard(Standard(self))
630 } else {
631 SparseDFA::ByteClass(ByteClass(self))
632 }
633 }
634
635 fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
636 Repr {
637 anchored: self.anchored,
638 start: self.start,
639 state_count: self.state_count,
640 max_match: self.max_match,
641 byte_classes: self.byte_classes.clone(),
642 trans: self.trans(),
643 }
644 }
645
646 #[cfg(feature = "std")]
647 fn to_owned(&self) -> Repr<Vec<u8>, S> {
648 Repr {
649 anchored: self.anchored,
650 start: self.start,
651 state_count: self.state_count,
652 max_match: self.max_match,
653 byte_classes: self.byte_classes.clone(),
654 trans: self.trans().to_vec(),
655 }
656 }
657
658 /// Return a convenient representation of the given state.
659 ///
660 /// This is marked as inline because it doesn't seem to get inlined
661 /// otherwise, which leads to a fairly significant performance loss (~25%).
662 #[inline]
663 fn state<'a>(&'a self, id: S) -> State<'a, S> {
664 let mut pos = id.to_usize();
665 let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize;
666 pos += 2;
667 let input_ranges = &self.trans()[pos..pos + (ntrans * 2)];
668 pos += 2 * ntrans;
669 let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())];
670 State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
671 }
672
673 /// Return an iterator over all of the states in this DFA.
674 ///
675 /// The iterator returned yields tuples, where the first element is the
676 /// state ID and the second element is the state itself.
677 #[cfg(feature = "std")]
678 fn states<'a>(&'a self) -> StateIter<'a, T, S> {
679 StateIter { dfa: self, id: dead_id() }
680 }
681
682 fn memory_usage(&self) -> usize {
683 self.trans().len()
684 }
685
686 fn start_state(&self) -> S {
687 self.start
688 }
689
690 fn is_match_state(&self, id: S) -> bool {
691 self.is_match_or_dead_state(id) && !self.is_dead_state(id)
692 }
693
694 fn is_dead_state(&self, id: S) -> bool {
695 id == dead_id()
696 }
697
698 fn is_match_or_dead_state(&self, id: S) -> bool {
699 id <= self.max_match
700 }
701
702 fn is_anchored(&self) -> bool {
703 self.anchored
704 }
705
706 fn trans(&self) -> &[u8] {
707 self.trans.as_ref()
708 }
709
710 /// Create a new sparse DFA whose match semantics are equivalent to this
711 /// DFA, but attempt to use `A` for the representation of state
712 /// identifiers. If `A` is insufficient to represent all state identifiers
713 /// in this DFA, then this returns an error.
714 #[cfg(feature = "std")]
715 fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> {
716 // To build the new DFA, we proceed much like the initial construction
717 // of the sparse DFA. Namely, since the state ID size is changing,
718 // we don't actually know all of our state IDs until we've allocated
719 // all necessary space. So we do one pass that allocates all of the
720 // storage we need, and then another pass to fill in the transitions.
721
722 let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count);
723 let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count);
724 for (old_id, state) in self.states() {
725 let pos = trans.len();
726 map.insert(old_id, usize_to_state_id(pos)?);
727
728 let n = state.ntrans;
729 let zeros = 2 + (n * 2) + (n * size_of::<A>());
730 trans.extend(iter::repeat(0).take(zeros));
731
732 NativeEndian::write_u16(&mut trans[pos..], n as u16);
733 let (s, e) = (pos + 2, pos + 2 + (n * 2));
734 trans[s..e].copy_from_slice(state.input_ranges);
735 }
736
737 let mut new = Repr {
738 anchored: self.anchored,
739 start: map[&self.start],
740 state_count: self.state_count,
741 max_match: map[&self.max_match],
742 byte_classes: self.byte_classes.clone(),
743 trans,
744 };
745 for (&old_id, &new_id) in map.iter() {
746 let old_state = self.state(old_id);
747 let mut new_state = new.state_mut(new_id);
748 for i in 0..new_state.ntrans {
749 let next = map[&old_state.next_at(i)];
750 new_state.set_next_at(i, usize_to_state_id(next.to_usize())?);
751 }
752 }
753 new.start = map[&self.start];
754 new.max_match = map[&self.max_match];
755 Ok(new)
756 }
757
758 /// Serialize a sparse DFA to raw bytes using the provided endianness.
759 ///
760 /// If the state identifier representation of this DFA has a size different
761 /// than 1, 2, 4 or 8 bytes, then this returns an error. All
762 /// implementations of `StateID` provided by this crate satisfy this
763 /// requirement.
764 ///
765 /// Unlike dense DFAs, the result is not necessarily aligned since a
766 /// sparse DFA's transition table is always read as a sequence of bytes.
767 #[cfg(feature = "std")]
768 fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
769 let label = b"rust-regex-automata-sparse-dfa\x00";
770 let size =
771 // For human readable label.
772 label.len()
773 // endiannes check, must be equal to 0xFEFF for native endian
774 + 2
775 // For version number.
776 + 2
777 // Size of state ID representation, in bytes.
778 // Must be 1, 2, 4 or 8.
779 + 2
780 // For DFA misc options. (Currently unused.)
781 + 2
782 // For start state.
783 + 8
784 // For state count.
785 + 8
786 // For max match state.
787 + 8
788 // For byte class map.
789 + 256
790 // For transition table.
791 + self.trans().len();
792
793 let mut i = 0;
794 let mut buf = vec![0; size];
795
796 // write label
797 for &b in label {
798 buf[i] = b;
799 i += 1;
800 }
801 // endianness check
802 A::write_u16(&mut buf[i..], 0xFEFF);
803 i += 2;
804 // version number
805 A::write_u16(&mut buf[i..], 1);
806 i += 2;
807 // size of state ID
808 let state_size = size_of::<S>();
809 if ![1, 2, 4, 8].contains(&state_size) {
810 return Err(Error::serialize(&format!(
811 "state size of {} not supported, must be 1, 2, 4 or 8",
812 state_size
813 )));
814 }
815 A::write_u16(&mut buf[i..], state_size as u16);
816 i += 2;
817 // DFA misc options
818 let mut options = 0u16;
819 if self.anchored {
820 options |= dense::MASK_ANCHORED;
821 }
822 A::write_u16(&mut buf[i..], options);
823 i += 2;
824 // start state
825 A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
826 i += 8;
827 // state count
828 A::write_u64(&mut buf[i..], self.state_count as u64);
829 i += 8;
830 // max match state
831 A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
832 i += 8;
833 // byte class map
834 for b in (0..256).map(|b| b as u8) {
835 buf[i] = self.byte_classes.get(b);
836 i += 1;
837 }
838 // transition table
839 for (_, state) in self.states() {
840 A::write_u16(&mut buf[i..], state.ntrans as u16);
841 i += 2;
842 buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges);
843 i += state.ntrans * 2;
844 for j in 0..state.ntrans {
845 write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j));
846 i += size_of::<S>();
847 }
848 }
849
850 assert_eq!(size, i, "expected to consume entire buffer");
851
852 Ok(buf)
853 }
854}
855
856impl<'a, S: StateID> Repr<&'a [u8], S> {
857 /// The implementation for deserializing a sparse DFA from raw bytes.
858 unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> {
859 // skip over label
860 match buf.iter().position(|&b| b == b'\x00') {
861 None => panic!("could not find label"),
862 Some(i) => buf = &buf[i + 1..],
863 }
864
865 // check that current endianness is same as endianness of DFA
866 let endian_check = NativeEndian::read_u16(buf);
867 buf = &buf[2..];
868 if endian_check != 0xFEFF {
869 panic!(
870 "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
871 are you trying to load a SparseDFA serialized with a \
872 different endianness?",
873 endian_check,
874 );
875 }
876
877 // check that the version number is supported
878 let version = NativeEndian::read_u16(buf);
879 buf = &buf[2..];
880 if version != 1 {
881 panic!(
882 "expected version 1, but found unsupported version {}",
883 version,
884 );
885 }
886
887 // read size of state
888 let state_size = NativeEndian::read_u16(buf) as usize;
889 if state_size != size_of::<S>() {
890 panic!(
891 "state size of SparseDFA ({}) does not match \
892 requested state size ({})",
893 state_size,
894 size_of::<S>(),
895 );
896 }
897 buf = &buf[2..];
898
899 // read miscellaneous options
900 let opts = NativeEndian::read_u16(buf);
901 buf = &buf[2..];
902
903 // read start state
904 let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
905 buf = &buf[8..];
906
907 // read state count
908 let state_count = NativeEndian::read_u64(buf) as usize;
909 buf = &buf[8..];
910
911 // read max match state
912 let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
913 buf = &buf[8..];
914
915 // read byte classes
916 let byte_classes = ByteClasses::from_slice(&buf[..256]);
917 buf = &buf[256..];
918
919 Repr {
920 anchored: opts & dense::MASK_ANCHORED > 0,
921 start,
922 state_count,
923 max_match,
924 byte_classes,
925 trans: buf,
926 }
927 }
928}
929
930#[cfg(feature = "std")]
931impl<S: StateID> Repr<Vec<u8>, S> {
932 /// The implementation for constructing a sparse DFA from a dense DFA.
933 fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
934 dfa: &dense::Repr<T, S>,
935 ) -> Result<Repr<Vec<u8>, A>> {
936 // In order to build the transition table, we need to be able to write
937 // state identifiers for each of the "next" transitions in each state.
938 // Our state identifiers correspond to the byte offset in the
939 // transition table at which the state is encoded. Therefore, we do not
940 // actually know what the state identifiers are until we've allocated
941 // exactly as much space as we need for each state. Thus, construction
942 // of the transition table happens in two passes.
943 //
944 // In the first pass, we fill out the shell of each state, which
945 // includes the transition count, the input byte ranges and zero-filled
946 // space for the transitions. In this first pass, we also build up a
947 // map from the state identifier index of the dense DFA to the state
948 // identifier in this sparse DFA.
949 //
950 // In the second pass, we fill in the transitions based on the map
951 // built in the first pass.
952
953 let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
954 let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
955 for (old_id, state) in dfa.states() {
956 let pos = trans.len();
957
958 remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
959 // zero-filled space for the transition count
960 trans.push(0);
961 trans.push(0);
962
963 let mut trans_count = 0;
964 for (b1, b2, _) in state.sparse_transitions() {
965 trans_count += 1;
966 trans.push(b1);
967 trans.push(b2);
968 }
969 // fill in the transition count
970 NativeEndian::write_u16(&mut trans[pos..], trans_count);
971
972 // zero-fill the actual transitions
973 let zeros = trans_count as usize * size_of::<A>();
974 trans.extend(iter::repeat(0).take(zeros));
975 }
976
977 let mut new = Repr {
978 anchored: dfa.is_anchored(),
979 start: remap[dfa.state_id_to_index(dfa.start_state())],
980 state_count: dfa.state_count(),
981 max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
982 byte_classes: dfa.byte_classes().clone(),
983 trans,
984 };
985 for (old_id, old_state) in dfa.states() {
986 let new_id = remap[dfa.state_id_to_index(old_id)];
987 let mut new_state = new.state_mut(new_id);
988 let sparse = old_state.sparse_transitions();
989 for (i, (_, _, next)) in sparse.enumerate() {
990 let next = remap[dfa.state_id_to_index(next)];
991 new_state.set_next_at(i, next);
992 }
993 }
994 Ok(new)
995 }
996
997 /// Return a convenient mutable representation of the given state.
998 fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
999 let mut pos = id.to_usize();
1000 let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize;
1001 pos += 2;
1002
1003 let size = (ntrans * 2) + (ntrans * size_of::<S>());
1004 let ranges_and_next = &mut self.trans[pos..pos + size];
1005 let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2);
1006 StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
1007 }
1008}
1009
1010#[cfg(feature = "std")]
1011impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
1012 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1013 fn state_status<T: AsRef<[u8]>, S: StateID>(
1014 dfa: &Repr<T, S>,
1015 id: S,
1016 ) -> &'static str {
1017 if id == dead_id() {
1018 if dfa.is_match_state(id) {
1019 "D*"
1020 } else {
1021 "D "
1022 }
1023 } else if id == dfa.start_state() {
1024 if dfa.is_match_state(id) {
1025 ">*"
1026 } else {
1027 "> "
1028 }
1029 } else {
1030 if dfa.is_match_state(id) {
1031 " *"
1032 } else {
1033 " "
1034 }
1035 }
1036 }
1037
1038 writeln!(f, "SparseDFA(")?;
1039 for (id, state) in self.states() {
1040 let status = state_status(self, id);
1041 writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
1042 }
1043 writeln!(f, ")")?;
1044 Ok(())
1045 }
1046}
1047
1048/// An iterator over all states in a sparse DFA.
1049///
1050/// This iterator yields tuples, where the first element is the state ID and
1051/// the second element is the state itself.
1052#[cfg(feature = "std")]
1053#[derive(Debug)]
1054struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
1055 dfa: &'a Repr<T, S>,
1056 id: S,
1057}
1058
1059#[cfg(feature = "std")]
1060impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
1061 type Item = (S, State<'a, S>);
1062
1063 fn next(&mut self) -> Option<(S, State<'a, S>)> {
1064 if self.id.to_usize() >= self.dfa.trans().len() {
1065 return None;
1066 }
1067 let id = self.id;
1068 let state = self.dfa.state(id);
1069 self.id = S::from_usize(self.id.to_usize() + state.bytes());
1070 Some((id, state))
1071 }
1072}
1073
1074/// A representation of a sparse DFA state that can be cheaply materialized
1075/// from a state identifier.
1076#[derive(Clone)]
1077struct State<'a, S: StateID = usize> {
1078 /// The state identifier representation used by the DFA from which this
1079 /// state was extracted. Since our transition table is compacted in a
1080 /// &[u8], we don't actually use the state ID type parameter explicitly
1081 /// anywhere, so we fake it. This prevents callers from using an incorrect
1082 /// state ID representation to read from this state.
1083 _state_id_repr: PhantomData<S>,
1084 /// The number of transitions in this state.
1085 ntrans: usize,
1086 /// Pairs of input ranges, where there is one pair for each transition.
1087 /// Each pair specifies an inclusive start and end byte range for the
1088 /// corresponding transition.
1089 input_ranges: &'a [u8],
1090 /// Transitions to the next state. This slice contains native endian
1091 /// encoded state identifiers, with `S` as the representation. Thus, there
1092 /// are `ntrans * size_of::<S>()` bytes in this slice.
1093 next: &'a [u8],
1094}
1095
1096impl<'a, S: StateID> State<'a, S> {
1097 /// Searches for the next transition given an input byte. If no such
1098 /// transition could be found, then a dead state is returned.
1099 fn next(&self, input: u8) -> S {
1100 // This straight linear search was observed to be much better than
1101 // binary search on ASCII haystacks, likely because a binary search
1102 // visits the ASCII case last but a linear search sees it first. A
1103 // binary search does do a little better on non-ASCII haystacks, but
1104 // not by much. There might be a better trade off lurking here.
1105 for i in 0..self.ntrans {
1106 let (start, end) = self.range(i);
1107 if start <= input && input <= end {
1108 return self.next_at(i);
1109 }
1110 // We could bail early with an extra branch: if input < b1, then
1111 // we know we'll never find a matching transition. Interestingly,
1112 // this extra branch seems to not help performance, or will even
1113 // hurt it. It's likely very dependent on the DFA itself and what
1114 // is being searched.
1115 }
1116 dead_id()
1117 }
1118
1119 /// Returns the inclusive input byte range for the ith transition in this
1120 /// state.
1121 fn range(&self, i: usize) -> (u8, u8) {
1122 (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
1123 }
1124
1125 /// Returns the next state for the ith transition in this state.
1126 fn next_at(&self, i: usize) -> S {
1127 S::read_bytes(&self.next[i * size_of::<S>()..])
1128 }
1129
1130 /// Return the total number of bytes that this state consumes in its
1131 /// encoded form.
1132 #[cfg(feature = "std")]
1133 fn bytes(&self) -> usize {
1134 2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>())
1135 }
1136}
1137
1138#[cfg(feature = "std")]
1139impl<'a, S: StateID> fmt::Debug for State<'a, S> {
1140 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1141 let mut transitions = vec![];
1142 for i in 0..self.ntrans {
1143 let next = self.next_at(i);
1144 if next == dead_id() {
1145 continue;
1146 }
1147
1148 let (start, end) = self.range(i);
1149 if start == end {
1150 transitions.push(format!(
1151 "{} => {}",
1152 escape(start),
1153 next.to_usize()
1154 ));
1155 } else {
1156 transitions.push(format!(
1157 "{}-{} => {}",
1158 escape(start),
1159 escape(end),
1160 next.to_usize(),
1161 ));
1162 }
1163 }
1164 write!(f, "{}", transitions.join(", "))
1165 }
1166}
1167
1168/// A representation of a mutable sparse DFA state that can be cheaply
1169/// materialized from a state identifier.
1170#[cfg(feature = "std")]
1171struct StateMut<'a, S: StateID = usize> {
1172 /// The state identifier representation used by the DFA from which this
1173 /// state was extracted. Since our transition table is compacted in a
1174 /// &[u8], we don't actually use the state ID type parameter explicitly
1175 /// anywhere, so we fake it. This prevents callers from using an incorrect
1176 /// state ID representation to read from this state.
1177 _state_id_repr: PhantomData<S>,
1178 /// The number of transitions in this state.
1179 ntrans: usize,
1180 /// Pairs of input ranges, where there is one pair for each transition.
1181 /// Each pair specifies an inclusive start and end byte range for the
1182 /// corresponding transition.
1183 input_ranges: &'a mut [u8],
1184 /// Transitions to the next state. This slice contains native endian
1185 /// encoded state identifiers, with `S` as the representation. Thus, there
1186 /// are `ntrans * size_of::<S>()` bytes in this slice.
1187 next: &'a mut [u8],
1188}
1189
1190#[cfg(feature = "std")]
1191impl<'a, S: StateID> StateMut<'a, S> {
1192 /// Sets the ith transition to the given state.
1193 fn set_next_at(&mut self, i: usize, next: S) {
1194 next.write_bytes(&mut self.next[i * size_of::<S>()..]);
1195 }
1196}
1197
1198#[cfg(feature = "std")]
1199impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
1200 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1201 let state = State {
1202 _state_id_repr: self._state_id_repr,
1203 ntrans: self.ntrans,
1204 input_ranges: self.input_ranges,
1205 next: self.next,
1206 };
1207 fmt::Debug::fmt(&state, f)
1208 }
1209}
1210
1211/// Return the given byte as its escaped string form.
1212#[cfg(feature = "std")]
1213fn escape(b: u8) -> String {
1214 use std::ascii;
1215
1216 String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
1217}
1218
1219/// A binary search routine specialized specifically to a sparse DFA state's
1220/// transitions. Specifically, the transitions are defined as a set of pairs
1221/// of input bytes that delineate an inclusive range of bytes. If the input
1222/// byte is in the range, then the corresponding transition is a match.
1223///
1224/// This binary search accepts a slice of these pairs and returns the position
1225/// of the matching pair (the ith transition), or None if no matching pair
1226/// could be found.
1227///
1228/// Note that this routine is not currently used since it was observed to
1229/// either decrease performance when searching ASCII, or did not provide enough
1230/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
1231/// for posterity in case we can find a way to use it.
1232///
1233/// In theory, we could use the standard library's search routine if we could
1234/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
1235/// guaranteed to be safe and is thus UB (since I don't think the in-memory
1236/// representation of `(u8, u8)` has been nailed down).
1237#[inline(always)]
1238#[allow(dead_code)]
1239fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
1240 debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
1241 debug_assert!(ranges.len() <= 512, "ranges should be short");
1242
1243 let (mut left, mut right) = (0, ranges.len() / 2);
1244 while left < right {
1245 let mid = (left + right) / 2;
1246 let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
1247 if needle < b1 {
1248 right = mid;
1249 } else if needle > b2 {
1250 left = mid + 1;
1251 } else {
1252 return Some(mid);
1253 }
1254 }
1255 None
1256}
1257