1/*!
2Types and routines specific to sparse DFAs.
3
4This module is the home of [`sparse::DFA`](DFA).
5
6Unlike the [`dense`](super::dense) module, this module does not contain a
builder or configuration specific to sparse DFAs. Instead, the intended
8way to build a sparse DFA is either by using a default configuration with
9its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the
10construction of a dense DFA with [`dense::Builder`](super::dense::Builder)
11and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For
12example, this configures a sparse DFA to do an overlapping search:
13
14```
15use regex_automata::{
16 dfa::{Automaton, OverlappingState, dense},
17 HalfMatch, MatchKind,
18};
19
20let dense_re = dense::Builder::new()
21 .configure(dense::Config::new().match_kind(MatchKind::All))
22 .build(r"Samwise|Sam")?;
23let sparse_re = dense_re.to_sparse()?;
24
25// Setup our haystack and initial start state.
26let haystack = b"Samwise";
27let mut state = OverlappingState::start();
28
29// First, 'Sam' will match.
30let end1 = sparse_re.find_overlapping_fwd_at(
31 None, None, haystack, 0, haystack.len(), &mut state,
32)?;
33assert_eq!(end1, Some(HalfMatch::must(0, 3)));
34
35// And now 'Samwise' will match.
36let end2 = sparse_re.find_overlapping_fwd_at(
37 None, None, haystack, 3, haystack.len(), &mut state,
38)?;
39assert_eq!(end2, Some(HalfMatch::must(0, 7)));
40# Ok::<(), Box<dyn std::error::Error>>(())
41```
42*/
43
44#[cfg(feature = "alloc")]
45use core::iter;
46use core::{
47 convert::{TryFrom, TryInto},
48 fmt,
49 mem::size_of,
50};
51
52#[cfg(feature = "alloc")]
53use alloc::{collections::BTreeSet, vec, vec::Vec};
54
55#[cfg(feature = "alloc")]
56use crate::dfa::{dense, error::Error};
57use crate::{
58 dfa::{
59 automaton::{fmt_state_indicator, Automaton},
60 special::Special,
61 DEAD,
62 },
63 util::{
64 alphabet::ByteClasses,
65 bytes::{self, DeserializeError, Endian, SerializeError},
66 id::{PatternID, StateID},
67 start::Start,
68 DebugByte,
69 },
70};
71
72const LABEL: &str = "rust-regex-automata-dfa-sparse";
73const VERSION: u32 = 2;
74
75/// A sparse deterministic finite automaton (DFA) with variable sized states.
76///
/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses
/// a more space-efficient representation for its transitions. Consequently,
/// sparse DFAs may use much less memory than dense DFAs, but this comes at a
/// price. In particular, reading the more space-efficient transitions takes
/// more work, and consequently, searching using a sparse DFA is typically
/// slower than searching using a dense DFA.
83///
84/// A sparse DFA can be built using the default configuration via the
85/// [`DFA::new`] constructor. Otherwise, one can configure various aspects
86/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
87/// and then convert a dense DFA to a sparse DFA using
88/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
89///
90/// In general, a sparse DFA supports all the same search operations as a dense
91/// DFA.
92///
/// Making the choice between a dense and sparse DFA depends on your specific
/// workload. If you can sacrifice a bit of search time performance, then a
95/// sparse DFA might be the best choice. In particular, while sparse DFAs are
96/// probably always slower than dense DFAs, you may find that they are easily
97/// fast enough for your purposes!
98///
99/// # Type parameters
100///
101/// A `DFA` has one type parameter, `T`, which is used to represent the parts
102/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
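///
/// For example (a minimal sketch), the owned form is what the constructors
/// build, while deserializing from raw bytes yields the borrowed form:
///
/// ```
/// use regex_automata::dfa::sparse::DFA;
///
/// // Built by a constructor: the transitions live on the heap.
/// let owned: DFA<Vec<u8>> = DFA::new("foo[0-9]+")?;
/// // Deserialized from raw bytes: the transitions borrow from `bytes`.
/// let bytes = owned.to_bytes_native_endian();
/// let borrowed: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
/// assert!(borrowed.memory_usage() > 0);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```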
103///
104/// # The `Automaton` trait
105///
106/// This type implements the [`Automaton`] trait, which means it can be used
107/// for searching. For example:
108///
109/// ```
110/// use regex_automata::{
111/// dfa::{Automaton, sparse::DFA},
112/// HalfMatch,
113/// };
114///
115/// let dfa = DFA::new("foo[0-9]+")?;
116/// let expected = HalfMatch::must(0, 8);
117/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
118/// # Ok::<(), Box<dyn std::error::Error>>(())
119/// ```
120#[derive(Clone)]
121pub struct DFA<T> {
122 // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
123 // representation-wise. In reality, it is perhaps more complicated. Namely,
124 // in a dense DFA, all information needs to be very cheaply accessible
125 // using only state IDs. In a sparse DFA however, each state uses a
126 // variable amount of space because each state encodes more information
127 // than just its transitions. Each state also includes an accelerator if
128 // one exists, along with the matching pattern IDs if the state is a match
129 // state.
130 //
131 // That is, a lot of the complexity is pushed down into how each state
132 // itself is represented.
133 trans: Transitions<T>,
134 starts: StartTable<T>,
135 special: Special,
136}
137
138#[cfg(feature = "alloc")]
139impl DFA<Vec<u8>> {
140 /// Parse the given regular expression using a default configuration and
141 /// return the corresponding sparse DFA.
142 ///
143 /// If you want a non-default configuration, then use
144 /// the [`dense::Builder`](crate::dfa::dense::Builder)
145 /// to set your own configuration, and then call
146 /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
147 /// a sparse DFA.
148 ///
149 /// # Example
150 ///
151 /// ```
152 /// use regex_automata::{
153 /// dfa::{Automaton, sparse},
154 /// HalfMatch,
155 /// };
156 ///
157 /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
158 ///
159 /// let expected = HalfMatch::must(0, 11);
160 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
161 /// # Ok::<(), Box<dyn std::error::Error>>(())
162 /// ```
163 pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> {
164 dense::Builder::new()
165 .build(pattern)
166 .and_then(|dense| dense.to_sparse())
167 }
168
169 /// Parse the given regular expressions using a default configuration and
170 /// return the corresponding multi-DFA.
171 ///
172 /// If you want a non-default configuration, then use
173 /// the [`dense::Builder`](crate::dfa::dense::Builder)
174 /// to set your own configuration, and then call
175 /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
176 /// a sparse DFA.
177 ///
178 /// # Example
179 ///
180 /// ```
181 /// use regex_automata::{
182 /// dfa::{Automaton, sparse},
183 /// HalfMatch,
184 /// };
185 ///
186 /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
187 /// let expected = HalfMatch::must(1, 3);
188 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
189 /// # Ok::<(), Box<dyn std::error::Error>>(())
190 /// ```
191 pub fn new_many<P: AsRef<str>>(
192 patterns: &[P],
193 ) -> Result<DFA<Vec<u8>>, Error> {
194 dense::Builder::new()
195 .build_many(patterns)
196 .and_then(|dense| dense.to_sparse())
197 }
198}
199
200#[cfg(feature = "alloc")]
201impl DFA<Vec<u8>> {
202 /// Create a new DFA that matches every input.
203 ///
204 /// # Example
205 ///
206 /// ```
207 /// use regex_automata::{
208 /// dfa::{Automaton, sparse},
209 /// HalfMatch,
210 /// };
211 ///
212 /// let dfa = sparse::DFA::always_match()?;
213 ///
214 /// let expected = HalfMatch::must(0, 0);
215 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
216 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
217 /// # Ok::<(), Box<dyn std::error::Error>>(())
218 /// ```
219 pub fn always_match() -> Result<DFA<Vec<u8>>, Error> {
220 dense::DFA::always_match()?.to_sparse()
221 }
222
223 /// Create a new sparse DFA that never matches any input.
224 ///
225 /// # Example
226 ///
227 /// ```
228 /// use regex_automata::dfa::{Automaton, sparse};
229 ///
230 /// let dfa = sparse::DFA::never_match()?;
231 /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
232 /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
233 /// # Ok::<(), Box<dyn std::error::Error>>(())
234 /// ```
235 pub fn never_match() -> Result<DFA<Vec<u8>>, Error> {
236 dense::DFA::never_match()?.to_sparse()
237 }
238
239 /// The implementation for constructing a sparse DFA from a dense DFA.
240 pub(crate) fn from_dense<T: AsRef<[u32]>>(
241 dfa: &dense::DFA<T>,
242 ) -> Result<DFA<Vec<u8>>, Error> {
243 // In order to build the transition table, we need to be able to write
244 // state identifiers for each of the "next" transitions in each state.
245 // Our state identifiers correspond to the byte offset in the
246 // transition table at which the state is encoded. Therefore, we do not
247 // actually know what the state identifiers are until we've allocated
248 // exactly as much space as we need for each state. Thus, construction
249 // of the transition table happens in two passes.
250 //
251 // In the first pass, we fill out the shell of each state, which
252 // includes the transition count, the input byte ranges and zero-filled
253 // space for the transitions and accelerators, if present. In this
254 // first pass, we also build up a map from the state identifier index
255 // of the dense DFA to the state identifier in this sparse DFA.
256 //
257 // In the second pass, we fill in the transitions based on the map
258 // built in the first pass.
259
260 // The capacity given here reflects a minimum. (Well, the true minimum
261 // is likely even bigger, but hopefully this saves a few reallocs.)
262 let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count());
263 // This maps state indices from the dense DFA to StateIDs in the sparse
264 // DFA. We build out this map on the first pass, and then use it in the
265 // second pass to back-fill our transitions.
266 let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()];
267 for state in dfa.states() {
268 let pos = sparse.len();
269
270 remap[dfa.to_index(state.id())] =
271 StateID::new(pos).map_err(|_| Error::too_many_states())?;
272 // zero-filled space for the transition count
273 sparse.push(0);
274 sparse.push(0);
275
276 let mut transition_count = 0;
277 for (unit1, unit2, _) in state.sparse_transitions() {
278 match (unit1.as_u8(), unit2.as_u8()) {
279 (Some(b1), Some(b2)) => {
280 transition_count += 1;
281 sparse.push(b1);
282 sparse.push(b2);
283 }
284 (None, None) => {}
285 (Some(_), None) | (None, Some(_)) => {
286 // can never occur because sparse_transitions never
287 // groups EOI with any other transition.
288 unreachable!()
289 }
290 }
291 }
292 // Add dummy EOI transition. This is never actually read while
293 // searching, but having space equivalent to the total number
294 // of transitions is convenient. Otherwise, we'd need to track
            // a different number of transitions for the byte ranges than for
296 // the 'next' states.
297 //
298 // N.B. The loop above is not guaranteed to yield the EOI
299 // transition, since it may point to a DEAD state. By putting
300 // it here, we always write the EOI transition, and thus
301 // guarantee that our transition count is >0. Why do we always
302 // need the EOI transition? Because in order to implement
303 // Automaton::next_eoi_state, this lets us just ask for the last
304 // transition. There are probably other/better ways to do this.
305 transition_count += 1;
306 sparse.push(0);
307 sparse.push(0);
308
309 // Check some assumptions about transition count.
310 assert_ne!(
311 transition_count, 0,
312 "transition count should be non-zero",
313 );
314 assert!(
315 transition_count <= 257,
316 "expected transition count {} to be <= 257",
317 transition_count,
318 );
319
320 // Fill in the transition count.
321 // Since transition count is always <= 257, we use the most
322 // significant bit to indicate whether this is a match state or
323 // not.
324 let ntrans = if dfa.is_match_state(state.id()) {
325 transition_count | (1 << 15)
326 } else {
327 transition_count
328 };
329 bytes::NE::write_u16(ntrans, &mut sparse[pos..]);
330
331 // zero-fill the actual transitions.
            // Unwraps are OK since transition_count <= 257 and the minimum
            // supported usize size is 16 bits.
334 let zeros = usize::try_from(transition_count)
335 .unwrap()
336 .checked_mul(StateID::SIZE)
337 .unwrap();
338 sparse.extend(iter::repeat(0).take(zeros));
339
340 // If this is a match state, write the pattern IDs matched by this
341 // state.
342 if dfa.is_match_state(state.id()) {
343 let plen = dfa.match_pattern_len(state.id());
344 // Write the actual pattern IDs with a u32 length prefix.
345 // First, zero-fill space.
346 let mut pos = sparse.len();
347 // Unwraps are OK since it's guaranteed that plen <=
348 // PatternID::LIMIT, which is in turn guaranteed to fit into a
349 // u32.
350 let zeros = size_of::<u32>()
351 .checked_mul(plen)
352 .unwrap()
353 .checked_add(size_of::<u32>())
354 .unwrap();
355 sparse.extend(iter::repeat(0).take(zeros));
356
357 // Now write the length prefix.
358 bytes::NE::write_u32(
359 // Will never fail since u32::MAX is invalid pattern ID.
360 // Thus, the number of pattern IDs is representable by a
361 // u32.
362 plen.try_into().expect("pattern ID count fits in u32"),
363 &mut sparse[pos..],
364 );
365 pos += size_of::<u32>();
366
367 // Now write the pattern IDs.
368 for &pid in dfa.pattern_id_slice(state.id()) {
369 pos += bytes::write_pattern_id::<bytes::NE>(
370 pid,
371 &mut sparse[pos..],
372 );
373 }
374 }
375
376 // And now add the accelerator, if one exists. An accelerator is
377 // at most 4 bytes and at least 1 byte. The first byte is the
378 // length, N. N bytes follow the length. The set of bytes that
379 // follow correspond (exhaustively) to the bytes that must be seen
380 // to leave this state.
381 let accel = dfa.accelerator(state.id());
382 sparse.push(accel.len().try_into().unwrap());
383 sparse.extend_from_slice(accel);
384 }
385
386 let mut new = DFA {
387 trans: Transitions {
388 sparse,
389 classes: dfa.byte_classes().clone(),
390 count: dfa.state_count(),
391 patterns: dfa.pattern_count(),
392 },
393 starts: StartTable::from_dense_dfa(dfa, &remap)?,
394 special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
395 };
396 // And here's our second pass. Iterate over all of the dense states
397 // again, and update the transitions in each of the states in the
398 // sparse DFA.
399 for old_state in dfa.states() {
400 let new_id = remap[dfa.to_index(old_state.id())];
401 let mut new_state = new.trans.state_mut(new_id);
402 let sparse = old_state.sparse_transitions();
403 for (i, (_, _, next)) in sparse.enumerate() {
404 let next = remap[dfa.to_index(next)];
405 new_state.set_next_at(i, next);
406 }
407 }
408 trace!(
409 "created sparse DFA, memory usage: {} (dense memory usage: {})",
410 new.memory_usage(),
411 dfa.memory_usage(),
412 );
413 Ok(new)
414 }
415}
416
417impl<T: AsRef<[u8]>> DFA<T> {
418 /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
419 /// DFA returned always uses `&[u8]` for its transitions.
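    ///
    /// # Example
    ///
    /// A minimal sketch: the borrowed DFA implements [`Automaton`] just like
    /// the owned one, so it can be searched without copying any transitions.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch};
    ///
    /// let owned: DFA<Vec<u8>> = DFA::new("foo[0-9]+")?;
    /// // Borrow the DFA; no transition bytes are copied.
    /// let borrowed: DFA<&[u8]> = owned.as_ref();
    ///
    /// let expected = HalfMatch::must(0, 8);
    /// assert_eq!(Some(expected), borrowed.find_leftmost_fwd(b"foo12345")?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```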
420 pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
421 DFA {
422 trans: self.trans.as_ref(),
423 starts: self.starts.as_ref(),
424 special: self.special,
425 }
426 }
427
428 /// Return an owned version of this sparse DFA. Specifically, the DFA
429 /// returned always uses `Vec<u8>` for its transitions.
430 ///
431 /// Effectively, this returns a sparse DFA whose transitions live on the
432 /// heap.
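    ///
    /// # Example
    ///
    /// A minimal sketch: deserialize a borrowed DFA from raw bytes and then
    /// copy it to the heap so that it no longer borrows the original buffer.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch};
    ///
    /// let bytes = DFA::new("foo[0-9]+")?.to_bytes_native_endian();
    /// let owned: DFA<Vec<u8>> = {
    ///     let borrowed: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
    ///     borrowed.to_owned()
    /// };
    /// // The owned DFA is still usable even after `bytes` is dropped.
    /// drop(bytes);
    /// let expected = HalfMatch::must(0, 8);
    /// assert_eq!(Some(expected), owned.find_leftmost_fwd(b"foo12345")?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```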
433 #[cfg(feature = "alloc")]
434 pub fn to_owned(&self) -> DFA<Vec<u8>> {
435 DFA {
436 trans: self.trans.to_owned(),
437 starts: self.starts.to_owned(),
438 special: self.special,
439 }
440 }
441
442 /// Returns the memory usage, in bytes, of this DFA.
443 ///
444 /// The memory usage is computed based on the number of bytes used to
445 /// represent this DFA.
446 ///
447 /// This does **not** include the stack size used up by this DFA. To
448 /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
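    ///
    /// # Example
    ///
    /// A rough sketch comparing a sparse DFA with the dense DFA it was built
    /// from. (The exact numbers depend on the pattern; sparse DFAs are merely
    /// *typically* smaller.)
    ///
    /// ```
    /// use regex_automata::dfa::dense;
    ///
    /// let dense_dfa = dense::DFA::new("foo[0-9]+")?;
    /// let sparse_dfa = dense_dfa.to_sparse()?;
    /// // Both values report only their heap usage, not their stack size.
    /// assert!(dense_dfa.memory_usage() > 0);
    /// assert!(sparse_dfa.memory_usage() > 0);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```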
449 pub fn memory_usage(&self) -> usize {
450 self.trans.memory_usage() + self.starts.memory_usage()
451 }
452
453 /// Returns true only if this DFA has starting states for each pattern.
454 ///
455 /// When a DFA has starting states for each pattern, then a search with the
456 /// DFA can be configured to only look for anchored matches of a specific
457 /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
458 /// can accept a non-None `pattern_id` if and only if this method returns
459 /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
460 ///
461 /// Note that if the DFA is empty, this always returns false.
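    ///
    /// # Example
    ///
    /// A minimal sketch: a sparse DFA built with the default configuration
    /// does not have per-pattern start states. (They can typically be enabled
    /// through the dense builder's configuration and are then carried over by
    /// `to_sparse`.)
    ///
    /// ```
    /// use regex_automata::dfa::sparse;
    ///
    /// let dfa = sparse::DFA::new("foo[0-9]+")?;
    /// assert!(!dfa.has_starts_for_each_pattern());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```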
462 pub fn has_starts_for_each_pattern(&self) -> bool {
463 self.starts.patterns > 0
464 }
465}
466
467/// Routines for converting a sparse DFA to other representations, such as raw
468/// bytes suitable for persistent storage.
469impl<T: AsRef<[u8]>> DFA<T> {
470 /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
471 /// format.
472 ///
473 /// The written bytes are guaranteed to be deserialized correctly and
474 /// without errors in a semver compatible release of this crate by a
475 /// `DFA`'s deserialization APIs (assuming all other criteria for the
476 /// deserialization APIs has been satisfied):
477 ///
478 /// * [`DFA::from_bytes`]
479 /// * [`DFA::from_bytes_unchecked`]
480 ///
481 /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
482 /// serialization methods, this does not add any initial padding to the
483 /// returned bytes. Padding isn't required for sparse DFAs since they have
484 /// no alignment requirements.
485 ///
486 /// # Example
487 ///
488 /// This example shows how to serialize and deserialize a DFA:
489 ///
490 /// ```
491 /// use regex_automata::{
492 /// dfa::{Automaton, sparse::DFA},
493 /// HalfMatch,
494 /// };
495 ///
496 /// // Compile our original DFA.
497 /// let original_dfa = DFA::new("foo[0-9]+")?;
498 ///
499 /// // N.B. We use native endianness here to make the example work, but
500 /// // using to_bytes_little_endian would work on a little endian target.
501 /// let buf = original_dfa.to_bytes_native_endian();
    /// // Sparse DFAs have no alignment requirements, so the serialized bytes
    /// // contain no initial padding and can be deserialized directly.
504 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
505 ///
506 /// let expected = HalfMatch::must(0, 8);
507 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
508 /// # Ok::<(), Box<dyn std::error::Error>>(())
509 /// ```
510 #[cfg(feature = "alloc")]
511 pub fn to_bytes_little_endian(&self) -> Vec<u8> {
512 self.to_bytes::<bytes::LE>()
513 }
514
515 /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
516 /// format.
517 ///
518 /// The written bytes are guaranteed to be deserialized correctly and
519 /// without errors in a semver compatible release of this crate by a
520 /// `DFA`'s deserialization APIs (assuming all other criteria for the
521 /// deserialization APIs has been satisfied):
522 ///
523 /// * [`DFA::from_bytes`]
524 /// * [`DFA::from_bytes_unchecked`]
525 ///
526 /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
527 /// serialization methods, this does not add any initial padding to the
528 /// returned bytes. Padding isn't required for sparse DFAs since they have
529 /// no alignment requirements.
530 ///
531 /// # Example
532 ///
533 /// This example shows how to serialize and deserialize a DFA:
534 ///
535 /// ```
536 /// use regex_automata::{
537 /// dfa::{Automaton, sparse::DFA},
538 /// HalfMatch,
539 /// };
540 ///
541 /// // Compile our original DFA.
542 /// let original_dfa = DFA::new("foo[0-9]+")?;
543 ///
544 /// // N.B. We use native endianness here to make the example work, but
545 /// // using to_bytes_big_endian would work on a big endian target.
546 /// let buf = original_dfa.to_bytes_native_endian();
    /// // Sparse DFAs have no alignment requirements, so the serialized bytes
    /// // contain no initial padding and can be deserialized directly.
549 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
550 ///
551 /// let expected = HalfMatch::must(0, 8);
552 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
553 /// # Ok::<(), Box<dyn std::error::Error>>(())
554 /// ```
555 #[cfg(feature = "alloc")]
556 pub fn to_bytes_big_endian(&self) -> Vec<u8> {
557 self.to_bytes::<bytes::BE>()
558 }
559
560 /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
561 /// format.
562 ///
563 /// The written bytes are guaranteed to be deserialized correctly and
564 /// without errors in a semver compatible release of this crate by a
565 /// `DFA`'s deserialization APIs (assuming all other criteria for the
566 /// deserialization APIs has been satisfied):
567 ///
568 /// * [`DFA::from_bytes`]
569 /// * [`DFA::from_bytes_unchecked`]
570 ///
571 /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
572 /// serialization methods, this does not add any initial padding to the
573 /// returned bytes. Padding isn't required for sparse DFAs since they have
574 /// no alignment requirements.
575 ///
576 /// Generally speaking, native endian format should only be used when
577 /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling the DFA. For example,
579 /// if serialization and deserialization happen in the same process or on
580 /// the same machine. Otherwise, when serializing a DFA for use in a
581 /// portable environment, you'll almost certainly want to serialize _both_
582 /// a little endian and a big endian version and then load the correct one
583 /// based on the target's configuration.
584 ///
585 /// # Example
586 ///
587 /// This example shows how to serialize and deserialize a DFA:
588 ///
589 /// ```
590 /// use regex_automata::{
591 /// dfa::{Automaton, sparse::DFA},
592 /// HalfMatch,
593 /// };
594 ///
595 /// // Compile our original DFA.
596 /// let original_dfa = DFA::new("foo[0-9]+")?;
597 ///
598 /// let buf = original_dfa.to_bytes_native_endian();
    /// // Sparse DFAs have no alignment requirements, so the serialized bytes
    /// // contain no initial padding and can be deserialized directly.
601 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
602 ///
603 /// let expected = HalfMatch::must(0, 8);
604 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
605 /// # Ok::<(), Box<dyn std::error::Error>>(())
606 /// ```
607 #[cfg(feature = "alloc")]
608 pub fn to_bytes_native_endian(&self) -> Vec<u8> {
609 self.to_bytes::<bytes::NE>()
610 }
611
612 /// The implementation of the public `to_bytes` serialization methods,
613 /// which is generic over endianness.
614 #[cfg(feature = "alloc")]
615 fn to_bytes<E: Endian>(&self) -> Vec<u8> {
616 let mut buf = vec![0; self.write_to_len()];
617 // This should always succeed since the only possible serialization
618 // error is providing a buffer that's too small, but we've ensured that
619 // `buf` is big enough here.
620 self.write_to::<E>(&mut buf).unwrap();
621 buf
622 }
623
624 /// Serialize this DFA as raw bytes to the given slice, in little endian
625 /// format. Upon success, the total number of bytes written to `dst` is
626 /// returned.
627 ///
628 /// The written bytes are guaranteed to be deserialized correctly and
629 /// without errors in a semver compatible release of this crate by a
630 /// `DFA`'s deserialization APIs (assuming all other criteria for the
631 /// deserialization APIs has been satisfied):
632 ///
633 /// * [`DFA::from_bytes`]
634 /// * [`DFA::from_bytes_unchecked`]
635 ///
636 /// # Errors
637 ///
638 /// This returns an error if the given destination slice is not big enough
639 /// to contain the full serialized DFA. If an error occurs, then nothing
640 /// is written to `dst`.
641 ///
642 /// # Example
643 ///
644 /// This example shows how to serialize and deserialize a DFA without
645 /// dynamic memory allocation.
646 ///
647 /// ```
648 /// use regex_automata::{
649 /// dfa::{Automaton, sparse::DFA},
650 /// HalfMatch,
651 /// };
652 ///
653 /// // Compile our original DFA.
654 /// let original_dfa = DFA::new("foo[0-9]+")?;
655 ///
656 /// // Create a 4KB buffer on the stack to store our serialized DFA.
657 /// let mut buf = [0u8; 4 * (1<<10)];
658 /// // N.B. We use native endianness here to make the example work, but
659 /// // using write_to_little_endian would work on a little endian target.
660 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
661 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
662 ///
663 /// let expected = HalfMatch::must(0, 8);
664 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
665 /// # Ok::<(), Box<dyn std::error::Error>>(())
666 /// ```
667 pub fn write_to_little_endian(
668 &self,
669 dst: &mut [u8],
670 ) -> Result<usize, SerializeError> {
671 self.write_to::<bytes::LE>(dst)
672 }
673
674 /// Serialize this DFA as raw bytes to the given slice, in big endian
675 /// format. Upon success, the total number of bytes written to `dst` is
676 /// returned.
677 ///
678 /// The written bytes are guaranteed to be deserialized correctly and
679 /// without errors in a semver compatible release of this crate by a
680 /// `DFA`'s deserialization APIs (assuming all other criteria for the
681 /// deserialization APIs has been satisfied):
682 ///
683 /// * [`DFA::from_bytes`]
684 /// * [`DFA::from_bytes_unchecked`]
685 ///
686 /// # Errors
687 ///
688 /// This returns an error if the given destination slice is not big enough
689 /// to contain the full serialized DFA. If an error occurs, then nothing
690 /// is written to `dst`.
691 ///
692 /// # Example
693 ///
694 /// This example shows how to serialize and deserialize a DFA without
695 /// dynamic memory allocation.
696 ///
697 /// ```
698 /// use regex_automata::{
699 /// dfa::{Automaton, sparse::DFA},
700 /// HalfMatch,
701 /// };
702 ///
703 /// // Compile our original DFA.
704 /// let original_dfa = DFA::new("foo[0-9]+")?;
705 ///
706 /// // Create a 4KB buffer on the stack to store our serialized DFA.
707 /// let mut buf = [0u8; 4 * (1<<10)];
708 /// // N.B. We use native endianness here to make the example work, but
709 /// // using write_to_big_endian would work on a big endian target.
710 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
711 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
712 ///
713 /// let expected = HalfMatch::must(0, 8);
714 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
715 /// # Ok::<(), Box<dyn std::error::Error>>(())
716 /// ```
717 pub fn write_to_big_endian(
718 &self,
719 dst: &mut [u8],
720 ) -> Result<usize, SerializeError> {
721 self.write_to::<bytes::BE>(dst)
722 }
723
724 /// Serialize this DFA as raw bytes to the given slice, in native endian
725 /// format. Upon success, the total number of bytes written to `dst` is
726 /// returned.
727 ///
728 /// The written bytes are guaranteed to be deserialized correctly and
729 /// without errors in a semver compatible release of this crate by a
730 /// `DFA`'s deserialization APIs (assuming all other criteria for the
731 /// deserialization APIs has been satisfied):
732 ///
733 /// * [`DFA::from_bytes`]
734 /// * [`DFA::from_bytes_unchecked`]
735 ///
736 /// Generally speaking, native endian format should only be used when
737 /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling the DFA. For example,
739 /// if serialization and deserialization happen in the same process or on
740 /// the same machine. Otherwise, when serializing a DFA for use in a
741 /// portable environment, you'll almost certainly want to serialize _both_
742 /// a little endian and a big endian version and then load the correct one
743 /// based on the target's configuration.
744 ///
745 /// # Errors
746 ///
747 /// This returns an error if the given destination slice is not big enough
748 /// to contain the full serialized DFA. If an error occurs, then nothing
749 /// is written to `dst`.
750 ///
751 /// # Example
752 ///
753 /// This example shows how to serialize and deserialize a DFA without
754 /// dynamic memory allocation.
755 ///
756 /// ```
757 /// use regex_automata::{
758 /// dfa::{Automaton, sparse::DFA},
759 /// HalfMatch,
760 /// };
761 ///
762 /// // Compile our original DFA.
763 /// let original_dfa = DFA::new("foo[0-9]+")?;
764 ///
765 /// // Create a 4KB buffer on the stack to store our serialized DFA.
766 /// let mut buf = [0u8; 4 * (1<<10)];
767 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
768 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
769 ///
770 /// let expected = HalfMatch::must(0, 8);
771 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
772 /// # Ok::<(), Box<dyn std::error::Error>>(())
773 /// ```
774 pub fn write_to_native_endian(
775 &self,
776 dst: &mut [u8],
777 ) -> Result<usize, SerializeError> {
778 self.write_to::<bytes::NE>(dst)
779 }
780
781 /// The implementation of the public `write_to` serialization methods,
782 /// which is generic over endianness.
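    ///
    /// As a sketch of the serialized layout (in write order): the label, the
    /// endianness check, the format version, an unused `u32` reserved for
    /// future use, the transition table, the start table and the special
    /// state information.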
783 fn write_to<E: Endian>(
784 &self,
785 dst: &mut [u8],
786 ) -> Result<usize, SerializeError> {
787 let mut nw = 0;
788 nw += bytes::write_label(LABEL, &mut dst[nw..])?;
789 nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
790 nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
791 nw += {
792 // Currently unused, intended for future flexibility
793 E::write_u32(0, &mut dst[nw..]);
794 size_of::<u32>()
795 };
796 nw += self.trans.write_to::<E>(&mut dst[nw..])?;
797 nw += self.starts.write_to::<E>(&mut dst[nw..])?;
798 nw += self.special.write_to::<E>(&mut dst[nw..])?;
799 Ok(nw)
800 }
801
802 /// Return the total number of bytes required to serialize this DFA.
803 ///
804 /// This is useful for determining the size of the buffer required to pass
805 /// to one of the serialization routines:
806 ///
807 /// * [`DFA::write_to_little_endian`]
808 /// * [`DFA::write_to_big_endian`]
809 /// * [`DFA::write_to_native_endian`]
810 ///
811 /// Passing a buffer smaller than the size returned by this method will
812 /// result in a serialization error.
813 ///
814 /// # Example
815 ///
816 /// This example shows how to dynamically allocate enough room to serialize
817 /// a sparse DFA.
818 ///
819 /// ```
820 /// use regex_automata::{
821 /// dfa::{Automaton, sparse::DFA},
822 /// HalfMatch,
823 /// };
824 ///
825 /// // Compile our original DFA.
826 /// let original_dfa = DFA::new("foo[0-9]+")?;
827 ///
828 /// let mut buf = vec![0; original_dfa.write_to_len()];
829 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
830 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
831 ///
832 /// let expected = HalfMatch::must(0, 8);
833 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
834 /// # Ok::<(), Box<dyn std::error::Error>>(())
835 /// ```
836 pub fn write_to_len(&self) -> usize {
837 bytes::write_label_len(LABEL)
838 + bytes::write_endianness_check_len()
839 + bytes::write_version_len()
840 + size_of::<u32>() // unused, intended for future flexibility
841 + self.trans.write_to_len()
842 + self.starts.write_to_len()
843 + self.special.write_to_len()
844 }
845}
846
847impl<'a> DFA<&'a [u8]> {
848 /// Safely deserialize a sparse DFA with a specific state identifier
849 /// representation. Upon success, this returns both the deserialized DFA
850 /// and the number of bytes read from the given slice. Namely, the contents
851 /// of the slice beyond the DFA are not read.
852 ///
853 /// Deserializing a DFA using this routine will never allocate heap memory.
854 /// For safety purposes, the DFA's transitions will be verified such that
855 /// every transition points to a valid state. If this verification is too
856 /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
857 /// will always execute in constant time.
858 ///
859 /// The bytes given must be generated by one of the serialization APIs
860 /// of a `DFA` using a semver compatible release of this crate. Those
861 /// include:
862 ///
863 /// * [`DFA::to_bytes_little_endian`]
864 /// * [`DFA::to_bytes_big_endian`]
865 /// * [`DFA::to_bytes_native_endian`]
866 /// * [`DFA::write_to_little_endian`]
867 /// * [`DFA::write_to_big_endian`]
868 /// * [`DFA::write_to_native_endian`]
869 ///
870 /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
871 /// `write_to` methods do not allocate and write to an existing slice
872 /// (which may be on the stack). Since deserialization always uses the
873 /// native endianness of the target platform, the serialization API you use
874 /// should match the endianness of the target platform. (It's often a good
875 /// idea to generate serialized DFAs for both forms of endianness and then
    /// load the correct one based on the target's endianness.)
877 ///
878 /// # Errors
879 ///
880 /// Generally speaking, it's easier to state the conditions in which an
881 /// error is _not_ returned. All of the following must be true:
882 ///
883 /// * The bytes given must be produced by one of the serialization APIs
884 /// on this DFA, as mentioned above.
885 /// * The endianness of the target platform matches the endianness used to
    ///   serialize the provided DFA.
887 ///
888 /// If any of the above are not true, then an error will be returned.
889 ///
890 /// Note that unlike deserializing a
891 /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
892 /// no alignment requirements. That is, an alignment of `1` is valid.
893 ///
894 /// # Panics
895 ///
896 /// This routine will never panic for any input.
897 ///
898 /// # Example
899 ///
900 /// This example shows how to serialize a DFA to raw bytes, deserialize it
901 /// and then use it for searching.
902 ///
903 /// ```
904 /// use regex_automata::{
905 /// dfa::{Automaton, sparse::DFA},
906 /// HalfMatch,
907 /// };
908 ///
909 /// let initial = DFA::new("foo[0-9]+")?;
910 /// let bytes = initial.to_bytes_native_endian();
911 /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
912 ///
913 /// let expected = HalfMatch::must(0, 8);
914 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
915 /// # Ok::<(), Box<dyn std::error::Error>>(())
916 /// ```
917 ///
918 /// # Example: loading a DFA from static memory
919 ///
920 /// One use case this library supports is the ability to serialize a
921 /// DFA to disk and then use `include_bytes!` to store it in a compiled
922 /// Rust program. Those bytes can then be cheaply deserialized into a
923 /// `DFA` structure at runtime and used for searching without having to
924 /// re-compile the DFA (which can be quite costly).
925 ///
926 /// We can show this in two parts. The first part is serializing the DFA to
927 /// a file:
928 ///
929 /// ```no_run
930 /// use regex_automata::dfa::{Automaton, sparse::DFA};
931 ///
932 /// let dfa = DFA::new("foo[0-9]+")?;
933 ///
934 /// // Write a big endian serialized version of this DFA to a file.
935 /// let bytes = dfa.to_bytes_big_endian();
936 /// std::fs::write("foo.bigendian.dfa", &bytes)?;
937 ///
938 /// // Do it again, but this time for little endian.
939 /// let bytes = dfa.to_bytes_little_endian();
940 /// std::fs::write("foo.littleendian.dfa", &bytes)?;
941 /// # Ok::<(), Box<dyn std::error::Error>>(())
942 /// ```
943 ///
944 /// And now the second part is embedding the DFA into the compiled program
945 /// and deserializing it at runtime on first use. We use conditional
946 /// compilation to choose the correct endianness. As mentioned above, we
947 /// do not need to employ any special tricks to ensure a proper alignment,
948 /// since a sparse DFA has no alignment requirements.
949 ///
950 /// ```no_run
951 /// use regex_automata::{
952 /// dfa::{Automaton, sparse},
953 /// HalfMatch,
954 /// };
955 ///
956 /// type DFA = sparse::DFA<&'static [u8]>;
957 ///
958 /// fn get_foo() -> &'static DFA {
959 /// use std::cell::Cell;
960 /// use std::mem::MaybeUninit;
961 /// use std::sync::Once;
962 ///
963 /// # const _: &str = stringify! {
964 /// #[cfg(target_endian = "big")]
965 /// static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
966 /// #[cfg(target_endian = "little")]
967 /// static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
968 /// # };
969 /// # static BYTES: &[u8] = b"";
970 ///
971 /// struct Lazy(Cell<MaybeUninit<DFA>>);
972 /// // SAFETY: This is safe because DFA impls Sync.
973 /// unsafe impl Sync for Lazy {}
974 ///
975 /// static INIT: Once = Once::new();
976 /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
977 ///
978 /// INIT.call_once(|| {
979 /// let (dfa, _) = DFA::from_bytes(BYTES)
980 /// .expect("serialized DFA should be valid");
981 /// // SAFETY: This is guaranteed to only execute once, and all
982 /// // we do with the pointer is write the DFA to it.
983 /// unsafe {
984 /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
985 /// }
986 /// });
    ///     // SAFETY: DFA is guaranteed to be initialized via INIT and is
988 /// // stored in static memory.
989 /// unsafe {
990 /// let dfa = (*DFA.0.as_ptr()).as_ptr();
991 /// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
992 /// }
993 /// }
994 ///
995 /// let dfa = get_foo();
996 /// let expected = HalfMatch::must(0, 8);
997 /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
998 /// ```
999 ///
1000 /// Alternatively, consider using
1001 /// [`lazy_static`](https://crates.io/crates/lazy_static)
1002 /// or
1003 /// [`once_cell`](https://crates.io/crates/once_cell),
1004 /// which will guarantee safety for you.
1005 pub fn from_bytes(
1006 slice: &'a [u8],
1007 ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1008 // SAFETY: This is safe because we validate both the sparse transitions
1009 // (by trying to decode every state) and start state ID list below. If
1010 // either validation fails, then we return an error.
1011 let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
1012 dfa.trans.validate()?;
1013 dfa.starts.validate(&dfa.trans)?;
1014 // N.B. dfa.special doesn't have a way to do unchecked deserialization,
1015 // so it has already been validated.
1016 Ok((dfa, nread))
1017 }
1018
1019 /// Deserialize a DFA with a specific state identifier representation in
1020 /// constant time by omitting the verification of the validity of the
1021 /// sparse transitions.
1022 ///
1023 /// This is just like [`DFA::from_bytes`], except it can potentially return
1024 /// a DFA that exhibits undefined behavior if its transitions contains
1025 /// invalid state identifiers.
1026 ///
1027 /// This routine is useful if you need to deserialize a DFA cheaply and
1028 /// cannot afford the transition validation performed by `from_bytes`.
1029 ///
1030 /// # Safety
1031 ///
1032 /// This routine is unsafe because it permits callers to provide
1033 /// arbitrary transitions with possibly incorrect state identifiers. While
1034 /// the various serialization routines will never return an incorrect
1035 /// DFA, there is no guarantee that the bytes provided here
1036 /// are correct. While `from_bytes_unchecked` will still do several forms
1037 /// of basic validation, this routine does not check that the transitions
1038 /// themselves are correct. Given an incorrect transition table, it is
1039 /// possible for the search routines to access out-of-bounds memory because
1040 /// of explicit bounds check elision.
1041 ///
1042 /// # Example
1043 ///
1044 /// ```
1045 /// use regex_automata::{
1046 /// dfa::{Automaton, sparse::DFA},
1047 /// HalfMatch,
1048 /// };
1049 ///
1050 /// let initial = DFA::new("foo[0-9]+")?;
1051 /// let bytes = initial.to_bytes_native_endian();
1052 /// // SAFETY: This is guaranteed to be safe since the bytes given come
1053 /// // directly from a compatible serialization routine.
1054 /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
1055 ///
1056 /// let expected = HalfMatch::must(0, 8);
1057 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1058 /// # Ok::<(), Box<dyn std::error::Error>>(())
1059 /// ```
1060 pub unsafe fn from_bytes_unchecked(
1061 slice: &'a [u8],
1062 ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1063 let mut nr = 0;
1064
1065 nr += bytes::read_label(&slice[nr..], LABEL)?;
1066 nr += bytes::read_endianness_check(&slice[nr..])?;
1067 nr += bytes::read_version(&slice[nr..], VERSION)?;
1068
1069 let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
1070 nr += size_of::<u32>();
1071
1072 let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
1073 nr += nread;
1074
1075 let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
1076 nr += nread;
1077
1078 let (special, nread) = Special::from_bytes(&slice[nr..])?;
1079 nr += nread;
1080 if special.max.as_usize() >= trans.sparse().len() {
1081 return Err(DeserializeError::generic(
1082 "max should not be greater than or equal to sparse bytes",
1083 ));
1084 }
1085
1086 Ok((DFA { trans, starts, special }, nr))
1087 }
1088}
1089
1090impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
1091 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1092 writeln!(f, "sparse::DFA(")?;
        for state in self.trans.states() {
1094 fmt_state_indicator(f, self, state.id())?;
1095 writeln!(f, "{:06?}: {:?}", state.id(), state)?;
1096 }
1097 writeln!(f, "")?;
        for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() {
1099 if i % self.starts.stride == 0 {
1100 match pid {
1101 None => writeln!(f, "START-GROUP(ALL)")?,
                    Some(pid) => {
                        writeln!(f, "START-GROUP(pattern: {:?})", pid)?
1104 }
1105 }
1106 }
1107 writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
1108 }
1109 writeln!(f, "state count: {:?}", self.trans.count)?;
1110 writeln!(f, ")")?;
1111 Ok(())
1112 }
1113}
1114
1115unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
1116 #[inline]
1117 fn is_special_state(&self, id: StateID) -> bool {
1118 self.special.is_special_state(id)
1119 }
1120
1121 #[inline]
1122 fn is_dead_state(&self, id: StateID) -> bool {
1123 self.special.is_dead_state(id)
1124 }
1125
1126 #[inline]
1127 fn is_quit_state(&self, id: StateID) -> bool {
1128 self.special.is_quit_state(id)
1129 }
1130
1131 #[inline]
1132 fn is_match_state(&self, id: StateID) -> bool {
1133 self.special.is_match_state(id)
1134 }
1135
1136 #[inline]
1137 fn is_start_state(&self, id: StateID) -> bool {
1138 self.special.is_start_state(id)
1139 }
1140
1141 #[inline]
1142 fn is_accel_state(&self, id: StateID) -> bool {
1143 self.special.is_accel_state(id)
1144 }
1145
1146 // This is marked as inline to help dramatically boost sparse searching,
1147 // which decodes each state it enters to follow the next transition.
1148 #[inline(always)]
1149 fn next_state(&self, current: StateID, input: u8) -> StateID {
1150 let input = self.trans.classes.get(input);
1151 self.trans.state(current).next(input)
1152 }
1153
1154 #[inline]
1155 unsafe fn next_state_unchecked(
1156 &self,
1157 current: StateID,
1158 input: u8,
1159 ) -> StateID {
1160 self.next_state(current, input)
1161 }
1162
1163 #[inline]
1164 fn next_eoi_state(&self, current: StateID) -> StateID {
1165 self.trans.state(current).next_eoi()
1166 }
1167
1168 #[inline]
1169 fn pattern_count(&self) -> usize {
1170 self.trans.patterns
1171 }
1172
1173 #[inline]
1174 fn match_count(&self, id: StateID) -> usize {
1175 self.trans.state(id).pattern_count()
1176 }
1177
1178 #[inline]
1179 fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
1180 // This is an optimization for the very common case of a DFA with a
1181 // single pattern. This conditional avoids a somewhat more costly path
1182 // that finds the pattern ID from the state machine, which requires
1183 // a bit of slicing/pointer-chasing. This optimization tends to only
1184 // matter when matches are frequent.
1185 if self.trans.patterns == 1 {
1186 return PatternID::ZERO;
1187 }
1188 self.trans.state(id).pattern_id(match_index)
1189 }
1190
1191 #[inline]
1192 fn start_state_forward(
1193 &self,
1194 pattern_id: Option<PatternID>,
1195 bytes: &[u8],
1196 start: usize,
1197 end: usize,
1198 ) -> StateID {
1199 let index = Start::from_position_fwd(bytes, start, end);
1200 self.starts.start(index, pattern_id)
1201 }
1202
1203 #[inline]
1204 fn start_state_reverse(
1205 &self,
1206 pattern_id: Option<PatternID>,
1207 bytes: &[u8],
1208 start: usize,
1209 end: usize,
1210 ) -> StateID {
1211 let index = Start::from_position_rev(bytes, start, end);
1212 self.starts.start(index, pattern_id)
1213 }
1214
1215 #[inline]
1216 fn accelerator(&self, id: StateID) -> &[u8] {
1217 self.trans.state(id).accelerator()
1218 }
1219}
1220
1221/// The transition table portion of a sparse DFA.
1222///
1223/// The transition table is the core part of the DFA in that it describes how
1224/// to move from one state to another based on the input sequence observed.
1225///
1226/// Unlike a typical dense table based DFA, states in a sparse transition
1227/// table have variable size. That is, states with more transitions use more
1228/// space than states with fewer transitions. This means that finding the next
1229/// transition takes more work than with a dense DFA, but also typically uses
1230/// much less space.
1231#[derive(Clone)]
1232struct Transitions<T> {
1233 /// The raw encoding of each state in this DFA.
1234 ///
1235 /// Each state has the following information:
1236 ///
1237 /// * A set of transitions to subsequent states. Transitions to the dead
1238 /// state are omitted.
1239 /// * If the state can be accelerated, then any additional accelerator
1240 /// information.
1241 /// * If the state is a match state, then the state contains all pattern
1242 /// IDs that match when in that state.
1243 ///
1244 /// To decode a state, use Transitions::state.
1245 ///
1246 /// In practice, T is either Vec<u8> or &[u8].
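    ///
    /// As a rough sketch (mirroring the encoding in `DFA::from_dense` and the
    /// decoding in `Transitions::state`), each state is laid out contiguously
    /// as:
    ///
    /// ```text
    /// u16                            transition count (high bit set => match state)
    /// [u8; ntrans * 2]               inclusive input byte ranges, as (start, end) pairs
    /// [u8; ntrans * StateID::SIZE]   next state IDs, one per range
    /// u32                            number of pattern IDs (match states only)
    /// [u8; npats * 4]                pattern IDs (match states only)
    /// u8                             accelerator length (0 to 3)
    /// [u8; accel length]             accelerator bytes
    /// ```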
1247 sparse: T,
1248 /// A set of equivalence classes, where a single equivalence class
1249 /// represents a set of bytes that never discriminate between a match
1250 /// and a non-match in the DFA. Each equivalence class corresponds to a
1251 /// single character in this DFA's alphabet, where the maximum number of
1252 /// characters is 257 (each possible value of a byte plus the special
1253 /// EOI transition). Consequently, the number of equivalence classes
1254 /// corresponds to the number of transitions for each DFA state. Note
1255 /// though that the *space* used by each DFA state in the transition table
    /// may be larger, since each state is variable sized: in addition to its
    /// transitions, a state may also encode pattern IDs and accelerator bytes.
1258 ///
1259 /// The only time the number of equivalence classes is fewer than 257 is
    /// if the DFA's kind uses byte classes (which is the default). Equivalence
1261 /// classes should generally only be disabled when debugging, so that
1262 /// the transitions themselves aren't obscured. Disabling them has no
1263 /// other benefit, since the equivalence class map is always used while
1264 /// searching. In the vast majority of cases, the number of equivalence
1265 /// classes is substantially smaller than 257, particularly when large
1266 /// Unicode classes aren't used.
1267 ///
1268 /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
1269 /// in the current implementation, since equivalence classes generally tend
1270 /// to correspond to continuous ranges of bytes that map to the same
1271 /// transition. So in a sparse DFA, equivalence classes don't really lead
1272 /// to a space savings. In the future, it would be good to try and remove
    /// them from sparse DFAs entirely, but it requires a bit of work since sparse
1274 /// DFAs are built from dense DFAs, which are in turn built on top of
1275 /// equivalence classes.
1276 classes: ByteClasses,
1277 /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even for the empty DFA. In particular,
1279 /// the dead state always has ID 0 and is correspondingly always the first
1280 /// state. The dead state is never a match state.
1281 count: usize,
1282 /// The total number of unique patterns represented by these match states.
1283 patterns: usize,
1284}
1285
1286impl<'a> Transitions<&'a [u8]> {
1287 unsafe fn from_bytes_unchecked(
1288 mut slice: &'a [u8],
1289 ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
1290 let slice_start = slice.as_ptr() as usize;
1291
1292 let (state_count, nr) =
1293 bytes::try_read_u32_as_usize(&slice, "state count")?;
1294 slice = &slice[nr..];
1295
1296 let (pattern_count, nr) =
1297 bytes::try_read_u32_as_usize(&slice, "pattern count")?;
1298 slice = &slice[nr..];
1299
1300 let (classes, nr) = ByteClasses::from_bytes(&slice)?;
1301 slice = &slice[nr..];
1302
1303 let (len, nr) =
1304 bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
1305 slice = &slice[nr..];
1306
1307 bytes::check_slice_len(slice, len, "sparse states byte length")?;
1308 let sparse = &slice[..len];
1309 slice = &slice[len..];
1310
1311 let trans = Transitions {
1312 sparse,
1313 classes,
1314 count: state_count,
1315 patterns: pattern_count,
1316 };
1317 Ok((trans, slice.as_ptr() as usize - slice_start))
1318 }
1319}
1320
1321impl<T: AsRef<[u8]>> Transitions<T> {
1322 /// Writes a serialized form of this transition table to the buffer given.
1323 /// If the buffer is too small, then an error is returned. To determine
1324 /// how big the buffer must be, use `write_to_len`.
1325 fn write_to<E: Endian>(
1326 &self,
1327 mut dst: &mut [u8],
1328 ) -> Result<usize, SerializeError> {
1329 let nwrite = self.write_to_len();
1330 if dst.len() < nwrite {
1331 return Err(SerializeError::buffer_too_small(
1332 "sparse transition table",
1333 ));
1334 }
1335 dst = &mut dst[..nwrite];
1336
1337 // write state count
1338 E::write_u32(u32::try_from(self.count).unwrap(), dst);
1339 dst = &mut dst[size_of::<u32>()..];
1340
1341 // write pattern count
1342 E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
1343 dst = &mut dst[size_of::<u32>()..];
1344
1345 // write byte class map
1346 let n = self.classes.write_to(dst)?;
1347 dst = &mut dst[n..];
1348
1349 // write number of bytes in sparse transitions
1350 E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
1351 dst = &mut dst[size_of::<u32>()..];
1352
1353 // write actual transitions
1354 dst.copy_from_slice(self.sparse());
1355 Ok(nwrite)
1356 }
1357
1358 /// Returns the number of bytes the serialized form of this transition
1359 /// table will use.
1360 fn write_to_len(&self) -> usize {
1361 size_of::<u32>() // state count
1362 + size_of::<u32>() // pattern count
1363 + self.classes.write_to_len()
1364 + size_of::<u32>() // sparse transitions length
1365 + self.sparse().len()
1366 }
1367
1368 /// Validates that every state ID in this transition table is valid.
1369 ///
1370 /// That is, every state ID can be used to correctly index a state in this
1371 /// table.
1372 fn validate(&self) -> Result<(), DeserializeError> {
1373 // In order to validate everything, we not only need to make sure we
1374 // can decode every state, but that every transition in every state
1375 // points to a valid state. There are many duplicative transitions, so
1376 // we record state IDs that we've verified so that we don't redo the
1377 // decoding work.
1378 //
1379 // Except, when in no_std mode, we don't have dynamic memory allocation
1380 // available to us, so we skip this optimization. It's not clear
1381 // whether doing something more clever is worth it just yet. If you're
1382 // profiling this code and need it to run faster, please file an issue.
1383 //
1384 // ---AG
1385 struct Seen {
1386 #[cfg(feature = "alloc")]
1387 set: BTreeSet<StateID>,
1388 #[cfg(not(feature = "alloc"))]
1389 set: core::marker::PhantomData<StateID>,
1390 }
1391
1392 #[cfg(feature = "alloc")]
1393 impl Seen {
1394 fn new() -> Seen {
1395 Seen { set: BTreeSet::new() }
1396 }
1397 fn insert(&mut self, id: StateID) {
1398 self.set.insert(id);
1399 }
1400 fn contains(&self, id: &StateID) -> bool {
1401 self.set.contains(id)
1402 }
1403 }
1404
1405 #[cfg(not(feature = "alloc"))]
1406 impl Seen {
1407 fn new() -> Seen {
1408 Seen { set: core::marker::PhantomData }
1409 }
1410 fn insert(&mut self, _id: StateID) {}
1411 fn contains(&self, _id: &StateID) -> bool {
1412 false
1413 }
1414 }
1415
1416 let mut verified: Seen = Seen::new();
1417 // We need to make sure that we decode the correct number of states.
1418 // Otherwise, an empty set of transitions would validate even if the
        // recorded state count is non-zero.
1420 let mut count = 0;
1421 // We can't use the self.states() iterator because it assumes the state
1422 // encodings are valid. It could panic if they aren't.
1423 let mut id = DEAD;
1424 while id.as_usize() < self.sparse().len() {
1425 let state = self.try_state(id)?;
1426 verified.insert(id);
1427 // The next ID should be the offset immediately following `state`.
1428 id = StateID::new(bytes::add(
1429 id.as_usize(),
1430 state.bytes_len(),
1431 "next state ID offset",
1432 )?)
1433 .map_err(|err| {
1434 DeserializeError::state_id_error(err, "next state ID offset")
1435 })?;
1436 count += 1;
1437
1438 // Now check that all transitions in this state are correct.
1439 for i in 0..state.ntrans {
1440 let to = state.next_at(i);
1441 if verified.contains(&to) {
1442 continue;
1443 }
1444 let _ = self.try_state(to)?;
                verified.insert(to);
1446 }
1447 }
1448 if count != self.count {
1449 return Err(DeserializeError::generic(
1450 "mismatching sparse state count",
1451 ));
1452 }
1453 Ok(())
1454 }
1455
1456 /// Converts these transitions to a borrowed value.
1457 fn as_ref(&self) -> Transitions<&'_ [u8]> {
1458 Transitions {
1459 sparse: self.sparse(),
1460 classes: self.classes.clone(),
1461 count: self.count,
1462 patterns: self.patterns,
1463 }
1464 }
1465
1466 /// Converts these transitions to an owned value.
1467 #[cfg(feature = "alloc")]
1468 fn to_owned(&self) -> Transitions<Vec<u8>> {
1469 Transitions {
1470 sparse: self.sparse().to_vec(),
1471 classes: self.classes.clone(),
1472 count: self.count,
1473 patterns: self.patterns,
1474 }
1475 }
1476
1477 /// Return a convenient representation of the given state.
1478 ///
1479 /// This panics if the state is invalid.
1480 ///
1481 /// This is marked as inline to help dramatically boost sparse searching,
1482 /// which decodes each state it enters to follow the next transition. Other
1483 /// functions involved are also inlined, which should hopefully eliminate
1484 /// a lot of the extraneous decoding that is never needed just to follow
1485 /// the next transition.
1486 #[inline(always)]
1487 fn state(&self, id: StateID) -> State<'_> {
1488 let mut state = &self.sparse()[id.as_usize()..];
1489 let mut ntrans = bytes::read_u16(&state) as usize;
1490 let is_match = (1 << 15) & ntrans != 0;
1491 ntrans &= !(1 << 15);
1492 state = &state[2..];
1493
1494 let (input_ranges, state) = state.split_at(ntrans * 2);
1495 let (next, state) = state.split_at(ntrans * StateID::SIZE);
1496 let (pattern_ids, state) = if is_match {
1497 let npats = bytes::read_u32(&state) as usize;
1498 state[4..].split_at(npats * 4)
1499 } else {
1500 (&[][..], state)
1501 };
1502
1503 let accel_len = state[0] as usize;
1504 let accel = &state[1..accel_len + 1];
1505 State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
1506 }
1507
1508 /// Like `state`, but will return an error if the state encoding is
1509 /// invalid. This is useful for verifying states after deserialization,
1510 /// which is required for a safe deserialization API.
1511 ///
1512 /// Note that this only verifies that this state is decodable and that
1513 /// all of its data is consistent. It does not verify that its state ID
1514 /// transitions point to valid states themselves, nor does it verify that
1515 /// every pattern ID is valid.
1516 fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
1517 if id.as_usize() > self.sparse().len() {
1518 return Err(DeserializeError::generic("invalid sparse state ID"));
1519 }
1520 let mut state = &self.sparse()[id.as_usize()..];
1521 // Encoding format starts with a u16 that stores the total number of
1522 // transitions in this state.
1523 let (mut ntrans, _) =
1524 bytes::try_read_u16_as_usize(state, "state transition count")?;
1525 let is_match = ((1 << 15) & ntrans) != 0;
1526 ntrans &= !(1 << 15);
1527 state = &state[2..];
1528 if ntrans > 257 || ntrans == 0 {
1529 return Err(DeserializeError::generic("invalid transition count"));
1530 }
1531
1532 // Each transition has two pieces: an inclusive range of bytes on which
1533 // it is defined, and the state ID that those bytes transition to. The
1534 // pairs come first, followed by a corresponding sequence of state IDs.
1535 let input_ranges_len = ntrans.checked_mul(2).unwrap();
1536 bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
1537 let (input_ranges, state) = state.split_at(input_ranges_len);
1538 // Every range should be of the form A-B, where A<=B.
1539 for pair in input_ranges.chunks(2) {
1540 let (start, end) = (pair[0], pair[1]);
1541 if start > end {
1542 return Err(DeserializeError::generic("invalid input range"));
1543 }
1544 }
1545
1546 // And now extract the corresponding sequence of state IDs. We leave
1547 // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
1548 // not have any alignment requirements.
1549 let next_len = ntrans
1550 .checked_mul(self.id_len())
1551 .expect("state size * #trans should always fit in a usize");
1552 bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
1553 let (next, state) = state.split_at(next_len);
1554 // We can at least verify that every state ID is in bounds.
1555 for idbytes in next.chunks(self.id_len()) {
1556 let (id, _) =
1557 bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
1558 bytes::check_slice_len(
1559 self.sparse(),
1560 id.as_usize(),
1561 "invalid sparse state ID",
1562 )?;
1563 }
1564
1565 // If this is a match state, then read the pattern IDs for this state.
        // The pattern IDs are encoded as a u32 count followed by that many
        // native endian encoded 32-bit integers.
1568 let (pattern_ids, state) = if is_match {
1569 let (npats, nr) =
1570 bytes::try_read_u32_as_usize(state, "pattern ID count")?;
1571 let state = &state[nr..];
1572
1573 let pattern_ids_len =
1574 bytes::mul(npats, 4, "sparse pattern ID byte length")?;
1575 bytes::check_slice_len(
1576 state,
1577 pattern_ids_len,
1578 "sparse pattern IDs",
1579 )?;
1580 let (pattern_ids, state) = state.split_at(pattern_ids_len);
1581 for patbytes in pattern_ids.chunks(PatternID::SIZE) {
1582 bytes::read_pattern_id(
1583 patbytes,
1584 "sparse pattern ID in try_state",
1585 )?;
1586 }
1587 (pattern_ids, state)
1588 } else {
1589 (&[][..], state)
1590 };
1591
1592 // Now read this state's accelerator info. The first byte is the length
1593 // of the accelerator, which is typically 0 (for no acceleration) but
1594 // is no bigger than 3. The length indicates the number of bytes that
1595 // follow, where each byte corresponds to a transition out of this
1596 // state.
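        // As an illustrative example, a state accelerated on exactly the
        // bytes b'a' and b'Z' would be encoded here as [2, b'a', b'Z'].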
1597 if state.is_empty() {
1598 return Err(DeserializeError::generic("no accelerator length"));
1599 }
1600 let (accel_len, state) = (state[0] as usize, &state[1..]);
1601
1602 if accel_len > 3 {
1603 return Err(DeserializeError::generic(
1604 "sparse invalid accelerator length",
1605 ));
1606 }
1607 bytes::check_slice_len(
1608 state,
1609 accel_len,
1610 "sparse corrupt accelerator length",
1611 )?;
1612 let (accel, _) = (&state[..accel_len], &state[accel_len..]);
1613
1614 Ok(State {
1615 id,
1616 is_match,
1617 ntrans,
1618 input_ranges,
1619 next,
1620 pattern_ids,
1621 accel,
1622 })
1623 }
1624
1625 /// Return an iterator over all of the states in this DFA.
1626 ///
1627 /// The iterator returned yields tuples, where the first element is the
1628 /// state ID and the second element is the state itself.
1629 fn states(&self) -> StateIter<'_, T> {
1630 StateIter { trans: self, id: DEAD.as_usize() }
1631 }
1632
1633 /// Returns the sparse transitions as raw bytes.
1634 fn sparse(&self) -> &[u8] {
1635 self.sparse.as_ref()
1636 }
1637
1638 /// Returns the number of bytes represented by a single state ID.
1639 fn id_len(&self) -> usize {
1640 StateID::SIZE
1641 }
1642
1643 /// Return the memory usage, in bytes, of these transitions.
1644 ///
1645 /// This does not include the size of a `Transitions` value itself.
1646 fn memory_usage(&self) -> usize {
1647 self.sparse().len()
1648 }
1649}
1650
1651#[cfg(feature = "alloc")]
1652impl<T: AsMut<[u8]>> Transitions<T> {
1653 /// Return a convenient mutable representation of the given state.
1654 /// This panics if the state is invalid.
1655 fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
1656 let mut state = &mut self.sparse_mut()[id.as_usize()..];
1657 let mut ntrans = bytes::read_u16(&state) as usize;
1658 let is_match = (1 << 15) & ntrans != 0;
1659 ntrans &= !(1 << 15);
1660 state = &mut state[2..];
1661
1662 let (input_ranges, state) = state.split_at_mut(ntrans * 2);
1663 let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
1664 let (pattern_ids, state) = if is_match {
1665 let npats = bytes::read_u32(&state) as usize;
1666 state[4..].split_at_mut(npats * 4)
1667 } else {
1668 (&mut [][..], state)
1669 };
1670
1671 let accel_len = state[0] as usize;
1672 let accel = &mut state[1..accel_len + 1];
1673 StateMut {
1674 id,
1675 is_match,
1676 ntrans,
1677 input_ranges,
1678 next,
1679 pattern_ids,
1680 accel,
1681 }
1682 }
1683
1684 /// Returns the sparse transitions as raw mutable bytes.
1685 fn sparse_mut(&mut self) -> &mut [u8] {
1686 self.sparse.as_mut()
1687 }
1688}
1689
1690/// The set of all possible starting states in a DFA.
1691///
1692/// See the eponymous type in the `dense` module for more details. This type
1693/// is very similar to `dense::StartTable`, except that its underlying
/// representation is `&[u8]` instead of `&[u32]`. (The latter would require
1695/// sparse DFAs to be aligned, which is explicitly something we do not require
1696/// because we don't really need it.)
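///
/// As a rough sketch (illustrative only), for a DFA built with start states
/// for each of `N` patterns, the underlying table is laid out as:
///
/// ```text
/// [whole-DFA starts: stride IDs][pattern 0: stride IDs]...[pattern N-1: stride IDs]
/// ```
///
/// where `stride` is currently always 4 (one entry per `Start` type) and
/// each ID occupies `StateID::SIZE` bytes in native endian format.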
1697#[derive(Clone)]
1698struct StartTable<T> {
    /// The initial start state IDs as a contiguous table of native endian
    /// encoded integers, where each entry is a `StateID`.
    ///
    /// In practice, `T` is either `Vec<u8>` or `&[u8]` and has no alignment
    /// requirements.
1704 ///
1705 /// The first `stride` (currently always 4) entries always correspond to
1706 /// the start states for the entire DFA. After that, there are
1707 /// `stride * patterns` state IDs, where `patterns` may be zero in the
1708 /// case of a DFA with no patterns or in the case where the DFA was built
1709 /// without enabling starting states for each pattern.
1710 table: T,
1711 /// The number of starting state IDs per pattern.
1712 stride: usize,
1713 /// The total number of patterns for which starting states are encoded.
1714 /// This may be zero for non-empty DFAs when the DFA was built without
1715 /// start states for each pattern.
1716 patterns: usize,
1717}
1718
1719#[cfg(feature = "alloc")]
1720impl StartTable<Vec<u8>> {
1721 fn new(patterns: usize) -> StartTable<Vec<u8>> {
1722 let stride = Start::count();
1723 // This is OK since the only way we're here is if a dense DFA could be
1724 // constructed successfully, which uses the same space.
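        // As an illustrative example (assuming the current stride of 4 and
        // 4-byte state IDs), a table for 2 patterns allocates
        // (4 * 2 + 4) * 4 = 48 zeroed bytes.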
1725 let len = stride
1726 .checked_mul(patterns)
1727 .unwrap()
1728 .checked_add(stride)
1729 .unwrap()
1730 .checked_mul(StateID::SIZE)
1731 .unwrap();
1732 StartTable { table: vec![0; len], stride, patterns }
1733 }
1734
1735 fn from_dense_dfa<T: AsRef<[u32]>>(
1736 dfa: &dense::DFA<T>,
1737 remap: &[StateID],
1738 ) -> Result<StartTable<Vec<u8>>, Error> {
1739 // Unless the DFA has start states compiled for each pattern, then
1740 // as far as the starting state table is concerned, there are zero
1741 // patterns to account for. It will instead only store starting states
1742 // for the entire DFA.
1743 let start_pattern_count = if dfa.has_starts_for_each_pattern() {
1744 dfa.pattern_count()
1745 } else {
1746 0
1747 };
1748 let mut sl = StartTable::new(start_pattern_count);
1749 for (old_start_id, sty, pid) in dfa.starts() {
1750 let new_start_id = remap[dfa.to_index(old_start_id)];
1751 sl.set_start(sty, pid, new_start_id);
1752 }
1753 Ok(sl)
1754 }
1755}
1756
1757impl<'a> StartTable<&'a [u8]> {
1758 unsafe fn from_bytes_unchecked(
1759 mut slice: &'a [u8],
1760 ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
1761 let slice_start = slice.as_ptr() as usize;
1762
1763 let (stride, nr) =
1764 bytes::try_read_u32_as_usize(slice, "sparse start table stride")?;
1765 slice = &slice[nr..];
1766
1767 let (patterns, nr) = bytes::try_read_u32_as_usize(
1768 slice,
1769 "sparse start table patterns",
1770 )?;
1771 slice = &slice[nr..];
1772
1773 if stride != Start::count() {
1774 return Err(DeserializeError::generic(
1775 "invalid sparse starting table stride",
1776 ));
1777 }
1778 if patterns > PatternID::LIMIT {
1779 return Err(DeserializeError::generic(
1780 "sparse invalid number of patterns",
1781 ));
1782 }
1783 let pattern_table_size =
1784 bytes::mul(stride, patterns, "sparse invalid pattern count")?;
1785 // Our start states always start with a single stride of start states
1786 // for the entire automaton which permit it to match any pattern. What
        // follows is an optional set of start states for each pattern.
1788 let start_state_count = bytes::add(
1789 stride,
1790 pattern_table_size,
1791 "sparse invalid 'any' pattern starts size",
1792 )?;
1793 let table_bytes_len = bytes::mul(
1794 start_state_count,
1795 StateID::SIZE,
1796 "sparse pattern table bytes length",
1797 )?;
1798 bytes::check_slice_len(
1799 slice,
1800 table_bytes_len,
1801 "sparse start ID table",
1802 )?;
1803 let table_bytes = &slice[..table_bytes_len];
1804 slice = &slice[table_bytes_len..];
1805
1806 let sl = StartTable { table: table_bytes, stride, patterns };
1807 Ok((sl, slice.as_ptr() as usize - slice_start))
1808 }
1809}
1810
1811impl<T: AsRef<[u8]>> StartTable<T> {
1812 fn write_to<E: Endian>(
1813 &self,
1814 mut dst: &mut [u8],
1815 ) -> Result<usize, SerializeError> {
1816 let nwrite = self.write_to_len();
1817 if dst.len() < nwrite {
1818 return Err(SerializeError::buffer_too_small(
1819 "sparse starting table ids",
1820 ));
1821 }
1822 dst = &mut dst[..nwrite];
1823
1824 // write stride
1825 E::write_u32(u32::try_from(self.stride).unwrap(), dst);
1826 dst = &mut dst[size_of::<u32>()..];
1827 // write pattern count
1828 E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
1829 dst = &mut dst[size_of::<u32>()..];
1830 // write start IDs
1831 dst.copy_from_slice(self.table());
1832 Ok(nwrite)
1833 }
1834
1835 /// Returns the number of bytes the serialized form of this transition
1836 /// table will use.
1837 fn write_to_len(&self) -> usize {
1838 size_of::<u32>() // stride
1839 + size_of::<u32>() // # patterns
1840 + self.table().len()
1841 }
1842
1843 /// Validates that every starting state ID in this table is valid.
1844 ///
1845 /// That is, every starting state ID can be used to correctly decode a
1846 /// state in the DFA's sparse transitions.
1847 fn validate(
1848 &self,
1849 trans: &Transitions<T>,
1850 ) -> Result<(), DeserializeError> {
1851 for (id, _, _) in self.iter() {
1852 let _ = trans.try_state(id)?;
1853 }
1854 Ok(())
1855 }
1856
1857 /// Converts this start list to a borrowed value.
1858 fn as_ref(&self) -> StartTable<&'_ [u8]> {
1859 StartTable {
1860 table: self.table(),
1861 stride: self.stride,
1862 patterns: self.patterns,
1863 }
1864 }
1865
1866 /// Converts this start list to an owned value.
1867 #[cfg(feature = "alloc")]
1868 fn to_owned(&self) -> StartTable<Vec<u8>> {
1869 StartTable {
1870 table: self.table().to_vec(),
1871 stride: self.stride,
1872 patterns: self.patterns,
1873 }
1874 }
1875
1876 /// Return the start state for the given index and pattern ID. If the
1877 /// pattern ID is None, then the corresponding start state for the entire
1878 /// DFA is returned. If the pattern ID is not None, then the corresponding
1879 /// starting state for the given pattern is returned. If this start table
1880 /// does not have individual starting states for each pattern, then this
1881 /// panics.
1882 fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
1883 let start_index = index.as_usize();
1884 let index = match pattern_id {
1885 None => start_index,
1886 Some(pid) => {
1887 let pid = pid.as_usize();
1888 assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
1889 self.stride
1890 .checked_mul(pid)
1891 .unwrap()
1892 .checked_add(self.stride)
1893 .unwrap()
1894 .checked_add(start_index)
1895 .unwrap()
1896 }
1897 };
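        // As an illustrative example: with the current stride of 4, the
        // `Start` type at index 1 for pattern 2 maps to entry
        // 4 * 2 + 4 + 1 = 13, i.e., byte offset 13 * StateID::SIZE = 52.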
1898 let start = index * StateID::SIZE;
        // This is OK since we're allowed to assume that the start table contains
1900 // valid StateIDs.
1901 bytes::read_state_id_unchecked(&self.table()[start..]).0
1902 }
1903
1904 /// Return an iterator over all start IDs in this table.
1905 fn iter(&self) -> StartStateIter<'_, T> {
1906 StartStateIter { st: self, i: 0 }
1907 }
1908
1909 /// Returns the total number of start state IDs in this table.
1910 fn len(&self) -> usize {
1911 self.table().len() / StateID::SIZE
1912 }
1913
1914 /// Returns the table as a raw slice of bytes.
1915 fn table(&self) -> &[u8] {
1916 self.table.as_ref()
1917 }
1918
1919 /// Return the memory usage, in bytes, of this start list.
1920 ///
1921 /// This does not include the size of a `StartTable` value itself.
1922 fn memory_usage(&self) -> usize {
1923 self.table().len()
1924 }
1925}
1926
1927#[cfg(feature = "alloc")]
1928impl<T: AsMut<[u8]>> StartTable<T> {
1929 /// Set the start state for the given index and pattern.
1930 ///
1931 /// If the pattern ID or state ID are not valid, then this will panic.
1932 fn set_start(
1933 &mut self,
1934 index: Start,
1935 pattern_id: Option<PatternID>,
1936 id: StateID,
1937 ) {
1938 let start_index = index.as_usize();
1939 let index = match pattern_id {
1940 None => start_index,
1941 Some(pid) => {
1942 let pid = pid.as_usize();
1943 assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
1944 self.stride
1945 .checked_mul(pid)
1946 .unwrap()
1947 .checked_add(self.stride)
1948 .unwrap()
1949 .checked_add(start_index)
1950 .unwrap()
1951 }
1952 };
1953 let start = index * StateID::SIZE;
1954 let end = start + StateID::SIZE;
1955 bytes::write_state_id::<bytes::NE>(
1956 id,
1957 &mut self.table.as_mut()[start..end],
1958 );
1959 }
1960}
1961
/// An iterator over all start state IDs in a sparse DFA.
1963struct StartStateIter<'a, T> {
1964 st: &'a StartTable<T>,
1965 i: usize,
1966}
1967
1968impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
1969 type Item = (StateID, Start, Option<PatternID>);
1970
1971 fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
1972 let i = self.i;
1973 if i >= self.st.len() {
1974 return None;
1975 }
1976 self.i += 1;
1977
1978 // This unwrap is okay since the stride of any DFA must always match
1979 // the number of start state types.
1980 let start_type = Start::from_usize(i % self.st.stride).unwrap();
1981 let pid = if i < self.st.stride {
1982 // This means we don't have start states for each pattern.
1983 None
1984 } else {
1985 // These unwraps are OK since we may assume our table and stride
1986 // is correct.
1987 let pid = i
1988 .checked_sub(self.st.stride)
1989 .unwrap()
1990 .checked_div(self.st.stride)
1991 .unwrap();
1992 Some(PatternID::new(pid).unwrap())
1993 };
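        // As an illustrative example (again with a stride of 4): i == 6
        // yields start_type == Start::from_usize(2) and
        // pid == (6 - 4) / 4 == 0, i.e., the third start state for pattern 0.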
1994 let start = i * StateID::SIZE;
1995 let end = start + StateID::SIZE;
1996 let bytes = self.st.table()[start..end].try_into().unwrap();
1997 // This is OK since we're allowed to assume that any IDs in this start
1998 // table are correct and valid for this DFA.
1999 let id = StateID::from_ne_bytes_unchecked(bytes);
2000 Some((id, start_type, pid))
2001 }
2002}
2003
2004impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
2005 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("StartStateIter").field("i", &self.i).finish()
2007 }
2008}
2009
2010/// An iterator over all states in a sparse DFA.
2011///
2012/// This iterator yields tuples, where the first element is the state ID and
2013/// the second element is the state itself.
2014struct StateIter<'a, T> {
2015 trans: &'a Transitions<T>,
2016 id: usize,
2017}
2018
2019impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
2020 type Item = State<'a>;
2021
2022 fn next(&mut self) -> Option<State<'a>> {
2023 if self.id >= self.trans.sparse().len() {
2024 return None;
2025 }
        let state = self.trans.state(StateID::new_unchecked(self.id));
2027 self.id = self.id + state.bytes_len();
2028 Some(state)
2029 }
2030}
2031
2032impl<'a, T> fmt::Debug for StateIter<'a, T> {
2033 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("StateIter").field("id", &self.id).finish()
2035 }
2036}
2037
2038/// A representation of a sparse DFA state that can be cheaply materialized
2039/// from a state identifier.
2040#[derive(Clone)]
2041struct State<'a> {
2042 /// The identifier of this state.
2043 id: StateID,
2044 /// Whether this is a match state or not.
2045 is_match: bool,
2046 /// The number of transitions in this state.
2047 ntrans: usize,
2048 /// Pairs of input ranges, where there is one pair for each transition.
2049 /// Each pair specifies an inclusive start and end byte range for the
2050 /// corresponding transition.
2051 input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, represented by `StateID`. Thus, there are
    /// `ntrans * StateID::SIZE` bytes in this slice.
2055 next: &'a [u8],
2056 /// If this is a match state, then this contains the pattern IDs that match
2057 /// when the DFA is in this state.
2058 ///
2059 /// This is a contiguous sequence of 32-bit native endian encoded integers.
2060 pattern_ids: &'a [u8],
2061 /// An accelerator for this state, if present. If this state has no
2062 /// accelerator, then this is an empty slice. When non-empty, this slice
2063 /// has length at most 3 and corresponds to the exhaustive set of bytes
2064 /// that must be seen in order to transition out of this state.
2065 accel: &'a [u8],
2066}
2067
2068impl<'a> State<'a> {
2069 /// Searches for the next transition given an input byte. If no such
2070 /// transition could be found, then a dead state is returned.
2071 ///
2072 /// This is marked as inline to help dramatically boost sparse searching,
2073 /// which decodes each state it enters to follow the next transition.
2074 #[inline(always)]
2075 fn next(&self, input: u8) -> StateID {
2076 // This straight linear search was observed to be much better than
2077 // binary search on ASCII haystacks, likely because a binary search
2078 // visits the ASCII case last but a linear search sees it first. A
2079 // binary search does do a little better on non-ASCII haystacks, but
2080 // not by much. There might be a better trade off lurking here.
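        //
        // Note that the last transition (at index `ntrans - 1`) is reserved
        // for the special EOI sentinel (see `next_eoi`), which is why this
        // loop stops short of it.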
2081 for i in 0..(self.ntrans - 1) {
2082 let (start, end) = self.range(i);
2083 if start <= input && input <= end {
2084 return self.next_at(i);
2085 }
            // We could bail early with an extra branch: if `input < start`, then
2087 // we know we'll never find a matching transition. Interestingly,
2088 // this extra branch seems to not help performance, or will even
2089 // hurt it. It's likely very dependent on the DFA itself and what
2090 // is being searched.
2091 }
2092 DEAD
2093 }
2094
2095 /// Returns the next state ID for the special EOI transition.
2096 fn next_eoi(&self) -> StateID {
2097 self.next_at(self.ntrans - 1)
2098 }
2099
2100 /// Returns the identifier for this state.
2101 fn id(&self) -> StateID {
2102 self.id
2103 }
2104
2105 /// Returns the inclusive input byte range for the ith transition in this
2106 /// state.
2107 fn range(&self, i: usize) -> (u8, u8) {
2108 (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
2109 }
2110
2111 /// Returns the next state for the ith transition in this state.
2112 fn next_at(&self, i: usize) -> StateID {
2113 let start = i * StateID::SIZE;
2114 let end = start + StateID::SIZE;
2115 let bytes = self.next[start..end].try_into().unwrap();
2116 StateID::from_ne_bytes_unchecked(bytes)
2117 }
2118
2119 /// Returns the pattern ID for the given match index. If the match index
2120 /// is invalid, then this panics.
2121 fn pattern_id(&self, match_index: usize) -> PatternID {
2122 let start = match_index * PatternID::SIZE;
2123 bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
2124 }
2125
2126 /// Returns the total number of pattern IDs for this state. This is always
2127 /// zero when `is_match` is false.
2128 fn pattern_count(&self) -> usize {
2129 assert_eq!(0, self.pattern_ids.len() % 4);
2130 self.pattern_ids.len() / 4
2131 }
2132
2133 /// Return the total number of bytes that this state consumes in its
2134 /// encoded form.
2135 fn bytes_len(&self) -> usize {
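        // As an illustrative example, a non-match state with 3 transitions
        // and no accelerator occupies
        // 2 + (3 * 2) + (3 * StateID::SIZE) + (1 + 0) = 21 bytes
        // (with 4-byte state IDs).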
2136 let mut len = 2
2137 + (self.ntrans * 2)
2138 + (self.ntrans * StateID::SIZE)
2139 + (1 + self.accel.len());
2140 if self.is_match {
2141 len += size_of::<u32>() + self.pattern_ids.len();
2142 }
2143 len
2144 }
2145
2146 /// Return an accelerator for this state.
2147 fn accelerator(&self) -> &'a [u8] {
2148 self.accel
2149 }
2150}
2151
2152impl<'a> fmt::Debug for State<'a> {
2153 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2154 let mut printed = false;
2155 for i in 0..(self.ntrans - 1) {
2156 let next = self.next_at(i);
2157 if next == DEAD {
2158 continue;
2159 }
2160
2161 if printed {
2162 write!(f, ", ")?;
2163 }
2164 let (start, end) = self.range(i);
2165 if start == end {
2166 write!(f, "{:?} => {:?}", DebugByte(start), next)?;
2167 } else {
2168 write!(
2169 f,
2170 "{:?}-{:?} => {:?}",
2171 DebugByte(start),
2172 DebugByte(end),
2173 next,
2174 )?;
2175 }
2176 printed = true;
2177 }
2178 let eoi = self.next_at(self.ntrans - 1);
2179 if eoi != DEAD {
2180 if printed {
2181 write!(f, ", ")?;
2182 }
2183 write!(f, "EOI => {:?}", eoi)?;
2184 }
2185 Ok(())
2186 }
2187}
2188
2189/// A representation of a mutable sparse DFA state that can be cheaply
2190/// materialized from a state identifier.
2191#[cfg(feature = "alloc")]
2192struct StateMut<'a> {
2193 /// The identifier of this state.
2194 id: StateID,
2195 /// Whether this is a match state or not.
2196 is_match: bool,
2197 /// The number of transitions in this state.
2198 ntrans: usize,
2199 /// Pairs of input ranges, where there is one pair for each transition.
2200 /// Each pair specifies an inclusive start and end byte range for the
2201 /// corresponding transition.
2202 input_ranges: &'a mut [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers, represented by `StateID`. Thus, there are
    /// `ntrans * StateID::SIZE` bytes in this slice.
2206 next: &'a mut [u8],
2207 /// If this is a match state, then this contains the pattern IDs that match
2208 /// when the DFA is in this state.
2209 ///
2210 /// This is a contiguous sequence of 32-bit native endian encoded integers.
2211 pattern_ids: &'a [u8],
2212 /// An accelerator for this state, if present. If this state has no
2213 /// accelerator, then this is an empty slice. When non-empty, this slice
2214 /// has length at most 3 and corresponds to the exhaustive set of bytes
2215 /// that must be seen in order to transition out of this state.
2216 accel: &'a mut [u8],
2217}
2218
2219#[cfg(feature = "alloc")]
2220impl<'a> StateMut<'a> {
2221 /// Sets the ith transition to the given state.
2222 fn set_next_at(&mut self, i: usize, next: StateID) {
2223 let start = i * StateID::SIZE;
2224 let end = start + StateID::SIZE;
2225 bytes::write_state_id::<bytes::NE>(next, &mut self.next[start..end]);
2226 }
2227}
2228
2229#[cfg(feature = "alloc")]
2230impl<'a> fmt::Debug for StateMut<'a> {
2231 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2232 let state = State {
2233 id: self.id,
2234 is_match: self.is_match,
2235 ntrans: self.ntrans,
2236 input_ranges: self.input_ranges,
2237 next: self.next,
2238 pattern_ids: self.pattern_ids,
2239 accel: self.accel,
2240 };
2241 fmt::Debug::fmt(&state, f)
2242 }
2243}
2244
/// A binary search routine specialized to a sparse DFA state's transitions.
/// The transitions are defined as a set of pairs
2247/// of input bytes that delineate an inclusive range of bytes. If the input
2248/// byte is in the range, then the corresponding transition is a match.
2249///
2250/// This binary search accepts a slice of these pairs and returns the position
2251/// of the matching pair (the ith transition), or None if no matching pair
2252/// could be found.
2253///
/// Note that this routine is not currently used, since it was observed to
/// either decrease performance when searching ASCII haystacks, or to not
/// provide enough of a boost on non-ASCII haystacks to be worth it. However,
/// we leave it here for posterity in case we can find a way to use it.
2258///
2259/// In theory, we could use the standard library's search routine if we could
2260/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
2261/// guaranteed to be safe and is thus UB (since I don't think the in-memory
2262/// representation of `(u8, u8)` has been nailed down). One could define a
2263/// repr(C) type, but the casting doesn't seem justified.
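///
/// As a hypothetical example, `binary_search_ranges(&[b'a', b'c', b'x', b'z'], b'y')`
/// returns `Some(1)`, since `y` falls inside the second range `x-z`, while a
/// needle of `b'q'` returns `None`.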
2264#[allow(dead_code)]
2265#[inline(always)]
2266fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
2267 debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
2268 debug_assert!(ranges.len() <= 512, "ranges should be short");
2269
    let (mut left, mut right) = (0, ranges.len() / 2);
    while left < right {
        let mid = (left + right) / 2;
        let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
2274 if needle < b1 {
2275 right = mid;
2276 } else if needle > b2 {
2277 left = mid + 1;
2278 } else {
2279 return Some(mid);
2280 }
2281 }
2282 None
2283}
2284