1/*!
2Types and routines specific to dense DFAs.
3
4This module is the home of [`dense::DFA`](DFA).
5
6This module also contains a [`dense::Builder`](Builder) and a
7[`dense::Config`](Config) for configuring and building a dense DFA.
8*/
9
10#[cfg(feature = "alloc")]
11use core::cmp;
12use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
13
14#[cfg(feature = "alloc")]
15use alloc::{
16 collections::{BTreeMap, BTreeSet},
17 vec,
18 vec::Vec,
19};
20
21#[cfg(feature = "alloc")]
22use crate::{
23 dfa::{
24 accel::Accel, determinize, error::Error, minimize::Minimizer, sparse,
25 },
26 nfa::thompson,
27 util::alphabet::ByteSet,
28 MatchKind,
29};
30use crate::{
31 dfa::{
32 accel::Accels,
33 automaton::{fmt_state_indicator, Automaton},
34 special::Special,
35 DEAD,
36 },
37 util::{
38 alphabet::{self, ByteClasses},
39 bytes::{self, DeserializeError, Endian, SerializeError},
40 id::{PatternID, StateID},
41 start::Start,
42 },
43};
44
45/// The label that is pre-pended to a serialized DFA.
46const LABEL: &str = "rust-regex-automata-dfa-dense";
47
48/// The format version of dense regexes. This version gets incremented when a
49/// change occurs. A change may not necessarily be a breaking change, but the
50/// version does permit good error messages in the case where a breaking change
51/// is made.
52const VERSION: u32 = 2;
53
54/// The configuration used for compiling a dense DFA.
55///
56/// A dense DFA configuration is a simple data object that is typically used
57/// with [`dense::Builder::configure`](self::Builder::configure).
58///
59/// The default configuration guarantees that a search will _never_ return a
60/// [`MatchError`](crate::MatchError) for any haystack or pattern. Setting a
61/// quit byte with [`Config::quit`] or enabling heuristic support for Unicode
62/// word boundaries with [`Config::unicode_word_boundary`] can in turn cause a
63/// search to return an error. See the corresponding configuration options for
64/// more details on when those error conditions arise.
65#[cfg(feature = "alloc")]
66#[derive(Clone, Copy, Debug, Default)]
67pub struct Config {
68 // As with other configuration types in this crate, we put all our knobs
69 // in options so that we can distinguish between "default" and "not set."
70 // This makes it possible to easily combine multiple configurations
71 // without default values overwriting explicitly specified values. See the
72 // 'overwrite' method.
73 //
74 // For docs on the fields below, see the corresponding method setters.
75 anchored: Option<bool>,
76 accelerate: Option<bool>,
77 minimize: Option<bool>,
78 match_kind: Option<MatchKind>,
79 starts_for_each_pattern: Option<bool>,
80 byte_classes: Option<bool>,
81 unicode_word_boundary: Option<bool>,
82 quit: Option<ByteSet>,
83 dfa_size_limit: Option<Option<usize>>,
84 determinize_size_limit: Option<Option<usize>>,
85}
86
87#[cfg(feature = "alloc")]
88impl Config {
89 /// Return a new default dense DFA compiler configuration.
90 pub fn new() -> Config {
91 Config::default()
92 }
93
94 /// Set whether matching must be anchored at the beginning of the input.
95 ///
96 /// When enabled, a match must begin at the start of a search. When
97 /// disabled, the DFA will act as if the pattern started with a `(?s:.)*?`,
98 /// which enables a match to appear anywhere.
99 ///
100 /// Note that if you want to run both anchored and unanchored
101 /// searches without building multiple automatons, you can enable the
102 /// [`Config::starts_for_each_pattern`] configuration instead. This will
103 /// permit unanchored any-pattern searches and pattern-specific anchored
104 /// searches. See the documentation for that configuration for an example.
105 ///
106 /// By default this is disabled.
107 ///
108 /// **WARNING:** this is subtly different than using a `^` at the start of
109 /// your regex. A `^` forces a regex to match exclusively at the start of
110 /// input, regardless of where you begin your search. In contrast, enabling
111 /// this option will allow your regex to match anywhere in your input,
112 /// but the match must start at the beginning of a search. (Most of the
113 /// higher level convenience search routines make "start of input" and
114 /// "start of search" equivalent, but some routines allow treating these as
115 /// orthogonal.)
116 ///
117 /// For example, consider the haystack `aba` and the following searches:
118 ///
119 /// 1. The regex `^a` is compiled with `anchored=false` and searches
120 /// `aba` starting at position `2`. Since `^` requires the match to
121 /// start at the beginning of the input and `2 > 0`, no match is found.
122 /// 2. The regex `a` is compiled with `anchored=true` and searches `aba`
123 /// starting at position `2`. This reports a match at `[2, 3]` since
124 /// the match starts where the search started. Since there is no `^`,
125 /// there is no requirement for the match to start at the beginning of
126 /// the input.
127 /// 3. The regex `a` is compiled with `anchored=true` and searches `aba`
128 /// starting at position `1`. Since `b` corresponds to position `1` and
129 /// since the regex is anchored, it finds no match.
130 /// 4. The regex `a` is compiled with `anchored=false` and searches `aba`
131 /// startting at position `1`. Since the regex is neither anchored nor
132 /// starts with `^`, the regex is compiled with an implicit `(?s:.)*?`
133 /// prefix that permits it to match anywhere. Thus, it reports a match
134 /// at `[2, 3]`.
135 ///
136 /// # Example
137 ///
138 /// This demonstrates the differences between an anchored search and
139 /// a pattern that begins with `^` (as described in the above warning
140 /// message).
141 ///
142 /// ```
143 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
144 ///
145 /// let haystack = "aba".as_bytes();
146 ///
147 /// let dfa = dense::Builder::new()
148 /// .configure(dense::Config::new().anchored(false)) // default
149 /// .build(r"^a")?;
150 /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
151 /// // No match is found because 2 is not the beginning of the haystack,
152 /// // which is what ^ requires.
153 /// let expected = None;
154 /// assert_eq!(expected, got);
155 ///
156 /// let dfa = dense::Builder::new()
157 /// .configure(dense::Config::new().anchored(true))
158 /// .build(r"a")?;
159 /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 2, 3)?;
160 /// // An anchored search can still match anywhere in the haystack, it just
161 /// // must begin at the start of the search which is '2' in this case.
162 /// let expected = Some(HalfMatch::must(0, 3));
163 /// assert_eq!(expected, got);
164 ///
165 /// let dfa = dense::Builder::new()
166 /// .configure(dense::Config::new().anchored(true))
167 /// .build(r"a")?;
168 /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
169 /// // No match is found since we start searching at offset 1 which
170 /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
171 /// // is found.
172 /// let expected = None;
173 /// assert_eq!(expected, got);
174 ///
175 /// let dfa = dense::Builder::new()
176 /// .configure(dense::Config::new().anchored(false)) // default
177 /// .build(r"a")?;
178 /// let got = dfa.find_leftmost_fwd_at(None, None, haystack, 1, 3)?;
179 /// // Since anchored=false, an implicit '(?s:.)*?' prefix was added to the
180 /// // pattern. Even though the search starts at 'b', the 'match anything'
181 /// // prefix allows the search to match 'a'.
182 /// let expected = Some(HalfMatch::must(0, 3));
183 /// assert_eq!(expected, got);
184 ///
185 /// # Ok::<(), Box<dyn std::error::Error>>(())
186 /// ```
187 pub fn anchored(mut self, yes: bool) -> Config {
188 self.anchored = Some(yes);
189 self
190 }
191
192 /// Enable state acceleration.
193 ///
194 /// When enabled, DFA construction will analyze each state to determine
195 /// whether it is eligible for simple acceleration. Acceleration typically
196 /// occurs when most of a state's transitions loop back to itself, leaving
197 /// only a select few bytes that will exit the state. When this occurs,
198 /// other routines like `memchr` can be used to look for those bytes which
199 /// may be much faster than traversing the DFA.
200 ///
201 /// Callers may elect to disable this if consistent performance is more
202 /// desirable than variable performance. Namely, acceleration can sometimes
203 /// make searching slower than it otherwise would be if the transitions
204 /// that leave accelerated states are traversed frequently.
205 ///
206 /// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for
207 /// an example.
208 ///
209 /// This is enabled by default.
210 pub fn accelerate(mut self, yes: bool) -> Config {
211 self.accelerate = Some(yes);
212 self
213 }
214
215 /// Minimize the DFA.
216 ///
217 /// When enabled, the DFA built will be minimized such that it is as small
218 /// as possible.
219 ///
220 /// Whether one enables minimization or not depends on the types of costs
221 /// you're willing to pay and how much you care about its benefits. In
222 /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
223 /// space, where `n` is the number of DFA states and `k` is the alphabet
224 /// size. In practice, minimization can be quite costly in terms of both
225 /// space and time, so it should only be done if you're willing to wait
226 /// longer to produce a DFA. In general, you might want a minimal DFA in
227 /// the following circumstances:
228 ///
229 /// 1. You would like to optimize for the size of the automaton. This can
230 /// manifest in one of two ways. Firstly, if you're converting the
231 /// DFA into Rust code (or a table embedded in the code), then a minimal
232 /// DFA will translate into a corresponding reduction in code size, and
233 /// thus, also the final compiled binary size. Secondly, if you are
234 /// building many DFAs and putting them on the heap, you'll be able to
235 /// fit more if they are smaller. Note though that building a minimal
236 /// DFA itself requires additional space; you only realize the space
237 /// savings once the minimal DFA is constructed (at which point, the
238 /// space used for minimization is freed).
239 /// 2. You've observed that a smaller DFA results in faster match
240 /// performance. Naively, this isn't guaranteed since there is no
241 /// inherent difference between matching with a bigger-than-minimal
242 /// DFA and a minimal DFA. However, a smaller DFA may make use of your
243 /// CPU's cache more efficiently.
244 /// 3. You are trying to establish an equivalence between regular
245 /// languages. The standard method for this is to build a minimal DFA
246 /// for each language and then compare them. If the DFAs are equivalent
247 /// (up to state renaming), then the languages are equivalent.
248 ///
249 /// Typically, minimization only makes sense as an offline process. That
250 /// is, one might minimize a DFA before serializing it to persistent
251 /// storage. In practical terms, minimization can take around an order of
252 /// magnitude more time than compiling the initial DFA via determinization.
253 ///
254 /// This option is disabled by default.
255 pub fn minimize(mut self, yes: bool) -> Config {
256 self.minimize = Some(yes);
257 self
258 }
259
260 /// Set the desired match semantics.
261 ///
262 /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
263 /// match semantics of Perl-like regex engines. That is, when multiple
264 /// patterns would match at the same leftmost position, the pattern that
265 /// appears first in the concrete syntax is chosen.
266 ///
267 /// Currently, the only other kind of match semantics supported is
268 /// [`MatchKind::All`]. This corresponds to classical DFA construction
269 /// where all possible matches are added to the DFA.
270 ///
271 /// Typically, `All` is used when one wants to execute an overlapping
272 /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
273 /// sense to use `All` with the various "leftmost" find routines, since the
274 /// leftmost routines depend on the `LeftmostFirst` automata construction
275 /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA
276 /// as a way to terminate the search and report a match. `LeftmostFirst`
277 /// also supports non-greedy matches using this strategy where as `All`
278 /// does not.
279 ///
280 /// # Example: overlapping search
281 ///
282 /// This example shows the typical use of `MatchKind::All`, which is to
283 /// report overlapping matches.
284 ///
285 /// ```
286 /// use regex_automata::{
287 /// dfa::{Automaton, OverlappingState, dense},
288 /// HalfMatch, MatchKind,
289 /// };
290 ///
291 /// let dfa = dense::Builder::new()
292 /// .configure(dense::Config::new().match_kind(MatchKind::All))
293 /// .build_many(&[r"\w+$", r"\S+$"])?;
294 /// let haystack = "@foo".as_bytes();
295 /// let mut state = OverlappingState::start();
296 ///
297 /// let expected = Some(HalfMatch::must(1, 4));
298 /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
299 /// assert_eq!(expected, got);
300 ///
301 /// // The first pattern also matches at the same position, so re-running
302 /// // the search will yield another match. Notice also that the first
303 /// // pattern is returned after the second. This is because the second
304 /// // pattern begins its match before the first, is therefore an earlier
305 /// // match and is thus reported first.
306 /// let expected = Some(HalfMatch::must(0, 4));
307 /// let got = dfa.find_overlapping_fwd(haystack, &mut state)?;
308 /// assert_eq!(expected, got);
309 ///
310 /// # Ok::<(), Box<dyn std::error::Error>>(())
311 /// ```
312 ///
313 /// # Example: reverse automaton to find start of match
314 ///
315 /// Another example for using `MatchKind::All` is for constructing a
316 /// reverse automaton to find the start of a match. `All` semantics are
317 /// used for this in order to find the longest possible match, which
318 /// corresponds to the leftmost starting position.
319 ///
320 /// Note that if you need the starting position then
321 /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for
322 /// you, so it's usually not necessary to do this yourself.
323 ///
324 /// ```
325 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, MatchKind};
326 ///
327 /// let haystack = "123foobar456".as_bytes();
328 /// let pattern = r"[a-z]+";
329 ///
330 /// let dfa_fwd = dense::DFA::new(pattern)?;
331 /// let dfa_rev = dense::Builder::new()
332 /// .configure(dense::Config::new()
333 /// .anchored(true)
334 /// .match_kind(MatchKind::All)
335 /// )
336 /// .build(pattern)?;
337 /// let expected_fwd = HalfMatch::must(0, 9);
338 /// let expected_rev = HalfMatch::must(0, 3);
339 /// let got_fwd = dfa_fwd.find_leftmost_fwd(haystack)?.unwrap();
340 /// // Here we don't specify the pattern to search for since there's only
341 /// // one pattern and we're doing a leftmost search. But if this were an
342 /// // overlapping search, you'd need to specify the pattern that matched
343 /// // in the forward direction. (Otherwise, you might wind up finding the
344 /// // starting position of a match of some other pattern.) That in turn
345 /// // requires building the reverse automaton with starts_for_each_pattern
346 /// // enabled. Indeed, this is what Regex does internally.
347 /// let got_rev = dfa_rev.find_leftmost_rev_at(
348 /// None, haystack, 0, got_fwd.offset(),
349 /// )?.unwrap();
350 /// assert_eq!(expected_fwd, got_fwd);
351 /// assert_eq!(expected_rev, got_rev);
352 ///
353 /// # Ok::<(), Box<dyn std::error::Error>>(())
354 /// ```
355 pub fn match_kind(mut self, kind: MatchKind) -> Config {
356 self.match_kind = Some(kind);
357 self
358 }
359
360 /// Whether to compile a separate start state for each pattern in the
361 /// automaton.
362 ///
363 /// When enabled, a separate **anchored** start state is added for each
364 /// pattern in the DFA. When this start state is used, then the DFA will
365 /// only search for matches for the pattern specified, even if there are
366 /// other patterns in the DFA.
367 ///
368 /// The main downside of this option is that it can potentially increase
369 /// the size of the DFA and/or increase the time it takes to build the DFA.
370 ///
371 /// There are a few reasons one might want to enable this (it's disabled
372 /// by default):
373 ///
374 /// 1. When looking for the start of an overlapping match (using a
375 /// reverse DFA), doing it correctly requires starting the reverse search
376 /// using the starting state of the pattern that matched in the forward
377 /// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex),
378 /// it will automatically enable this option when building the reverse DFA
379 /// internally.
380 /// 2. When you want to use a DFA with multiple patterns to both search
381 /// for matches of any pattern or to search for anchored matches of one
382 /// particular pattern while using the same DFA. (Otherwise, you would need
383 /// to compile a new DFA for each pattern.)
384 /// 3. Since the start states added for each pattern are anchored, if you
385 /// compile an unanchored DFA with one pattern while also enabling this
386 /// option, then you can use the same DFA to perform anchored or unanchored
387 /// searches. The latter you get with the standard search APIs. The former
388 /// you get from the various `_at` search methods that allow you specify a
389 /// pattern ID to search for.
390 ///
391 /// By default this is disabled.
392 ///
393 /// # Example
394 ///
395 /// This example shows how to use this option to permit the same DFA to
396 /// run both anchored and unanchored searches for a single pattern.
397 ///
398 /// ```
399 /// use regex_automata::{
400 /// dfa::{Automaton, dense},
401 /// HalfMatch, PatternID,
402 /// };
403 ///
404 /// let dfa = dense::Builder::new()
405 /// .configure(dense::Config::new().starts_for_each_pattern(true))
406 /// .build(r"foo[0-9]+")?;
407 /// let haystack = b"quux foo123";
408 ///
409 /// // Here's a normal unanchored search. Notice that we use 'None' for the
410 /// // pattern ID. Since the DFA was built as an unanchored machine, it
411 /// // use its default unanchored starting state.
412 /// let expected = HalfMatch::must(0, 11);
413 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
414 /// None, None, haystack, 0, haystack.len(),
415 /// )?);
416 /// // But now if we explicitly specify the pattern to search ('0' being
417 /// // the only pattern in the DFA), then it will use the starting state
418 /// // for that specific pattern which is always anchored. Since the
419 /// // pattern doesn't have a match at the beginning of the haystack, we
420 /// // find nothing.
421 /// assert_eq!(None, dfa.find_leftmost_fwd_at(
422 /// None, Some(PatternID::must(0)), haystack, 0, haystack.len(),
423 /// )?);
424 /// // And finally, an anchored search is not the same as putting a '^' at
425 /// // beginning of the pattern. An anchored search can only match at the
426 /// // beginning of the *search*, which we can change:
427 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd_at(
428 /// None, Some(PatternID::must(0)), haystack, 5, haystack.len(),
429 /// )?);
430 ///
431 /// # Ok::<(), Box<dyn std::error::Error>>(())
432 /// ```
433 pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
434 self.starts_for_each_pattern = Some(yes);
435 self
436 }
437
438 /// Whether to attempt to shrink the size of the DFA's alphabet or not.
439 ///
440 /// This option is enabled by default and should never be disabled unless
441 /// one is debugging a generated DFA.
442 ///
443 /// When enabled, the DFA will use a map from all possible bytes to their
444 /// corresponding equivalence class. Each equivalence class represents a
445 /// set of bytes that does not discriminate between a match and a non-match
446 /// in the DFA. For example, the pattern `[ab]+` has at least two
447 /// equivalence classes: a set containing `a` and `b` and a set containing
448 /// every byte except for `a` and `b`. `a` and `b` are in the same
449 /// equivalence classes because they never discriminate between a match
450 /// and a non-match.
451 ///
452 /// The advantage of this map is that the size of the transition table
453 /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to
454 /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence
455 /// classes (rounded up to the nearest power of 2). As a result, total
456 /// space usage can decrease substantially. Moreover, since a smaller
457 /// alphabet is used, DFA compilation becomes faster as well.
458 ///
459 /// **WARNING:** This is only useful for debugging DFAs. Disabling this
460 /// does not yield any speed advantages. Namely, even when this is
461 /// disabled, a byte class map is still used while searching. The only
462 /// difference is that every byte will be forced into its own distinct
463 /// equivalence class. This is useful for debugging the actual generated
464 /// transitions because it lets one see the transitions defined on actual
465 /// bytes instead of the equivalence classes.
466 pub fn byte_classes(mut self, yes: bool) -> Config {
467 self.byte_classes = Some(yes);
468 self
469 }
470
471 /// Heuristically enable Unicode word boundaries.
472 ///
473 /// When set, this will attempt to implement Unicode word boundaries as if
474 /// they were ASCII word boundaries. This only works when the search input
475 /// is ASCII only. If a non-ASCII byte is observed while searching, then a
476 /// [`MatchError::Quit`](crate::MatchError::Quit) error is returned.
477 ///
478 /// A possible alternative to enabling this option is to simply use an
479 /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
480 /// option is if you absolutely need Unicode support. This option lets one
481 /// use a fast search implementation (a DFA) for some potentially very
482 /// common cases, while providing the option to fall back to some other
483 /// regex engine to handle the general case when an error is returned.
484 ///
485 /// If the pattern provided has no Unicode word boundary in it, then this
486 /// option has no effect. (That is, quitting on a non-ASCII byte only
487 /// occurs when this option is enabled _and_ a Unicode word boundary is
488 /// present in the pattern.)
489 ///
490 /// This is almost equivalent to setting all non-ASCII bytes to be quit
491 /// bytes. The only difference is that this will cause non-ASCII bytes to
492 /// be quit bytes _only_ when a Unicode word boundary is present in the
493 /// pattern.
494 ///
495 /// When enabling this option, callers _must_ be prepared to handle
496 /// a [`MatchError`](crate::MatchError) error during search.
497 /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
498 /// to using the `try_` suite of methods. Alternatively, if
499 /// callers can guarantee that their input is ASCII only, then a
500 /// [`MatchError::Quit`](crate::MatchError::Quit) error will never be
501 /// returned while searching.
502 ///
503 /// This is disabled by default.
504 ///
505 /// # Example
506 ///
507 /// This example shows how to heuristically enable Unicode word boundaries
508 /// in a pattern. It also shows what happens when a search comes across a
509 /// non-ASCII byte.
510 ///
511 /// ```
512 /// use regex_automata::{
513 /// dfa::{Automaton, dense},
514 /// HalfMatch, MatchError, MatchKind,
515 /// };
516 ///
517 /// let dfa = dense::Builder::new()
518 /// .configure(dense::Config::new().unicode_word_boundary(true))
519 /// .build(r"\b[0-9]+\b")?;
520 ///
521 /// // The match occurs before the search ever observes the snowman
522 /// // character, so no error occurs.
523 /// let haystack = "foo 123 ☃".as_bytes();
524 /// let expected = Some(HalfMatch::must(0, 7));
525 /// let got = dfa.find_leftmost_fwd(haystack)?;
526 /// assert_eq!(expected, got);
527 ///
528 /// // Notice that this search fails, even though the snowman character
529 /// // occurs after the ending match offset. This is because search
530 /// // routines read one byte past the end of the search to account for
531 /// // look-around, and indeed, this is required here to determine whether
532 /// // the trailing \b matches.
533 /// let haystack = "foo 123☃".as_bytes();
534 /// let expected = MatchError::Quit { byte: 0xE2, offset: 7 };
535 /// let got = dfa.find_leftmost_fwd(haystack);
536 /// assert_eq!(Err(expected), got);
537 ///
538 /// # Ok::<(), Box<dyn std::error::Error>>(())
539 /// ```
540 pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
541 // We have a separate option for this instead of just setting the
542 // appropriate quit bytes here because we don't want to set quit bytes
543 // for every regex. We only want to set them when the regex contains a
544 // Unicode word boundary.
545 self.unicode_word_boundary = Some(yes);
546 self
547 }
548
549 /// Add a "quit" byte to the DFA.
550 ///
551 /// When a quit byte is seen during search time, then search will return
552 /// a [`MatchError::Quit`](crate::MatchError::Quit) error indicating the
553 /// offset at which the search stopped.
554 ///
555 /// A quit byte will always overrule any other aspects of a regex. For
556 /// example, if the `x` byte is added as a quit byte and the regex `\w` is
557 /// used, then observing `x` will cause the search to quit immediately
558 /// despite the fact that `x` is in the `\w` class.
559 ///
560 /// This mechanism is primarily useful for heuristically enabling certain
561 /// features like Unicode word boundaries in a DFA. Namely, if the input
562 /// to search is ASCII, then a Unicode word boundary can be implemented
563 /// via an ASCII word boundary with no change in semantics. Thus, a DFA
564 /// can attempt to match a Unicode word boundary but give up as soon as it
565 /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
566 /// to be quit bytes, then Unicode word boundaries will be permitted when
567 /// building DFAs. Of course, callers should enable
568 /// [`Config::unicode_word_boundary`] if they want this behavior instead.
569 /// (The advantage being that non-ASCII quit bytes will only be added if a
570 /// Unicode word boundary is in the pattern.)
571 ///
572 /// When enabling this option, callers _must_ be prepared to handle a
573 /// [`MatchError`](crate::MatchError) error during search. When using a
574 /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the
575 /// `try_` suite of methods.
576 ///
577 /// By default, there are no quit bytes set.
578 ///
579 /// # Panics
580 ///
581 /// This panics if heuristic Unicode word boundaries are enabled and any
582 /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
583 /// Unicode word boundaries requires setting every non-ASCII byte to a quit
584 /// byte. So if the caller attempts to undo any of that, then this will
585 /// panic.
586 ///
587 /// # Example
588 ///
589 /// This example shows how to cause a search to terminate if it sees a
590 /// `\n` byte. This could be useful if, for example, you wanted to prevent
591 /// a user supplied pattern from matching across a line boundary.
592 ///
593 /// ```
594 /// use regex_automata::{
595 /// dfa::{Automaton, dense},
596 /// HalfMatch, MatchError,
597 /// };
598 ///
599 /// let dfa = dense::Builder::new()
600 /// .configure(dense::Config::new().quit(b'\n', true))
601 /// .build(r"foo\p{any}+bar")?;
602 ///
603 /// let haystack = "foo\nbar".as_bytes();
604 /// // Normally this would produce a match, since \p{any} contains '\n'.
605 /// // But since we instructed the automaton to enter a quit state if a
606 /// // '\n' is observed, this produces a match error instead.
607 /// let expected = MatchError::Quit { byte: 0x0A, offset: 3 };
608 /// let got = dfa.find_leftmost_fwd(haystack).unwrap_err();
609 /// assert_eq!(expected, got);
610 ///
611 /// # Ok::<(), Box<dyn std::error::Error>>(())
612 /// ```
613 pub fn quit(mut self, byte: u8, yes: bool) -> Config {
614 if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
615 panic!(
616 "cannot set non-ASCII byte to be non-quit when \
617 Unicode word boundaries are enabled"
618 );
619 }
620 if self.quit.is_none() {
621 self.quit = Some(ByteSet::empty());
622 }
623 if yes {
624 self.quit.as_mut().unwrap().add(byte);
625 } else {
626 self.quit.as_mut().unwrap().remove(byte);
627 }
628 self
629 }
630
631 /// Set a size limit on the total heap used by a DFA.
632 ///
633 /// This size limit is expressed in bytes and is applied during
634 /// determinization of an NFA into a DFA. If the DFA's heap usage, and only
635 /// the DFA, exceeds this configured limit, then determinization is stopped
636 /// and an error is returned.
637 ///
638 /// This limit does not apply to auxiliary storage used during
639 /// determinization that isn't part of the generated DFA.
640 ///
641 /// This limit is only applied during determinization. Currently, there is
642 /// no way to post-pone this check to after minimization if minimization
643 /// was enabled.
644 ///
645 /// The total limit on heap used during determinization is the sum of the
646 /// DFA and determinization size limits.
647 ///
648 /// The default is no limit.
649 ///
650 /// # Example
651 ///
652 /// This example shows a DFA that fails to build because of a configured
653 /// size limit. This particular example also serves as a cautionary tale
654 /// demonstrating just how big DFAs with large Unicode character classes
655 /// can get.
656 ///
657 /// ```
658 /// use regex_automata::dfa::{dense, Automaton};
659 ///
660 /// // 3MB isn't enough!
661 /// dense::Builder::new()
662 /// .configure(dense::Config::new().dfa_size_limit(Some(3_000_000)))
663 /// .build(r"\w{20}")
664 /// .unwrap_err();
665 ///
666 /// // ... but 4MB probably is!
667 /// // (Note that DFA sizes aren't necessarily stable between releases.)
668 /// let dfa = dense::Builder::new()
669 /// .configure(dense::Config::new().dfa_size_limit(Some(4_000_000)))
670 /// .build(r"\w{20}")?;
671 /// let haystack = "A".repeat(20).into_bytes();
672 /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
673 ///
674 /// # Ok::<(), Box<dyn std::error::Error>>(())
675 /// ```
676 ///
677 /// While one needs a little more than 3MB to represent `\w{20}`, it
678 /// turns out that you only need a little more than 4KB to represent
679 /// `(?-u:\w{20})`. So only use Unicode if you need it!
680 pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
681 self.dfa_size_limit = Some(bytes);
682 self
683 }
684
685 /// Set a size limit on the total heap used by determinization.
686 ///
687 /// This size limit is expressed in bytes and is applied during
688 /// determinization of an NFA into a DFA. If the heap used for auxiliary
689 /// storage during determinization (memory that is not in the DFA but
690 /// necessary for building the DFA) exceeds this configured limit, then
691 /// determinization is stopped and an error is returned.
692 ///
693 /// This limit does not apply to heap used by the DFA itself.
694 ///
695 /// The total limit on heap used during determinization is the sum of the
696 /// DFA and determinization size limits.
697 ///
698 /// The default is no limit.
699 ///
700 /// # Example
701 ///
702 /// This example shows a DFA that fails to build because of a
703 /// configured size limit on the amount of heap space used by
704 /// determinization. This particular example complements the example for
705 /// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
706 /// potentially make DFAs themselves big, but it also results in more
707 /// auxiliary storage during determinization. (Although, auxiliary storage
708 /// is still not as much as the DFA itself.)
709 ///
710 /// ```
711 /// use regex_automata::dfa::{dense, Automaton};
712 ///
713 /// // 300KB isn't enough!
714 /// dense::Builder::new()
715 /// .configure(dense::Config::new()
716 /// .determinize_size_limit(Some(300_000))
717 /// )
718 /// .build(r"\w{20}")
719 /// .unwrap_err();
720 ///
721 /// // ... but 400KB probably is!
722 /// // (Note that auxiliary storage sizes aren't necessarily stable between
723 /// // releases.)
724 /// let dfa = dense::Builder::new()
725 /// .configure(dense::Config::new()
726 /// .determinize_size_limit(Some(400_000))
727 /// )
728 /// .build(r"\w{20}")?;
729 /// let haystack = "A".repeat(20).into_bytes();
730 /// assert!(dfa.find_leftmost_fwd(&haystack)?.is_some());
731 ///
732 /// # Ok::<(), Box<dyn std::error::Error>>(())
733 /// ```
734 pub fn determinize_size_limit(mut self, bytes: Option<usize>) -> Config {
735 self.determinize_size_limit = Some(bytes);
736 self
737 }
738
739 /// Returns whether this configuration has enabled anchored searches.
740 pub fn get_anchored(&self) -> bool {
741 self.anchored.unwrap_or(false)
742 }
743
744 /// Returns whether this configuration has enabled simple state
745 /// acceleration.
746 pub fn get_accelerate(&self) -> bool {
747 self.accelerate.unwrap_or(true)
748 }
749
750 /// Returns whether this configuration has enabled the expensive process
751 /// of minimizing a DFA.
752 pub fn get_minimize(&self) -> bool {
753 self.minimize.unwrap_or(false)
754 }
755
756 /// Returns the match semantics set in this configuration.
757 pub fn get_match_kind(&self) -> MatchKind {
758 self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
759 }
760
761 /// Returns whether this configuration has enabled anchored starting states
762 /// for every pattern in the DFA.
763 pub fn get_starts_for_each_pattern(&self) -> bool {
764 self.starts_for_each_pattern.unwrap_or(false)
765 }
766
767 /// Returns whether this configuration has enabled byte classes or not.
768 /// This is typically a debugging oriented option, as disabling it confers
769 /// no speed benefit.
770 pub fn get_byte_classes(&self) -> bool {
771 self.byte_classes.unwrap_or(true)
772 }
773
774 /// Returns whether this configuration has enabled heuristic Unicode word
775 /// boundary support. When enabled, it is possible for a search to return
776 /// an error.
777 pub fn get_unicode_word_boundary(&self) -> bool {
778 self.unicode_word_boundary.unwrap_or(false)
779 }
780
781 /// Returns whether this configuration will instruct the DFA to enter a
782 /// quit state whenever the given byte is seen during a search. When at
783 /// least one byte has this enabled, it is possible for a search to return
784 /// an error.
785 pub fn get_quit(&self, byte: u8) -> bool {
786 self.quit.map_or(false, |q| q.contains(byte))
787 }
788
789 /// Returns the DFA size limit of this configuration if one was set.
790 /// The size limit is total number of bytes on the heap that a DFA is
791 /// permitted to use. If the DFA exceeds this limit during construction,
792 /// then construction is stopped and an error is returned.
793 pub fn get_dfa_size_limit(&self) -> Option<usize> {
794 self.dfa_size_limit.unwrap_or(None)
795 }
796
797 /// Returns the determinization size limit of this configuration if one
798 /// was set. The size limit is total number of bytes on the heap that
799 /// determinization is permitted to use. If determinization exceeds this
800 /// limit during construction, then construction is stopped and an error is
801 /// returned.
802 ///
803 /// This is different from the DFA size limit in that this only applies to
804 /// the auxiliary storage used during determinization. Once determinization
805 /// is complete, this memory is freed.
806 ///
807 /// The limit on the total heap memory used is the sum of the DFA and
808 /// determinization size limits.
809 pub fn get_determinize_size_limit(&self) -> Option<usize> {
810 self.determinize_size_limit.unwrap_or(None)
811 }
812
813 /// Overwrite the default configuration such that the options in `o` are
814 /// always used. If an option in `o` is not set, then the corresponding
815 /// option in `self` is used. If it's not set in `self` either, then it
816 /// remains not set.
817 pub(crate) fn overwrite(self, o: Config) -> Config {
818 Config {
819 anchored: o.anchored.or(self.anchored),
820 accelerate: o.accelerate.or(self.accelerate),
821 minimize: o.minimize.or(self.minimize),
822 match_kind: o.match_kind.or(self.match_kind),
823 starts_for_each_pattern: o
824 .starts_for_each_pattern
825 .or(self.starts_for_each_pattern),
826 byte_classes: o.byte_classes.or(self.byte_classes),
827 unicode_word_boundary: o
828 .unicode_word_boundary
829 .or(self.unicode_word_boundary),
830 quit: o.quit.or(self.quit),
831 dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
832 determinize_size_limit: o
833 .determinize_size_limit
834 .or(self.determinize_size_limit),
835 }
836 }
837}
838
839/// A builder for constructing a deterministic finite automaton from regular
840/// expressions.
841///
842/// This builder provides two main things:
843///
844/// 1. It provides a few different `build` routines for actually constructing
845/// a DFA from different kinds of inputs. The most convenient is
846/// [`Builder::build`], which builds a DFA directly from a pattern string. The
847/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
848/// from an NFA.
849/// 2. The builder permits configuring a number of things.
850/// [`Builder::configure`] is used with [`Config`] to configure aspects of
851/// the DFA and the construction process itself. [`Builder::syntax`] and
852/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
853/// construction, respectively. The syntax and thompson configurations only
854/// apply when building from a pattern string.
855///
856/// This builder always constructs a *single* DFA. As such, this builder
857/// can only be used to construct regexes that either detect the presence
858/// of a match or find the end location of a match. A single DFA cannot
859/// produce both the start and end of a match. For that information, use a
860/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
861/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
862/// use a DFA directly is if the end location of a match is enough for your use
863/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
864/// second reverse DFA is needed to find the start of a match.
865///
866/// Note that if one wants to build a sparse DFA, you must first build a dense
867/// DFA and convert that to a sparse DFA. There is no way to build a sparse
868/// DFA without first building a dense DFA.
869///
870/// # Example
871///
872/// This example shows how to build a minimized DFA that completely disables
873/// Unicode. That is:
874///
875/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
876/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
877/// (instead of any UTF-8 encoding of a Unicode scalar value except for
878/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
879/// * The pattern itself is permitted to match invalid UTF-8. For example,
880/// things like `[^a]` that match any byte except for `a` are permitted.
881/// * Unanchored patterns can search through invalid UTF-8. That is, for
882/// unanchored patterns, the implicit prefix is `(?s-u:.)*?` instead of
883/// `(?s:.)*?`.
884///
885/// ```
886/// use regex_automata::{
887/// dfa::{Automaton, dense},
888/// nfa::thompson,
889/// HalfMatch, SyntaxConfig,
890/// };
891///
892/// let dfa = dense::Builder::new()
893/// .configure(dense::Config::new().minimize(false))
894/// .syntax(SyntaxConfig::new().unicode(false).utf8(false))
895/// .thompson(thompson::Config::new().utf8(false))
896/// .build(r"foo[^b]ar.*")?;
897///
898/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
899/// let expected = Some(HalfMatch::must(0, 10));
900/// let got = dfa.find_leftmost_fwd(haystack)?;
901/// assert_eq!(expected, got);
902///
903/// # Ok::<(), Box<dyn std::error::Error>>(())
904/// ```
905#[cfg(feature = "alloc")]
906#[derive(Clone, Debug)]
907pub struct Builder {
908 config: Config,
909 thompson: thompson::Builder,
910}
911
912#[cfg(feature = "alloc")]
913impl Builder {
914 /// Create a new dense DFA builder with the default configuration.
915 pub fn new() -> Builder {
916 Builder {
917 config: Config::default(),
918 thompson: thompson::Builder::new(),
919 }
920 }
921
922 /// Build a DFA from the given pattern.
923 ///
924 /// If there was a problem parsing or compiling the pattern, then an error
925 /// is returned.
926 pub fn build(&self, pattern: &str) -> Result<OwnedDFA, Error> {
927 self.build_many(&[pattern])
928 }
929
930 /// Build a DFA from the given patterns.
931 ///
932 /// When matches are returned, the pattern ID corresponds to the index of
933 /// the pattern in the slice given.
934 pub fn build_many<P: AsRef<str>>(
935 &self,
936 patterns: &[P],
937 ) -> Result<OwnedDFA, Error> {
938 let nfa = self.thompson.build_many(patterns).map_err(Error::nfa)?;
939 self.build_from_nfa(&nfa)
940 }
941
942 /// Build a DFA from the given NFA.
943 ///
944 /// # Example
945 ///
946 /// This example shows how to build a DFA if you already have an NFA in
947 /// hand.
948 ///
949 /// ```
950 /// use regex_automata::{
951 /// dfa::{Automaton, dense},
952 /// nfa::thompson,
953 /// HalfMatch,
954 /// };
955 ///
956 /// let haystack = "foo123bar".as_bytes();
957 ///
958 /// // This shows how to set non-default options for building an NFA.
959 /// let nfa = thompson::Builder::new()
960 /// .configure(thompson::Config::new().shrink(false))
961 /// .build(r"[0-9]+")?;
962 /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
963 /// let expected = Some(HalfMatch::must(0, 6));
964 /// let got = dfa.find_leftmost_fwd(haystack)?;
965 /// assert_eq!(expected, got);
966 ///
967 /// # Ok::<(), Box<dyn std::error::Error>>(())
968 /// ```
969 pub fn build_from_nfa(
970 &self,
971 nfa: &thompson::NFA,
972 ) -> Result<OwnedDFA, Error> {
973 let mut quit = self.config.quit.unwrap_or(ByteSet::empty());
974 if self.config.get_unicode_word_boundary()
975 && nfa.has_word_boundary_unicode()
976 {
977 for b in 0x80..=0xFF {
978 quit.add(b);
979 }
980 }
981 let classes = if !self.config.get_byte_classes() {
982 // DFAs will always use the equivalence class map, but enabling
983 // this option is useful for debugging. Namely, this will cause all
984 // transitions to be defined over their actual bytes instead of an
985 // opaque equivalence class identifier. The former is much easier
986 // to grok as a human.
987 ByteClasses::singletons()
988 } else {
989 let mut set = nfa.byte_class_set().clone();
990 // It is important to distinguish any "quit" bytes from all other
991 // bytes. Otherwise, a non-quit byte may end up in the same class
992 // as a quit byte, and thus cause the DFA stop when it shouldn't.
993 if !quit.is_empty() {
994 set.add_set(&quit);
995 }
996 set.byte_classes()
997 };
998
999 let mut dfa = DFA::initial(
1000 classes,
1001 nfa.pattern_len(),
1002 self.config.get_starts_for_each_pattern(),
1003 )?;
1004 determinize::Config::new()
1005 .anchored(self.config.get_anchored())
1006 .match_kind(self.config.get_match_kind())
1007 .quit(quit)
1008 .dfa_size_limit(self.config.get_dfa_size_limit())
1009 .determinize_size_limit(self.config.get_determinize_size_limit())
1010 .run(nfa, &mut dfa)?;
1011 if self.config.get_minimize() {
1012 dfa.minimize();
1013 }
1014 if self.config.get_accelerate() {
1015 dfa.accelerate();
1016 }
1017 Ok(dfa)
1018 }
1019
1020 /// Apply the given dense DFA configuration options to this builder.
1021 pub fn configure(&mut self, config: Config) -> &mut Builder {
1022 self.config = self.config.overwrite(config);
1023 self
1024 }
1025
1026 /// Set the syntax configuration for this builder using
1027 /// [`SyntaxConfig`](crate::SyntaxConfig).
1028 ///
1029 /// This permits setting things like case insensitivity, Unicode and multi
1030 /// line mode.
1031 ///
1032 /// These settings only apply when constructing a DFA directly from a
1033 /// pattern.
1034 pub fn syntax(
1035 &mut self,
1036 config: crate::util::syntax::SyntaxConfig,
1037 ) -> &mut Builder {
1038 self.thompson.syntax(config);
1039 self
1040 }
1041
1042 /// Set the Thompson NFA configuration for this builder using
1043 /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
1044 ///
1045 /// This permits setting things like whether the DFA should match the regex
1046 /// in reverse or if additional time should be spent shrinking the size of
1047 /// the NFA.
1048 ///
1049 /// These settings only apply when constructing a DFA directly from a
1050 /// pattern.
1051 pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
1052 self.thompson.configure(config);
1053 self
1054 }
1055}
1056
1057#[cfg(feature = "alloc")]
1058impl Default for Builder {
1059 fn default() -> Builder {
1060 Builder::new()
1061 }
1062}
1063
1064/// A convenience alias for an owned DFA. We use this particular instantiation
1065/// a lot in this crate, so it's worth giving it a name. This instantiation
1066/// is commonly used for mutable APIs on the DFA while building it. The main
1067/// reason for making DFAs generic is no_std support, and more generally,
1068/// making it possible to load a DFA from an arbitrary slice of bytes.
1069#[cfg(feature = "alloc")]
1070pub(crate) type OwnedDFA = DFA<Vec<u32>>;
1071
1072/// A dense table-based deterministic finite automaton (DFA).
1073///
1074/// All dense DFAs have one or more start states, zero or more match states
1075/// and a transition table that maps the current state and the current byte
1076/// of input to the next state. A DFA can use this information to implement
1077/// fast searching. In particular, the use of a dense DFA generally makes the
1078/// trade off that match speed is the most valuable characteristic, even if
1079/// building the DFA may take significant time *and* space. (More concretely,
1080/// building a DFA takes time and space that is exponential in the size of the
1081/// pattern in the worst case.) As such, the processing of every byte of input
1082/// is done with a small constant number of operations that does not vary with
1083/// the pattern, its size or the size of the alphabet. If your needs don't line
1084/// up with this trade off, then a dense DFA may not be an adequate solution to
1085/// your problem.
1086///
1087/// In contrast, a [`sparse::DFA`] makes the opposite
1088/// trade off: it uses less space but will execute a variable number of
1089/// instructions per byte at match time, which makes it slower for matching.
1090/// (Note that space usage is still exponential in the size of the pattern in
1091/// the worst case.)
1092///
1093/// A DFA can be built using the default configuration via the
1094/// [`DFA::new`] constructor. Otherwise, one can
1095/// configure various aspects via [`dense::Builder`](Builder).
1096///
1097/// A single DFA fundamentally supports the following operations:
1098///
1099/// 1. Detection of a match.
1100/// 2. Location of the end of a match.
1101/// 3. In the case of a DFA with multiple patterns, which pattern matched is
1102/// reported as well.
1103///
1104/// A notable absence from the above list of capabilities is the location of
1105/// the *start* of a match. In order to provide both the start and end of
1106/// a match, *two* DFAs are required. This functionality is provided by a
1107/// [`Regex`](crate::dfa::regex::Regex).
1108///
1109/// # Type parameters
1110///
1111/// A `DFA` has one type parameter, `T`, which is used to represent state IDs,
1112/// pattern IDs and accelerators. `T` is typically a `Vec<u32>` or a `&[u32]`.
1113///
1114/// # The `Automaton` trait
1115///
1116/// This type implements the [`Automaton`] trait, which means it can be used
1117/// for searching. For example:
1118///
1119/// ```
1120/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1121///
1122/// let dfa = DFA::new("foo[0-9]+")?;
1123/// let expected = HalfMatch::must(0, 8);
1124/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1125/// # Ok::<(), Box<dyn std::error::Error>>(())
1126/// ```
1127#[derive(Clone)]
1128pub struct DFA<T> {
1129 /// The transition table for this DFA. This includes the transitions
1130 /// themselves, along with the stride, number of states and the equivalence
1131 /// class mapping.
1132 tt: TransitionTable<T>,
1133 /// The set of starting state identifiers for this DFA. The starting state
1134 /// IDs act as pointers into the transition table. The specific starting
1135 /// state chosen for each search is dependent on the context at which the
1136 /// search begins.
1137 st: StartTable<T>,
1138 /// The set of match states and the patterns that match for each
1139 /// corresponding match state.
1140 ///
1141 /// This structure is technically only needed because of support for
1142 /// multi-regexes. Namely, multi-regexes require answering not just whether
1143 /// a match exists, but _which_ patterns match. So we need to store the
1144 /// matching pattern IDs for each match state. We do this even when there
1145 /// is only one pattern for the sake of simplicity. In practice, this uses
1146 /// up very little space for the case of on pattern.
1147 ms: MatchStates<T>,
1148 /// Information about which states are "special." Special states are states
1149 /// that are dead, quit, matching, starting or accelerated. For more info,
1150 /// see the docs for `Special`.
1151 special: Special,
1152 /// The accelerators for this DFA.
1153 ///
1154 /// If a state is accelerated, then there exist only a small number of
1155 /// bytes that can cause the DFA to leave the state. This permits searching
1156 /// to use optimized routines to find those specific bytes instead of using
1157 /// the transition table.
1158 ///
1159 /// All accelerated states exist in a contiguous range in the DFA's
1160 /// transition table. See dfa/special.rs for more details on how states are
1161 /// arranged.
1162 accels: Accels<T>,
1163}
1164
1165#[cfg(feature = "alloc")]
1166impl OwnedDFA {
1167 /// Parse the given regular expression using a default configuration and
1168 /// return the corresponding DFA.
1169 ///
1170 /// If you want a non-default configuration, then use the
1171 /// [`dense::Builder`](Builder) to set your own configuration.
1172 ///
1173 /// # Example
1174 ///
1175 /// ```
1176 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
1177 ///
1178 /// let dfa = dense::DFA::new("foo[0-9]+bar")?;
1179 /// let expected = HalfMatch::must(0, 11);
1180 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
1181 /// # Ok::<(), Box<dyn std::error::Error>>(())
1182 /// ```
1183 pub fn new(pattern: &str) -> Result<OwnedDFA, Error> {
1184 Builder::new().build(pattern)
1185 }
1186
1187 /// Parse the given regular expressions using a default configuration and
1188 /// return the corresponding multi-DFA.
1189 ///
1190 /// If you want a non-default configuration, then use the
1191 /// [`dense::Builder`](Builder) to set your own configuration.
1192 ///
1193 /// # Example
1194 ///
1195 /// ```
1196 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
1197 ///
1198 /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
1199 /// let expected = HalfMatch::must(1, 3);
1200 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
1201 /// # Ok::<(), Box<dyn std::error::Error>>(())
1202 /// ```
1203 pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<OwnedDFA, Error> {
1204 Builder::new().build_many(patterns)
1205 }
1206}
1207
1208#[cfg(feature = "alloc")]
1209impl OwnedDFA {
1210 /// Create a new DFA that matches every input.
1211 ///
1212 /// # Example
1213 ///
1214 /// ```
1215 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
1216 ///
1217 /// let dfa = dense::DFA::always_match()?;
1218 ///
1219 /// let expected = HalfMatch::must(0, 0);
1220 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
1221 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
1222 /// # Ok::<(), Box<dyn std::error::Error>>(())
1223 /// ```
1224 pub fn always_match() -> Result<OwnedDFA, Error> {
1225 let nfa = thompson::NFA::always_match();
1226 Builder::new().build_from_nfa(&nfa)
1227 }
1228
1229 /// Create a new DFA that never matches any input.
1230 ///
1231 /// # Example
1232 ///
1233 /// ```
1234 /// use regex_automata::dfa::{Automaton, dense};
1235 ///
1236 /// let dfa = dense::DFA::never_match()?;
1237 /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
1238 /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
1239 /// # Ok::<(), Box<dyn std::error::Error>>(())
1240 /// ```
1241 pub fn never_match() -> Result<OwnedDFA, Error> {
1242 let nfa = thompson::NFA::never_match();
1243 Builder::new().build_from_nfa(&nfa)
1244 }
1245
1246 /// Create an initial DFA with the given equivalence classes, pattern count
1247 /// and whether anchored starting states are enabled for each pattern. An
1248 /// initial DFA can be further mutated via determinization.
1249 fn initial(
1250 classes: ByteClasses,
1251 pattern_count: usize,
1252 starts_for_each_pattern: bool,
1253 ) -> Result<OwnedDFA, Error> {
1254 let start_pattern_count =
1255 if starts_for_each_pattern { pattern_count } else { 0 };
1256 Ok(DFA {
1257 tt: TransitionTable::minimal(classes),
1258 st: StartTable::dead(start_pattern_count)?,
1259 ms: MatchStates::empty(pattern_count),
1260 special: Special::new(),
1261 accels: Accels::empty(),
1262 })
1263 }
1264}
1265
1266impl<T: AsRef<[u32]>> DFA<T> {
1267 /// Cheaply return a borrowed version of this dense DFA. Specifically,
1268 /// the DFA returned always uses `&[u32]` for its transition table.
1269 pub fn as_ref(&self) -> DFA<&'_ [u32]> {
1270 DFA {
1271 tt: self.tt.as_ref(),
1272 st: self.st.as_ref(),
1273 ms: self.ms.as_ref(),
1274 special: self.special,
1275 accels: self.accels(),
1276 }
1277 }
1278
1279 /// Return an owned version of this sparse DFA. Specifically, the DFA
1280 /// returned always uses `Vec<u32>` for its transition table.
1281 ///
1282 /// Effectively, this returns a dense DFA whose transition table lives on
1283 /// the heap.
1284 #[cfg(feature = "alloc")]
1285 pub fn to_owned(&self) -> OwnedDFA {
1286 DFA {
1287 tt: self.tt.to_owned(),
1288 st: self.st.to_owned(),
1289 ms: self.ms.to_owned(),
1290 special: self.special,
1291 accels: self.accels().to_owned(),
1292 }
1293 }
1294
1295 /// Returns true only if this DFA has starting states for each pattern.
1296 ///
1297 /// When a DFA has starting states for each pattern, then a search with the
1298 /// DFA can be configured to only look for anchored matches of a specific
1299 /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
1300 /// can accept a non-None `pattern_id` if and only if this method returns
1301 /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
1302 ///
1303 /// Note that if the DFA has no patterns, this always returns false.
1304 pub fn has_starts_for_each_pattern(&self) -> bool {
1305 self.st.patterns > 0
1306 }
1307
1308 /// Returns the total number of elements in the alphabet for this DFA.
1309 ///
1310 /// That is, this returns the total number of transitions that each state
1311 /// in this DFA must have. Typically, a normal byte oriented DFA would
1312 /// always have an alphabet size of 256, corresponding to the number of
1313 /// unique values in a single byte. However, this implementation has two
1314 /// peculiarities that impact the alphabet length:
1315 ///
1316 /// * Every state has a special "EOI" transition that is only followed
1317 /// after the end of some haystack is reached. This EOI transition is
1318 /// necessary to account for one byte of look-ahead when implementing
1319 /// things like `\b` and `$`.
1320 /// * Bytes are grouped into equivalence classes such that no two bytes in
1321 /// the same class can distinguish a match from a non-match. For example,
1322 /// in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the
1323 /// same equivalence class. This leads to a massive space savings.
1324 ///
1325 /// Note though that the alphabet length does _not_ necessarily equal the
1326 /// total stride space taken up by a single DFA state in the transition
1327 /// table. Namely, for performance reasons, the stride is always the
1328 /// smallest power of two that is greater than or equal to the alphabet
1329 /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
1330 /// often more useful. The alphabet length is typically useful only for
1331 /// informational purposes.
1332 pub fn alphabet_len(&self) -> usize {
1333 self.tt.alphabet_len()
1334 }
1335
1336 /// Returns the total stride for every state in this DFA, expressed as the
1337 /// exponent of a power of 2. The stride is the amount of space each state
1338 /// takes up in the transition table, expressed as a number of transitions.
1339 /// (Unused transitions map to dead states.)
1340 ///
1341 /// The stride of a DFA is always equivalent to the smallest power of 2
1342 /// that is greater than or equal to the DFA's alphabet length. This
1343 /// definition uses extra space, but permits faster translation between
1344 /// premultiplied state identifiers and contiguous indices (by using shifts
1345 /// instead of relying on integer division).
1346 ///
1347 /// For example, if the DFA's stride is 16 transitions, then its `stride2`
1348 /// is `4` since `2^4 = 16`.
1349 ///
1350 /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
1351 /// while the maximum `stride2` value is `9` (corresponding to a stride of
1352 /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
1353 /// when accounting for the special EOI transition. However, an alphabet
1354 /// length of that size is exceptionally rare since the alphabet is shrunk
1355 /// into equivalence classes.
1356 pub fn stride2(&self) -> usize {
1357 self.tt.stride2
1358 }
1359
1360 /// Returns the total stride for every state in this DFA. This corresponds
1361 /// to the total number of transitions used by each state in this DFA's
1362 /// transition table.
1363 ///
1364 /// Please see [`DFA::stride2`] for more information. In particular, this
1365 /// returns the stride as the number of transitions, where as `stride2`
1366 /// returns it as the exponent of a power of 2.
1367 pub fn stride(&self) -> usize {
1368 self.tt.stride()
1369 }
1370
1371 /// Returns the "universal" start state for this DFA.
1372 ///
1373 /// A universal start state occurs only when all of the starting states
1374 /// for this DFA are precisely the same. This occurs when there are no
1375 /// look-around assertions at the beginning (or end for a reverse DFA) of
1376 /// the pattern.
1377 ///
1378 /// Using this as a starting state for a DFA without a universal starting
1379 /// state has unspecified behavior. This condition is not checked, so the
1380 /// caller must guarantee it themselves.
1381 pub(crate) fn universal_start_state(&self) -> StateID {
1382 // We choose 'NonWordByte' for no particular reason, other than
1383 // the fact that this is the 'main' starting configuration used in
1384 // determinization. But in essence, it doesn't really matter.
1385 //
1386 // Also, we might consider exposing this routine, but it seems
1387 // a little tricky to use correctly. Maybe if we also expose a
1388 // 'has_universal_start_state' method?
1389 self.st.start(Start::NonWordByte, None)
1390 }
1391
1392 /// Returns the memory usage, in bytes, of this DFA.
1393 ///
1394 /// The memory usage is computed based on the number of bytes used to
1395 /// represent this DFA.
1396 ///
1397 /// This does **not** include the stack size used up by this DFA. To
1398 /// compute that, use `std::mem::size_of::<dense::DFA>()`.
1399 pub fn memory_usage(&self) -> usize {
1400 self.tt.memory_usage()
1401 + self.st.memory_usage()
1402 + self.ms.memory_usage()
1403 + self.accels.memory_usage()
1404 }
1405}
1406
1407/// Routines for converting a dense DFA to other representations, such as
1408/// sparse DFAs or raw bytes suitable for persistent storage.
1409impl<T: AsRef<[u32]>> DFA<T> {
1410 /// Convert this dense DFA to a sparse DFA.
1411 ///
1412 /// If a `StateID` is too small to represent all states in the sparse
1413 /// DFA, then this returns an error. In most cases, if a dense DFA is
1414 /// constructable with `StateID` then a sparse DFA will be as well.
1415 /// However, it is not guaranteed.
1416 ///
1417 /// # Example
1418 ///
1419 /// ```
1420 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
1421 ///
1422 /// let dense = dense::DFA::new("foo[0-9]+")?;
1423 /// let sparse = dense.to_sparse()?;
1424 ///
1425 /// let expected = HalfMatch::must(0, 8);
1426 /// assert_eq!(Some(expected), sparse.find_leftmost_fwd(b"foo12345")?);
1427 /// # Ok::<(), Box<dyn std::error::Error>>(())
1428 /// ```
1429 #[cfg(feature = "alloc")]
1430 pub fn to_sparse(&self) -> Result<sparse::DFA<Vec<u8>>, Error> {
1431 sparse::DFA::from_dense(self)
1432 }
1433
1434 /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
1435 /// format. Upon success, the `Vec<u8>` and the initial padding length are
1436 /// returned.
1437 ///
1438 /// The written bytes are guaranteed to be deserialized correctly and
1439 /// without errors in a semver compatible release of this crate by a
1440 /// `DFA`'s deserialization APIs (assuming all other criteria for the
1441 /// deserialization APIs has been satisfied):
1442 ///
1443 /// * [`DFA::from_bytes`]
1444 /// * [`DFA::from_bytes_unchecked`]
1445 ///
1446 /// The padding returned is non-zero if the returned `Vec<u8>` starts at
1447 /// an address that does not have the same alignment as `u32`. The padding
1448 /// corresponds to the number of leading bytes written to the returned
1449 /// `Vec<u8>`.
1450 ///
1451 /// # Example
1452 ///
1453 /// This example shows how to serialize and deserialize a DFA:
1454 ///
1455 /// ```
1456 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1457 ///
1458 /// // Compile our original DFA.
1459 /// let original_dfa = DFA::new("foo[0-9]+")?;
1460 ///
1461 /// // N.B. We use native endianness here to make the example work, but
1462 /// // using to_bytes_little_endian would work on a little endian target.
1463 /// let (buf, _) = original_dfa.to_bytes_native_endian();
1464 /// // Even if buf has initial padding, DFA::from_bytes will automatically
1465 /// // ignore it.
1466 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
1467 ///
1468 /// let expected = HalfMatch::must(0, 8);
1469 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1470 /// # Ok::<(), Box<dyn std::error::Error>>(())
1471 /// ```
1472 #[cfg(feature = "alloc")]
1473 pub fn to_bytes_little_endian(&self) -> (Vec<u8>, usize) {
1474 self.to_bytes::<bytes::LE>()
1475 }
1476
1477 /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
1478 /// format. Upon success, the `Vec<u8>` and the initial padding length are
1479 /// returned.
1480 ///
1481 /// The written bytes are guaranteed to be deserialized correctly and
1482 /// without errors in a semver compatible release of this crate by a
1483 /// `DFA`'s deserialization APIs (assuming all other criteria for the
1484 /// deserialization APIs has been satisfied):
1485 ///
1486 /// * [`DFA::from_bytes`]
1487 /// * [`DFA::from_bytes_unchecked`]
1488 ///
1489 /// The padding returned is non-zero if the returned `Vec<u8>` starts at
1490 /// an address that does not have the same alignment as `u32`. The padding
1491 /// corresponds to the number of leading bytes written to the returned
1492 /// `Vec<u8>`.
1493 ///
1494 /// # Example
1495 ///
1496 /// This example shows how to serialize and deserialize a DFA:
1497 ///
1498 /// ```
1499 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1500 ///
1501 /// // Compile our original DFA.
1502 /// let original_dfa = DFA::new("foo[0-9]+")?;
1503 ///
1504 /// // N.B. We use native endianness here to make the example work, but
1505 /// // using to_bytes_big_endian would work on a big endian target.
1506 /// let (buf, _) = original_dfa.to_bytes_native_endian();
1507 /// // Even if buf has initial padding, DFA::from_bytes will automatically
1508 /// // ignore it.
1509 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
1510 ///
1511 /// let expected = HalfMatch::must(0, 8);
1512 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1513 /// # Ok::<(), Box<dyn std::error::Error>>(())
1514 /// ```
1515 #[cfg(feature = "alloc")]
1516 pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) {
1517 self.to_bytes::<bytes::BE>()
1518 }
1519
1520 /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
1521 /// format. Upon success, the `Vec<u8>` and the initial padding length are
1522 /// returned.
1523 ///
1524 /// The written bytes are guaranteed to be deserialized correctly and
1525 /// without errors in a semver compatible release of this crate by a
1526 /// `DFA`'s deserialization APIs (assuming all other criteria for the
1527 /// deserialization APIs has been satisfied):
1528 ///
1529 /// * [`DFA::from_bytes`]
1530 /// * [`DFA::from_bytes_unchecked`]
1531 ///
1532 /// The padding returned is non-zero if the returned `Vec<u8>` starts at
1533 /// an address that does not have the same alignment as `u32`. The padding
1534 /// corresponds to the number of leading bytes written to the returned
1535 /// `Vec<u8>`.
1536 ///
1537 /// Generally speaking, native endian format should only be used when
1538 /// you know that the target you're compiling the DFA for matches the
1539 /// endianness of the target on which you're compiling DFA. For example,
1540 /// if serialization and deserialization happen in the same process or on
1541 /// the same machine. Otherwise, when serializing a DFA for use in a
1542 /// portable environment, you'll almost certainly want to serialize _both_
1543 /// a little endian and a big endian version and then load the correct one
1544 /// based on the target's configuration.
1545 ///
1546 /// # Example
1547 ///
1548 /// This example shows how to serialize and deserialize a DFA:
1549 ///
1550 /// ```
1551 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1552 ///
1553 /// // Compile our original DFA.
1554 /// let original_dfa = DFA::new("foo[0-9]+")?;
1555 ///
1556 /// let (buf, _) = original_dfa.to_bytes_native_endian();
1557 /// // Even if buf has initial padding, DFA::from_bytes will automatically
1558 /// // ignore it.
1559 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
1560 ///
1561 /// let expected = HalfMatch::must(0, 8);
1562 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1563 /// # Ok::<(), Box<dyn std::error::Error>>(())
1564 /// ```
1565 #[cfg(feature = "alloc")]
1566 pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) {
1567 self.to_bytes::<bytes::NE>()
1568 }
1569
1570 /// The implementation of the public `to_bytes` serialization methods,
1571 /// which is generic over endianness.
1572 #[cfg(feature = "alloc")]
1573 fn to_bytes<E: Endian>(&self) -> (Vec<u8>, usize) {
1574 let len = self.write_to_len();
1575 let (mut buf, padding) = bytes::alloc_aligned_buffer::<u32>(len);
1576 // This should always succeed since the only possible serialization
1577 // error is providing a buffer that's too small, but we've ensured that
1578 // `buf` is big enough here.
1579 self.as_ref().write_to::<E>(&mut buf[padding..]).unwrap();
1580 (buf, padding)
1581 }
1582
1583 /// Serialize this DFA as raw bytes to the given slice, in little endian
1584 /// format. Upon success, the total number of bytes written to `dst` is
1585 /// returned.
1586 ///
1587 /// The written bytes are guaranteed to be deserialized correctly and
1588 /// without errors in a semver compatible release of this crate by a
1589 /// `DFA`'s deserialization APIs (assuming all other criteria for the
1590 /// deserialization APIs has been satisfied):
1591 ///
1592 /// * [`DFA::from_bytes`]
1593 /// * [`DFA::from_bytes_unchecked`]
1594 ///
1595 /// Note that unlike the various `to_byte_*` routines, this does not write
1596 /// any padding. Callers are responsible for handling alignment correctly.
1597 ///
1598 /// # Errors
1599 ///
1600 /// This returns an error if the given destination slice is not big enough
1601 /// to contain the full serialized DFA. If an error occurs, then nothing
1602 /// is written to `dst`.
1603 ///
1604 /// # Example
1605 ///
1606 /// This example shows how to serialize and deserialize a DFA without
1607 /// dynamic memory allocation.
1608 ///
1609 /// ```
1610 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1611 ///
1612 /// // Compile our original DFA.
1613 /// let original_dfa = DFA::new("foo[0-9]+")?;
1614 ///
1615 /// // Create a 4KB buffer on the stack to store our serialized DFA.
1616 /// let mut buf = [0u8; 4 * (1<<10)];
1617 /// // N.B. We use native endianness here to make the example work, but
1618 /// // using write_to_little_endian would work on a little endian target.
1619 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
1620 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
1621 ///
1622 /// let expected = HalfMatch::must(0, 8);
1623 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1624 /// # Ok::<(), Box<dyn std::error::Error>>(())
1625 /// ```
1626 pub fn write_to_little_endian(
1627 &self,
1628 dst: &mut [u8],
1629 ) -> Result<usize, SerializeError> {
1630 self.as_ref().write_to::<bytes::LE>(dst)
1631 }
1632
1633 /// Serialize this DFA as raw bytes to the given slice, in big endian
1634 /// format. Upon success, the total number of bytes written to `dst` is
1635 /// returned.
1636 ///
1637 /// The written bytes are guaranteed to be deserialized correctly and
1638 /// without errors in a semver compatible release of this crate by a
1639 /// `DFA`'s deserialization APIs (assuming all other criteria for the
1640 /// deserialization APIs has been satisfied):
1641 ///
1642 /// * [`DFA::from_bytes`]
1643 /// * [`DFA::from_bytes_unchecked`]
1644 ///
1645 /// Note that unlike the various `to_byte_*` routines, this does not write
1646 /// any padding. Callers are responsible for handling alignment correctly.
1647 ///
1648 /// # Errors
1649 ///
1650 /// This returns an error if the given destination slice is not big enough
1651 /// to contain the full serialized DFA. If an error occurs, then nothing
1652 /// is written to `dst`.
1653 ///
1654 /// # Example
1655 ///
1656 /// This example shows how to serialize and deserialize a DFA without
1657 /// dynamic memory allocation.
1658 ///
1659 /// ```
1660 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1661 ///
1662 /// // Compile our original DFA.
1663 /// let original_dfa = DFA::new("foo[0-9]+")?;
1664 ///
1665 /// // Create a 4KB buffer on the stack to store our serialized DFA.
1666 /// let mut buf = [0u8; 4 * (1<<10)];
1667 /// // N.B. We use native endianness here to make the example work, but
1668 /// // using write_to_big_endian would work on a big endian target.
1669 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
1670 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
1671 ///
1672 /// let expected = HalfMatch::must(0, 8);
1673 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1674 /// # Ok::<(), Box<dyn std::error::Error>>(())
1675 /// ```
1676 pub fn write_to_big_endian(
1677 &self,
1678 dst: &mut [u8],
1679 ) -> Result<usize, SerializeError> {
1680 self.as_ref().write_to::<bytes::BE>(dst)
1681 }
1682
1683 /// Serialize this DFA as raw bytes to the given slice, in native endian
1684 /// format. Upon success, the total number of bytes written to `dst` is
1685 /// returned.
1686 ///
1687 /// The written bytes are guaranteed to be deserialized correctly and
1688 /// without errors in a semver compatible release of this crate by a
1689 /// `DFA`'s deserialization APIs (assuming all other criteria for the
1690 /// deserialization APIs has been satisfied):
1691 ///
1692 /// * [`DFA::from_bytes`]
1693 /// * [`DFA::from_bytes_unchecked`]
1694 ///
1695 /// Generally speaking, native endian format should only be used when
1696 /// you know that the target you're compiling the DFA for matches the
1697 /// endianness of the target on which you're compiling DFA. For example,
1698 /// if serialization and deserialization happen in the same process or on
1699 /// the same machine. Otherwise, when serializing a DFA for use in a
1700 /// portable environment, you'll almost certainly want to serialize _both_
1701 /// a little endian and a big endian version and then load the correct one
1702 /// based on the target's configuration.
1703 ///
1704 /// Note that unlike the various `to_byte_*` routines, this does not write
1705 /// any padding. Callers are responsible for handling alignment correctly.
1706 ///
1707 /// # Errors
1708 ///
1709 /// This returns an error if the given destination slice is not big enough
1710 /// to contain the full serialized DFA. If an error occurs, then nothing
1711 /// is written to `dst`.
1712 ///
1713 /// # Example
1714 ///
1715 /// This example shows how to serialize and deserialize a DFA without
1716 /// dynamic memory allocation.
1717 ///
1718 /// ```
1719 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1720 ///
1721 /// // Compile our original DFA.
1722 /// let original_dfa = DFA::new("foo[0-9]+")?;
1723 ///
1724 /// // Create a 4KB buffer on the stack to store our serialized DFA.
1725 /// let mut buf = [0u8; 4 * (1<<10)];
1726 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
1727 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
1728 ///
1729 /// let expected = HalfMatch::must(0, 8);
1730 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1731 /// # Ok::<(), Box<dyn std::error::Error>>(())
1732 /// ```
1733 pub fn write_to_native_endian(
1734 &self,
1735 dst: &mut [u8],
1736 ) -> Result<usize, SerializeError> {
1737 self.as_ref().write_to::<bytes::NE>(dst)
1738 }
1739
1740 /// Return the total number of bytes required to serialize this DFA.
1741 ///
1742 /// This is useful for determining the size of the buffer required to pass
1743 /// to one of the serialization routines:
1744 ///
1745 /// * [`DFA::write_to_little_endian`]
1746 /// * [`DFA::write_to_big_endian`]
1747 /// * [`DFA::write_to_native_endian`]
1748 ///
1749 /// Passing a buffer smaller than the size returned by this method will
1750 /// result in a serialization error. Serialization routines are guaranteed
1751 /// to succeed when the buffer is big enough.
1752 ///
1753 /// # Example
1754 ///
1755 /// This example shows how to dynamically allocate enough room to serialize
1756 /// a DFA.
1757 ///
1758 /// ```
1759 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1760 ///
1761 /// // Compile our original DFA.
1762 /// let original_dfa = DFA::new("foo[0-9]+")?;
1763 ///
1764 /// let mut buf = vec![0; original_dfa.write_to_len()];
1765 /// let written = original_dfa.write_to_native_endian(&mut buf)?;
1766 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf[..written])?.0;
1767 ///
1768 /// let expected = HalfMatch::must(0, 8);
1769 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1770 /// # Ok::<(), Box<dyn std::error::Error>>(())
1771 /// ```
1772 ///
1773 /// Note that this example isn't actually guaranteed to work! In
1774 /// particular, if `buf` is not aligned to a 4-byte boundary, then the
1775 /// `DFA::from_bytes` call will fail. If you need this to work, then you
1776 /// either need to deal with adding some initial padding yourself, or use
1777 /// one of the `to_bytes` methods, which will do it for you.
1778 pub fn write_to_len(&self) -> usize {
1779 bytes::write_label_len(LABEL)
1780 + bytes::write_endianness_check_len()
1781 + bytes::write_version_len()
1782 + size_of::<u32>() // unused, intended for future flexibility
1783 + self.tt.write_to_len()
1784 + self.st.write_to_len()
1785 + self.ms.write_to_len()
1786 + self.special.write_to_len()
1787 + self.accels.write_to_len()
1788 }
1789}
1790
1791impl<'a> DFA<&'a [u32]> {
1792 /// Safely deserialize a DFA with a specific state identifier
1793 /// representation. Upon success, this returns both the deserialized DFA
1794 /// and the number of bytes read from the given slice. Namely, the contents
1795 /// of the slice beyond the DFA are not read.
1796 ///
1797 /// Deserializing a DFA using this routine will never allocate heap memory.
1798 /// For safety purposes, the DFA's transition table will be verified such
1799 /// that every transition points to a valid state. If this verification is
1800 /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
1801 /// will always execute in constant time.
1802 ///
1803 /// The bytes given must be generated by one of the serialization APIs
1804 /// of a `DFA` using a semver compatible release of this crate. Those
1805 /// include:
1806 ///
1807 /// * [`DFA::to_bytes_little_endian`]
1808 /// * [`DFA::to_bytes_big_endian`]
1809 /// * [`DFA::to_bytes_native_endian`]
1810 /// * [`DFA::write_to_little_endian`]
1811 /// * [`DFA::write_to_big_endian`]
1812 /// * [`DFA::write_to_native_endian`]
1813 ///
1814 /// The `to_bytes` methods allocate and return a `Vec<u8>` for you, along
1815 /// with handling alignment correctly. The `write_to` methods do not
1816 /// allocate and write to an existing slice (which may be on the stack).
1817 /// Since deserialization always uses the native endianness of the target
1818 /// platform, the serialization API you use should match the endianness of
1819 /// the target platform. (It's often a good idea to generate serialized
1820 /// DFAs for both forms of endianness and then load the correct one based
1821 /// on endianness.)
1822 ///
1823 /// # Errors
1824 ///
1825 /// Generally speaking, it's easier to state the conditions in which an
1826 /// error is _not_ returned. All of the following must be true:
1827 ///
1828 /// * The bytes given must be produced by one of the serialization APIs
1829 /// on this DFA, as mentioned above.
1830 /// * The endianness of the target platform matches the endianness used to
1831 /// serialized the provided DFA.
1832 /// * The slice given must have the same alignment as `u32`.
1833 ///
1834 /// If any of the above are not true, then an error will be returned.
1835 ///
1836 /// # Panics
1837 ///
1838 /// This routine will never panic for any input.
1839 ///
1840 /// # Example
1841 ///
1842 /// This example shows how to serialize a DFA to raw bytes, deserialize it
1843 /// and then use it for searching.
1844 ///
1845 /// ```
1846 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1847 ///
1848 /// let initial = DFA::new("foo[0-9]+")?;
1849 /// let (bytes, _) = initial.to_bytes_native_endian();
1850 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0;
1851 ///
1852 /// let expected = HalfMatch::must(0, 8);
1853 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1854 /// # Ok::<(), Box<dyn std::error::Error>>(())
1855 /// ```
1856 ///
1857 /// # Example: dealing with alignment and padding
1858 ///
1859 /// In the above example, we used the `to_bytes_native_endian` method to
1860 /// serialize a DFA, but we ignored part of its return value corresponding
1861 /// to padding added to the beginning of the serialized DFA. This is OK
1862 /// because deserialization will skip this initial padding. What matters
1863 /// is that the address immediately following the padding has an alignment
1864 /// that matches `u32`. That is, the following is an equivalent but
1865 /// alternative way to write the above example:
1866 ///
1867 /// ```
1868 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
1869 ///
1870 /// let initial = DFA::new("foo[0-9]+")?;
1871 /// // Serialization returns the number of leading padding bytes added to
1872 /// // the returned Vec<u8>.
1873 /// let (bytes, pad) = initial.to_bytes_native_endian();
1874 /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0;
1875 ///
1876 /// let expected = HalfMatch::must(0, 8);
1877 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1878 /// # Ok::<(), Box<dyn std::error::Error>>(())
1879 /// ```
1880 ///
1881 /// This padding is necessary because Rust's standard library does
1882 /// not expose any safe and robust way of creating a `Vec<u8>` with a
1883 /// guaranteed alignment other than 1. Now, in practice, the underlying
1884 /// allocator is likely to provide a `Vec<u8>` that meets our alignment
1885 /// requirements, which means `pad` is zero in practice most of the time.
1886 ///
1887 /// The purpose of exposing the padding like this is flexibility for the
1888 /// caller. For example, if one wants to embed a serialized DFA into a
1889 /// compiled program, then it's important to guarantee that it starts at a
1890 /// `u32`-aligned address. The simplest way to do this is to discard the
1891 /// padding bytes and set it up so that the serialized DFA itself begins at
1892 /// a properly aligned address. We can show this in two parts. The first
1893 /// part is serializing the DFA to a file:
1894 ///
1895 /// ```no_run
1896 /// use regex_automata::dfa::{Automaton, dense::DFA};
1897 ///
1898 /// let dfa = DFA::new("foo[0-9]+")?;
1899 ///
1900 /// let (bytes, pad) = dfa.to_bytes_big_endian();
1901 /// // Write the contents of the DFA *without* the initial padding.
1902 /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?;
1903 ///
1904 /// // Do it again, but this time for little endian.
1905 /// let (bytes, pad) = dfa.to_bytes_little_endian();
1906 /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?;
1907 /// # Ok::<(), Box<dyn std::error::Error>>(())
1908 /// ```
1909 ///
1910 /// And now the second part is embedding the DFA into the compiled program
1911 /// and deserializing it at runtime on first use. We use conditional
1912 /// compilation to choose the correct endianness.
1913 ///
1914 /// ```no_run
1915 /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch};
1916 ///
1917 /// type S = u32;
1918 /// type DFA = dense::DFA<&'static [S]>;
1919 ///
1920 /// fn get_foo() -> &'static DFA {
1921 /// use std::cell::Cell;
1922 /// use std::mem::MaybeUninit;
1923 /// use std::sync::Once;
1924 ///
1925 /// // This struct with a generic B is used to permit unsizing
1926 /// // coercions, specifically, where B winds up being a [u8]. We also
1927 /// // need repr(C) to guarantee that _align comes first, which forces
1928 /// // a correct alignment.
1929 /// #[repr(C)]
1930 /// struct Aligned<B: ?Sized> {
1931 /// _align: [S; 0],
1932 /// bytes: B,
1933 /// }
1934 ///
1935 /// # const _: &str = stringify! {
1936 /// // This assignment is made possible (implicitly) via the
1937 /// // CoerceUnsized trait.
1938 /// static ALIGNED: &Aligned<[u8]> = &Aligned {
1939 /// _align: [],
1940 /// #[cfg(target_endian = "big")]
1941 /// bytes: *include_bytes!("foo.bigendian.dfa"),
1942 /// #[cfg(target_endian = "little")]
1943 /// bytes: *include_bytes!("foo.littleendian.dfa"),
1944 /// };
1945 /// # };
1946 /// # static ALIGNED: &Aligned<[u8]> = &Aligned {
1947 /// # _align: [],
1948 /// # bytes: [],
1949 /// # };
1950 ///
1951 /// struct Lazy(Cell<MaybeUninit<DFA>>);
1952 /// // SAFETY: This is safe because DFA impls Sync.
1953 /// unsafe impl Sync for Lazy {}
1954 ///
1955 /// static INIT: Once = Once::new();
1956 /// static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
1957 ///
1958 /// INIT.call_once(|| {
1959 /// let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
1960 /// .expect("serialized DFA should be valid");
1961 /// // SAFETY: This is guaranteed to only execute once, and all
1962 /// // we do with the pointer is write the DFA to it.
1963 /// unsafe {
1964 /// (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
1965 /// }
1966 /// });
1967 /// // SAFETY: DFA is guaranteed to by initialized via INIT and is
1968 /// // stored in static memory.
1969 /// unsafe {
1970 /// let dfa = (*DFA.0.as_ptr()).as_ptr();
1971 /// std::mem::transmute::<*const DFA, &'static DFA>(dfa)
1972 /// }
1973 /// }
1974 ///
1975 /// let dfa = get_foo();
1976 /// let expected = HalfMatch::must(0, 8);
1977 /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
1978 /// ```
1979 ///
1980 /// Alternatively, consider using
1981 /// [`lazy_static`](https://crates.io/crates/lazy_static)
1982 /// or
1983 /// [`once_cell`](https://crates.io/crates/once_cell),
1984 /// which will guarantee safety for you. You will still need to use the
1985 /// `Aligned` trick above to force correct alignment, but this is safe to
1986 /// do and `from_bytes` will return an error if you get it wrong.
1987 pub fn from_bytes(
1988 slice: &'a [u8],
1989 ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
1990 // SAFETY: This is safe because we validate both the transition table,
1991 // start state ID list and the match states below. If either validation
1992 // fails, then we return an error.
1993 let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
1994 dfa.tt.validate()?;
1995 dfa.st.validate(&dfa.tt)?;
1996 dfa.ms.validate(&dfa)?;
1997 dfa.accels.validate()?;
1998 // N.B. dfa.special doesn't have a way to do unchecked deserialization,
1999 // so it has already been validated.
2000 Ok((dfa, nread))
2001 }
2002
2003 /// Deserialize a DFA with a specific state identifier representation in
2004 /// constant time by omitting the verification of the validity of the
2005 /// transition table and other data inside the DFA.
2006 ///
2007 /// This is just like [`DFA::from_bytes`], except it can potentially return
2008 /// a DFA that exhibits undefined behavior if its transition table contains
2009 /// invalid state identifiers.
2010 ///
2011 /// This routine is useful if you need to deserialize a DFA cheaply
2012 /// and cannot afford the transition table validation performed by
2013 /// `from_bytes`.
2014 ///
2015 /// # Example
2016 ///
2017 /// ```
2018 /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch};
2019 ///
2020 /// let initial = DFA::new("foo[0-9]+")?;
2021 /// let (bytes, _) = initial.to_bytes_native_endian();
2022 /// // SAFETY: This is guaranteed to be safe since the bytes given come
2023 /// // directly from a compatible serialization routine.
2024 /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
2025 ///
2026 /// let expected = HalfMatch::must(0, 8);
2027 /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
2028 /// # Ok::<(), Box<dyn std::error::Error>>(())
2029 /// ```
2030 pub unsafe fn from_bytes_unchecked(
2031 slice: &'a [u8],
2032 ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
2033 let mut nr = 0;
2034
2035 nr += bytes::skip_initial_padding(slice);
2036 bytes::check_alignment::<StateID>(&slice[nr..])?;
2037 nr += bytes::read_label(&slice[nr..], LABEL)?;
2038 nr += bytes::read_endianness_check(&slice[nr..])?;
2039 nr += bytes::read_version(&slice[nr..], VERSION)?;
2040
2041 let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
2042 nr += size_of::<u32>();
2043
2044 let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?;
2045 nr += nread;
2046
2047 let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
2048 nr += nread;
2049
2050 let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?;
2051 nr += nread;
2052
2053 let (special, nread) = Special::from_bytes(&slice[nr..])?;
2054 nr += nread;
2055 special.validate_state_count(tt.count(), tt.stride2)?;
2056
2057 let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?;
2058 nr += nread;
2059
2060 Ok((DFA { tt, st, ms, special, accels }, nr))
2061 }
2062
2063 /// The implementation of the public `write_to` serialization methods,
2064 /// which is generic over endianness.
2065 ///
2066 /// This is defined only for &[u32] to reduce binary size/compilation time.
2067 fn write_to<E: Endian>(
2068 &self,
2069 mut dst: &mut [u8],
2070 ) -> Result<usize, SerializeError> {
2071 let nwrite = self.write_to_len();
2072 if dst.len() < nwrite {
2073 return Err(SerializeError::buffer_too_small("dense DFA"));
2074 }
2075 dst = &mut dst[..nwrite];
2076
2077 let mut nw = 0;
2078 nw += bytes::write_label(LABEL, &mut dst[nw..])?;
2079 nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
2080 nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
2081 nw += {
2082 // Currently unused, intended for future flexibility
2083 E::write_u32(0, &mut dst[nw..]);
2084 size_of::<u32>()
2085 };
2086 nw += self.tt.write_to::<E>(&mut dst[nw..])?;
2087 nw += self.st.write_to::<E>(&mut dst[nw..])?;
2088 nw += self.ms.write_to::<E>(&mut dst[nw..])?;
2089 nw += self.special.write_to::<E>(&mut dst[nw..])?;
2090 nw += self.accels.write_to::<E>(&mut dst[nw..])?;
2091 Ok(nw)
2092 }
2093}
2094
2095/// The following methods implement mutable routines on the internal
2096/// representation of a DFA. As such, we must fix the first type parameter to a
2097/// `Vec<u32>` since a generic `T: AsRef<[u32]>` does not permit mutation. We
2098/// can get away with this because these methods are internal to the crate and
2099/// are exclusively used during construction of the DFA.
2100#[cfg(feature = "alloc")]
2101impl OwnedDFA {
2102 /// Add a start state of this DFA.
2103 pub(crate) fn set_start_state(
2104 &mut self,
2105 index: Start,
2106 pattern_id: Option<PatternID>,
2107 id: StateID,
2108 ) {
2109 assert!(self.tt.is_valid(id), "invalid start state");
2110 self.st.set_start(index, pattern_id, id);
2111 }
2112
2113 /// Set the given transition to this DFA. Both the `from` and `to` states
2114 /// must already exist.
2115 pub(crate) fn set_transition(
2116 &mut self,
2117 from: StateID,
2118 byte: alphabet::Unit,
2119 to: StateID,
2120 ) {
2121 self.tt.set(from, byte, to);
2122 }
2123
2124 /// An an empty state (a state where all transitions lead to a dead state)
2125 /// and return its identifier. The identifier returned is guaranteed to
2126 /// not point to any other existing state.
2127 ///
2128 /// If adding a state would exceed `StateID::LIMIT`, then this returns an
2129 /// error.
2130 pub(crate) fn add_empty_state(&mut self) -> Result<StateID, Error> {
2131 self.tt.add_empty_state()
2132 }
2133
2134 /// Swap the two states given in the transition table.
2135 ///
2136 /// This routine does not do anything to check the correctness of this
2137 /// swap. Callers must ensure that other states pointing to id1 and id2 are
2138 /// updated appropriately.
2139 pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
2140 self.tt.swap(id1, id2);
2141 }
2142
2143 /// Truncate the states in this DFA to the given count.
2144 ///
2145 /// This routine does not do anything to check the correctness of this
2146 /// truncation. Callers must ensure that other states pointing to truncated
2147 /// states are updated appropriately.
2148 pub(crate) fn truncate_states(&mut self, count: usize) {
2149 self.tt.truncate(count);
2150 }
2151
2152 /// Return a mutable representation of the state corresponding to the given
2153 /// id. This is useful for implementing routines that manipulate DFA states
2154 /// (e.g., swapping states).
2155 pub(crate) fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
2156 self.tt.state_mut(id)
2157 }
2158
2159 /// Minimize this DFA in place using Hopcroft's algorithm.
2160 pub(crate) fn minimize(&mut self) {
2161 Minimizer::new(self).run();
2162 }
2163
2164 /// Updates the match state pattern ID map to use the one provided.
2165 ///
2166 /// This is useful when it's convenient to manipulate matching states
2167 /// (and their corresponding pattern IDs) as a map. In particular, the
2168 /// representation used by a DFA for this map is not amenable to mutation,
2169 /// so if things need to be changed (like when shuffling states), it's
2170 /// often easier to work with the map form.
2171 pub(crate) fn set_pattern_map(
2172 &mut self,
2173 map: &BTreeMap<StateID, Vec<PatternID>>,
2174 ) -> Result<(), Error> {
2175 self.ms = self.ms.new_with_map(map)?;
2176 Ok(())
2177 }
2178
2179 /// Find states that have a small number of non-loop transitions and mark
2180 /// them as candidates for acceleration during search.
2181 pub(crate) fn accelerate(&mut self) {
2182 // dead and quit states can never be accelerated.
2183 if self.state_count() <= 2 {
2184 return;
2185 }
2186
2187 // Go through every state and record their accelerator, if possible.
2188 let mut accels = BTreeMap::new();
2189 // Count the number of accelerated match, start and non-match/start
2190 // states.
2191 let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0);
2192 for state in self.states() {
2193 if let Some(accel) = state.accelerate(self.byte_classes()) {
2194 accels.insert(state.id(), accel);
2195 if self.is_match_state(state.id()) {
2196 cmatch += 1;
2197 } else if self.is_start_state(state.id()) {
2198 cstart += 1;
2199 } else {
2200 assert!(!self.is_dead_state(state.id()));
2201 assert!(!self.is_quit_state(state.id()));
2202 cnormal += 1;
2203 }
2204 }
2205 }
2206 // If no states were able to be accelerated, then we're done.
2207 if accels.is_empty() {
2208 return;
2209 }
2210 let original_accels_len = accels.len();
2211
2212 // A remapper keeps track of state ID changes. Once we're done
2213 // shuffling, the remapper is used to rewrite all transitions in the
2214 // DFA based on the new positions of states.
2215 let mut remapper = Remapper::from_dfa(self);
2216
2217 // As we swap states, if they are match states, we need to swap their
2218 // pattern ID lists too (for multi-regexes). We do this by converting
2219 // the lists to an easily swappable map, and then convert back to
2220 // MatchStates once we're done.
2221 let mut new_matches = self.ms.to_map(self);
2222
2223 // There is at least one state that gets accelerated, so these are
2224 // guaranteed to get set to sensible values below.
2225 self.special.min_accel = StateID::MAX;
2226 self.special.max_accel = StateID::ZERO;
2227 let update_special_accel =
2228 |special: &mut Special, accel_id: StateID| {
2229 special.min_accel = cmp::min(special.min_accel, accel_id);
2230 special.max_accel = cmp::max(special.max_accel, accel_id);
2231 };
2232
2233 // Start by shuffling match states. Any match states that are
2234 // accelerated get moved to the end of the match state range.
2235 if cmatch > 0 && self.special.matches() {
2236 // N.B. special.{min,max}_match do not need updating, since the
2237 // range/number of match states does not change. Only the ordering
2238 // of match states may change.
2239 let mut next_id = self.special.max_match;
2240 let mut cur_id = next_id;
2241 while cur_id >= self.special.min_match {
2242 if let Some(accel) = accels.remove(&cur_id) {
2243 accels.insert(next_id, accel);
2244 update_special_accel(&mut self.special, next_id);
2245
2246 // No need to do any actual swapping for equivalent IDs.
2247 if cur_id != next_id {
2248 remapper.swap(self, cur_id, next_id);
2249
2250 // Swap pattern IDs for match states.
2251 let cur_pids = new_matches.remove(&cur_id).unwrap();
2252 let next_pids = new_matches.remove(&next_id).unwrap();
2253 new_matches.insert(cur_id, next_pids);
2254 new_matches.insert(next_id, cur_pids);
2255 }
2256 next_id = self.tt.prev_state_id(next_id);
2257 }
2258 cur_id = self.tt.prev_state_id(cur_id);
2259 }
2260 }
2261
2262 // This is where it gets tricky. Without acceleration, start states
2263 // normally come right after match states. But we want accelerated
2264 // states to be a single contiguous range (to make it very fast
2265 // to determine whether a state *is* accelerated), while also keeping
2266 // match and starting states as contiguous ranges for the same reason.
2267 // So what we do here is shuffle states such that it looks like this:
2268 //
2269 // DQMMMMAAAAASSSSSSNNNNNNN
2270 // | |
2271 // |---------|
2272 // accelerated states
2273 //
2274 // Where:
2275 // D - dead state
2276 // Q - quit state
2277 // M - match state (may be accelerated)
2278 // A - normal state that is accelerated
2279 // S - start state (may be accelerated)
2280 // N - normal state that is NOT accelerated
2281 //
2282 // We implement this by shuffling states, which is done by a sequence
2283 // of pairwise swaps. We start by looking at all normal states to be
2284 // accelerated. When we find one, we swap it with the earliest starting
2285 // state, and then swap that with the earliest normal state. This
2286 // preserves the contiguous property.
2287 //
2288 // Once we're done looking for accelerated normal states, now we look
2289 // for accelerated starting states by moving them to the beginning
2290 // of the starting state range (just like we moved accelerated match
2291 // states to the end of the matching state range).
2292 //
2293 // For a more detailed/different perspective on this, see the docs
2294 // in dfa/special.rs.
2295 if cnormal > 0 {
2296 // our next available starting and normal states for swapping.
2297 let mut next_start_id = self.special.min_start;
2298 let mut cur_id = self.from_index(self.state_count() - 1);
2299 // This is guaranteed to exist since cnormal > 0.
2300 let mut next_norm_id =
2301 self.tt.next_state_id(self.special.max_start);
2302 while cur_id >= next_norm_id {
2303 if let Some(accel) = accels.remove(&cur_id) {
2304 remapper.swap(self, next_start_id, cur_id);
2305 remapper.swap(self, next_norm_id, cur_id);
2306 // Keep our accelerator map updated with new IDs if the
2307 // states we swapped were also accelerated.
2308 if let Some(accel2) = accels.remove(&next_norm_id) {
2309 accels.insert(cur_id, accel2);
2310 }
2311 if let Some(accel2) = accels.remove(&next_start_id) {
2312 accels.insert(next_norm_id, accel2);
2313 }
2314 accels.insert(next_start_id, accel);
2315 update_special_accel(&mut self.special, next_start_id);
2316 // Our start range shifts one to the right now.
2317 self.special.min_start =
2318 self.tt.next_state_id(self.special.min_start);
2319 self.special.max_start =
2320 self.tt.next_state_id(self.special.max_start);
2321 next_start_id = self.tt.next_state_id(next_start_id);
2322 next_norm_id = self.tt.next_state_id(next_norm_id);
2323 }
2324 // This is pretty tricky, but if our 'next_norm_id' state also
2325 // happened to be accelerated, then the result is that it is
2326 // now in the position of cur_id, so we need to consider it
2327 // again. This loop is still guaranteed to terminate though,
2328 // because when accels contains cur_id, we're guaranteed to
2329 // increment next_norm_id even if cur_id remains unchanged.
2330 if !accels.contains_key(&cur_id) {
2331 cur_id = self.tt.prev_state_id(cur_id);
2332 }
2333 }
2334 }
2335 // Just like we did for match states, but we want to move accelerated
2336 // start states to the beginning of the range instead of the end.
2337 if cstart > 0 {
2338 // N.B. special.{min,max}_start do not need updating, since the
2339 // range/number of start states does not change at this point. Only
2340 // the ordering of start states may change.
2341 let mut next_id = self.special.min_start;
2342 let mut cur_id = next_id;
2343 while cur_id <= self.special.max_start {
2344 if let Some(accel) = accels.remove(&cur_id) {
2345 remapper.swap(self, cur_id, next_id);
2346 accels.insert(next_id, accel);
2347 update_special_accel(&mut self.special, next_id);
2348 next_id = self.tt.next_state_id(next_id);
2349 }
2350 cur_id = self.tt.next_state_id(cur_id);
2351 }
2352 }
2353
2354 // Remap all transitions in our DFA and assert some things.
2355 remapper.remap(self);
2356 // This unwrap is OK because acceleration never changes the number of
2357 // match states or patterns in those match states. Since acceleration
2358 // runs after the pattern map has been set at least once, we know that
2359 // our match states cannot error.
2360 self.set_pattern_map(&new_matches).unwrap();
2361 self.special.set_max();
2362 self.special.validate().expect("special state ranges should validate");
2363 self.special
2364 .validate_state_count(self.state_count(), self.stride2())
2365 .expect(
2366 "special state ranges should be consistent with state count",
2367 );
2368 assert_eq!(
2369 self.special.accel_len(self.stride()),
2370 // We record the number of accelerated states initially detected
2371 // since the accels map is itself mutated in the process above.
2372 // If mutated incorrectly, its size may change, and thus can't be
2373 // trusted as a source of truth of how many accelerated states we
2374 // expected there to be.
2375 original_accels_len,
2376 "mismatch with expected number of accelerated states",
2377 );
2378
2379 // And finally record our accelerators. We kept our accels map updated
2380 // as we shuffled states above, so the accelerators should now
2381 // correspond to a contiguous range in the state ID space. (Which we
2382 // assert.)
2383 let mut prev: Option<StateID> = None;
2384 for (id, accel) in accels {
2385 assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id));
2386 prev = Some(id);
2387 self.accels.add(accel);
2388 }
2389 }
2390
2391 /// Shuffle the states in this DFA so that starting states, match
2392 /// states and accelerated states are all contiguous.
2393 ///
2394 /// See dfa/special.rs for more details.
2395 pub(crate) fn shuffle(
2396 &mut self,
2397 mut matches: BTreeMap<StateID, Vec<PatternID>>,
2398 ) -> Result<(), Error> {
2399 // The determinizer always adds a quit state and it is always second.
2400 self.special.quit_id = self.from_index(1);
2401 // If all we have are the dead and quit states, then we're done and
2402 // the DFA will never produce a match.
2403 if self.state_count() <= 2 {
2404 self.special.set_max();
2405 return Ok(());
2406 }
2407
2408 // Collect all our start states into a convenient set and confirm there
2409 // is no overlap with match states. In the classicl DFA construction,
2410 // start states can be match states. But because of look-around, we
2411 // delay all matches by a byte, which prevents start states from being
2412 // match states.
2413 let mut is_start: BTreeSet<StateID> = BTreeSet::new();
2414 for (start_id, _, _) in self.starts() {
2415 // While there's nothing theoretically wrong with setting a start
2416 // state to a dead ID (indeed, it could be an optimization!), the
2417 // shuffling code below assumes that start states aren't dead. If
2418 // this assumption is violated, the dead state could be shuffled
2419 // to a new location, which must never happen. So if we do want
2420 // to allow start states to be dead, then this assert should be
2421 // removed and the code below fixed.
2422 //
2423 // N.B. Minimization can cause start states to be dead, but that
2424 // happens after states are shuffled, so it's OK. Also, start
2425 // states are dead for the DFA that never matches anything, but
2426 // in that case, there are no states to shuffle.
2427 assert_ne!(start_id, DEAD, "start state cannot be dead");
2428 assert!(
2429 !matches.contains_key(&start_id),
2430 "{:?} is both a start and a match state, which is not allowed",
2431 start_id,
2432 );
2433 is_start.insert(start_id);
2434 }
2435
2436 // We implement shuffling by a sequence of pairwise swaps of states.
2437 // Since we have a number of things referencing states via their
2438 // IDs and swapping them changes their IDs, we need to record every
2439 // swap we make so that we can remap IDs. The remapper handles this
2440 // book-keeping for us.
2441 let mut remapper = Remapper::from_dfa(self);
2442
2443 // Shuffle matching states.
2444 if matches.is_empty() {
2445 self.special.min_match = DEAD;
2446 self.special.max_match = DEAD;
2447 } else {
2448 // The determinizer guarantees that the first two states are the
2449 // dead and quit states, respectively. We want our match states to
2450 // come right after quit.
2451 let mut next_id = self.from_index(2);
2452 let mut new_matches = BTreeMap::new();
2453 self.special.min_match = next_id;
2454 for (id, pids) in matches {
2455 remapper.swap(self, next_id, id);
2456 new_matches.insert(next_id, pids);
2457 // If we swapped a start state, then update our set.
2458 if is_start.contains(&next_id) {
2459 is_start.remove(&next_id);
2460 is_start.insert(id);
2461 }
2462 next_id = self.tt.next_state_id(next_id);
2463 }
2464 matches = new_matches;
2465 self.special.max_match = cmp::max(
2466 self.special.min_match,
2467 self.tt.prev_state_id(next_id),
2468 );
2469 }
2470
2471 // Shuffle starting states.
2472 {
2473 let mut next_id = self.from_index(2);
2474 if self.special.matches() {
2475 next_id = self.tt.next_state_id(self.special.max_match);
2476 }
2477 self.special.min_start = next_id;
2478 for id in is_start {
2479 remapper.swap(self, next_id, id);
2480 next_id = self.tt.next_state_id(next_id);
2481 }
2482 self.special.max_start = cmp::max(
2483 self.special.min_start,
2484 self.tt.prev_state_id(next_id),
2485 );
2486 }
2487
2488 // Finally remap all transitions in our DFA.
2489 remapper.remap(self);
2490 self.set_pattern_map(&matches)?;
2491 self.special.set_max();
2492 self.special.validate().expect("special state ranges should validate");
2493 self.special
2494 .validate_state_count(self.state_count(), self.stride2())
2495 .expect(
2496 "special state ranges should be consistent with state count",
2497 );
2498 Ok(())
2499 }
2500}
2501
2502/// A variety of generic internal methods for accessing DFA internals.
2503impl<T: AsRef<[u32]>> DFA<T> {
2504 /// Return the byte classes used by this DFA.
2505 pub(crate) fn byte_classes(&self) -> &ByteClasses {
2506 &self.tt.classes
2507 }
2508
2509 /// Return the info about special states.
2510 pub(crate) fn special(&self) -> &Special {
2511 &self.special
2512 }
2513
2514 /// Return the info about special states as a mutable borrow.
2515 #[cfg(feature = "alloc")]
2516 pub(crate) fn special_mut(&mut self) -> &mut Special {
2517 &mut self.special
2518 }
2519
2520 /// Returns an iterator over all states in this DFA.
2521 ///
2522 /// This iterator yields a tuple for each state. The first element of the
2523 /// tuple corresponds to a state's identifier, and the second element
2524 /// corresponds to the state itself (comprised of its transitions).
2525 pub(crate) fn states(&self) -> StateIter<'_, T> {
2526 self.tt.states()
2527 }
2528
2529 /// Return the total number of states in this DFA. Every DFA has at least
2530 /// 1 state, even the empty DFA.
2531 pub(crate) fn state_count(&self) -> usize {
2532 self.tt.count()
2533 }
2534
2535 /// Return an iterator over all pattern IDs for the given match state.
2536 ///
2537 /// If the given state is not a match state, then this panics.
2538 #[cfg(feature = "alloc")]
2539 pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] {
2540 assert!(self.is_match_state(id));
2541 self.ms.pattern_id_slice(self.match_state_index(id))
2542 }
2543
2544 /// Return the total number of pattern IDs for the given match state.
2545 ///
2546 /// If the given state is not a match state, then this panics.
2547 pub(crate) fn match_pattern_len(&self, id: StateID) -> usize {
2548 assert!(self.is_match_state(id));
2549 self.ms.pattern_len(self.match_state_index(id))
2550 }
2551
2552 /// Returns the total number of patterns matched by this DFA.
2553 pub(crate) fn pattern_count(&self) -> usize {
2554 self.ms.patterns
2555 }
2556
2557 /// Returns a map from match state ID to a list of pattern IDs that match
2558 /// in that state.
2559 #[cfg(feature = "alloc")]
2560 pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> {
2561 self.ms.to_map(self)
2562 }
2563
2564 /// Returns the ID of the quit state for this DFA.
2565 #[cfg(feature = "alloc")]
2566 pub(crate) fn quit_id(&self) -> StateID {
2567 self.from_index(1)
2568 }
2569
2570 /// Convert the given state identifier to the state's index. The state's
2571 /// index corresponds to the position in which it appears in the transition
2572 /// table. When a DFA is NOT premultiplied, then a state's identifier is
2573 /// also its index. When a DFA is premultiplied, then a state's identifier
2574 /// is equal to `index * alphabet_len`. This routine reverses that.
2575 pub(crate) fn to_index(&self, id: StateID) -> usize {
2576 self.tt.to_index(id)
2577 }
2578
2579 /// Convert an index to a state (in the range 0..self.state_count()) to an
2580 /// actual state identifier.
2581 ///
2582 /// This is useful when using a `Vec<T>` as an efficient map keyed by state
2583 /// to some other information (such as a remapped state ID).
2584 #[cfg(feature = "alloc")]
2585 pub(crate) fn from_index(&self, index: usize) -> StateID {
2586 self.tt.from_index(index)
2587 }
2588
2589 /// Return the table of state IDs for this DFA's start states.
2590 pub(crate) fn starts(&self) -> StartStateIter<'_> {
2591 self.st.iter()
2592 }
2593
2594 /// Returns the index of the match state for the given ID. If the
2595 /// given ID does not correspond to a match state, then this may
2596 /// panic or produce an incorrect result.
2597 fn match_state_index(&self, id: StateID) -> usize {
2598 debug_assert!(self.is_match_state(id));
2599 // This is one of the places where we rely on the fact that match
2600 // states are contiguous in the transition table. Namely, that the
2601 // first match state ID always corresponds to dfa.special.min_start.
2602 // From there, since we know the stride, we can compute the overall
2603 // index of any match state given the match state's ID.
2604 let min = self.special().min_match.as_usize();
2605 // CORRECTNESS: We're allowed to produce an incorrect result or panic,
2606 // so both the subtraction and the unchecked StateID construction is
2607 // OK.
2608 self.to_index(StateID::new_unchecked(id.as_usize() - min))
2609 }
2610
2611 /// Returns the index of the accelerator state for the given ID. If the
2612 /// given ID does not correspond to an accelerator state, then this may
2613 /// panic or produce an incorrect result.
2614 fn accelerator_index(&self, id: StateID) -> usize {
2615 let min = self.special().min_accel.as_usize();
2616 // CORRECTNESS: We're allowed to produce an incorrect result or panic,
2617 // so both the subtraction and the unchecked StateID construction is
2618 // OK.
2619 self.to_index(StateID::new_unchecked(id.as_usize() - min))
2620 }
2621
2622 /// Return the accelerators for this DFA.
2623 fn accels(&self) -> Accels<&[u32]> {
2624 self.accels.as_ref()
2625 }
2626
2627 /// Return this DFA's transition table as a slice.
2628 fn trans(&self) -> &[StateID] {
2629 self.tt.table()
2630 }
2631}
2632
2633impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
2634 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2635 writeln!(f, "dense::DFA(")?;
2636 for state in self.states() {
2637 fmt_state_indicator(f, self, state.id())?;
2638 let id = if f.alternate() {
2639 state.id().as_usize()
2640 } else {
2641 self.to_index(state.id())
2642 };
2643 write!(f, "{:06?}: ", id)?;
2644 state.fmt(f)?;
2645 write!(f, "\n")?;
2646 }
2647 writeln!(f, "")?;
2648 for (i, (start_id, sty, pid)) in self.starts().enumerate() {
2649 let id = if f.alternate() {
2650 start_id.as_usize()
2651 } else {
2652 self.to_index(start_id)
2653 };
2654 if i % self.st.stride == 0 {
2655 match pid {
2656 None => writeln!(f, "START-GROUP(ALL)")?,
2657 Some(pid) => {
2658 writeln!(f, "START_GROUP(pattern: {:?})", pid)?
2659 }
2660 }
2661 }
2662 writeln!(f, " {:?} => {:06?}", sty, id)?;
2663 }
2664 if self.pattern_count() > 1 {
2665 writeln!(f, "")?;
2666 for i in 0..self.ms.count() {
2667 let id = self.ms.match_state_id(self, i);
2668 let id = if f.alternate() {
2669 id.as_usize()
2670 } else {
2671 self.to_index(id)
2672 };
2673 write!(f, "MATCH({:06?}): ", id)?;
2674 for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate()
2675 {
2676 if i > 0 {
2677 write!(f, ", ")?;
2678 }
2679 write!(f, "{:?}", pid)?;
2680 }
2681 writeln!(f, "")?;
2682 }
2683 }
2684 writeln!(f, "state count: {:?}", self.state_count())?;
2685 writeln!(f, "pattern count: {:?}", self.pattern_count())?;
2686 writeln!(f, ")")?;
2687 Ok(())
2688 }
2689}
2690
2691unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
2692 #[inline]
2693 fn is_special_state(&self, id: StateID) -> bool {
2694 self.special.is_special_state(id)
2695 }
2696
2697 #[inline]
2698 fn is_dead_state(&self, id: StateID) -> bool {
2699 self.special.is_dead_state(id)
2700 }
2701
2702 #[inline]
2703 fn is_quit_state(&self, id: StateID) -> bool {
2704 self.special.is_quit_state(id)
2705 }
2706
2707 #[inline]
2708 fn is_match_state(&self, id: StateID) -> bool {
2709 self.special.is_match_state(id)
2710 }
2711
2712 #[inline]
2713 fn is_start_state(&self, id: StateID) -> bool {
2714 self.special.is_start_state(id)
2715 }
2716
2717 #[inline]
2718 fn is_accel_state(&self, id: StateID) -> bool {
2719 self.special.is_accel_state(id)
2720 }
2721
2722 #[inline]
2723 fn next_state(&self, current: StateID, input: u8) -> StateID {
2724 let input = self.byte_classes().get(input);
2725 let o = current.as_usize() + usize::from(input);
2726 self.trans()[o]
2727 }
2728
2729 #[inline]
2730 unsafe fn next_state_unchecked(
2731 &self,
2732 current: StateID,
2733 input: u8,
2734 ) -> StateID {
2735 let input = self.byte_classes().get_unchecked(input);
2736 let o = current.as_usize() + usize::from(input);
2737 *self.trans().get_unchecked(o)
2738 }
2739
2740 #[inline]
2741 fn next_eoi_state(&self, current: StateID) -> StateID {
2742 let eoi = self.byte_classes().eoi().as_usize();
2743 let o = current.as_usize() + eoi;
2744 self.trans()[o]
2745 }
2746
2747 #[inline]
2748 fn pattern_count(&self) -> usize {
2749 self.ms.patterns
2750 }
2751
2752 #[inline]
2753 fn match_count(&self, id: StateID) -> usize {
2754 self.match_pattern_len(id)
2755 }
2756
2757 #[inline]
2758 fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
2759 // This is an optimization for the very common case of a DFA with a
2760 // single pattern. This conditional avoids a somewhat more costly path
2761 // that finds the pattern ID from the state machine, which requires
2762 // a bit of slicing/pointer-chasing. This optimization tends to only
2763 // matter when matches are frequent.
2764 if self.ms.patterns == 1 {
2765 return PatternID::ZERO;
2766 }
2767 let state_index = self.match_state_index(id);
2768 self.ms.pattern_id(state_index, match_index)
2769 }
2770
2771 #[inline]
2772 fn start_state_forward(
2773 &self,
2774 pattern_id: Option<PatternID>,
2775 bytes: &[u8],
2776 start: usize,
2777 end: usize,
2778 ) -> StateID {
2779 let index = Start::from_position_fwd(bytes, start, end);
2780 self.st.start(index, pattern_id)
2781 }
2782
2783 #[inline]
2784 fn start_state_reverse(
2785 &self,
2786 pattern_id: Option<PatternID>,
2787 bytes: &[u8],
2788 start: usize,
2789 end: usize,
2790 ) -> StateID {
2791 let index = Start::from_position_rev(bytes, start, end);
2792 self.st.start(index, pattern_id)
2793 }
2794
2795 #[inline(always)]
2796 fn accelerator(&self, id: StateID) -> &[u8] {
2797 if !self.is_accel_state(id) {
2798 return &[];
2799 }
2800 self.accels.needles(self.accelerator_index(id))
2801 }
2802}
2803
2804/// The transition table portion of a dense DFA.
2805///
2806/// The transition table is the core part of the DFA in that it describes how
2807/// to move from one state to another based on the input sequence observed.
2808#[derive(Clone)]
2809pub(crate) struct TransitionTable<T> {
2810 /// A contiguous region of memory representing the transition table in
2811 /// row-major order. The representation is dense. That is, every state
2812 /// has precisely the same number of transitions. The maximum number of
2813 /// transitions per state is 257 (256 for each possible byte value, plus 1
2814 /// for the special EOI transition). If a DFA has been instructed to use
2815 /// byte classes (the default), then the number of transitions is usually
2816 /// substantially fewer.
2817 ///
2818 /// In practice, T is either `Vec<u32>` or `&[u32]`.
2819 table: T,
2820 /// A set of equivalence classes, where a single equivalence class
2821 /// represents a set of bytes that never discriminate between a match
2822 /// and a non-match in the DFA. Each equivalence class corresponds to a
2823 /// single character in this DFA's alphabet, where the maximum number of
2824 /// characters is 257 (each possible value of a byte plus the special
2825 /// EOI transition). Consequently, the number of equivalence classes
2826 /// corresponds to the number of transitions for each DFA state. Note
2827 /// though that the *space* used by each DFA state in the transition table
2828 /// may be larger. The total space used by each DFA state is known as the
2829 /// stride.
2830 ///
2831 /// The only time the number of equivalence classes is fewer than 257 is if
2832 /// the DFA's kind uses byte classes (which is the default). Equivalence
2833 /// classes should generally only be disabled when debugging, so that
2834 /// the transitions themselves aren't obscured. Disabling them has no
2835 /// other benefit, since the equivalence class map is always used while
2836 /// searching. In the vast majority of cases, the number of equivalence
2837 /// classes is substantially smaller than 257, particularly when large
2838 /// Unicode classes aren't used.
2839 classes: ByteClasses,
2840 /// The stride of each DFA state, expressed as a power-of-two exponent.
2841 ///
2842 /// The stride of a DFA corresponds to the total amount of space used by
2843 /// each DFA state in the transition table. This may be bigger than the
2844 /// size of a DFA's alphabet, since the stride is always the smallest
2845 /// power of two greater than or equal to the alphabet size.
2846 ///
2847 /// While this wastes space, this avoids the need for integer division
2848 /// to convert between premultiplied state IDs and their corresponding
2849 /// indices. Instead, we can use simple bit-shifts.
2850 ///
2851 /// See the docs for the `stride2` method for more details.
2852 ///
2853 /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
2854 /// while the maximum `stride2` value is `9` (corresponding to a stride of
2855 /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
2856 /// when accounting for the special EOI transition. However, an alphabet
2857 /// length of that size is exceptionally rare since the alphabet is shrunk
2858 /// into equivalence classes.
2859 stride2: usize,
2860}
2861
2862impl<'a> TransitionTable<&'a [u32]> {
2863 /// Deserialize a transition table starting at the beginning of `slice`.
2864 /// Upon success, return the total number of bytes read along with the
2865 /// transition table.
2866 ///
2867 /// If there was a problem deserializing any part of the transition table,
2868 /// then this returns an error. Notably, if the given slice does not have
2869 /// the same alignment as `StateID`, then this will return an error (among
2870 /// other possible errors).
2871 ///
2872 /// This is guaranteed to execute in constant time.
2873 ///
2874 /// # Safety
2875 ///
2876 /// This routine is not safe because it does not check the valdity of the
2877 /// transition table itself. In particular, the transition table can be
2878 /// quite large, so checking its validity can be somewhat expensive. An
2879 /// invalid transition table is not safe because other code may rely on the
2880 /// transition table being correct (such as explicit bounds check elision).
2881 /// Therefore, an invalid transition table can lead to undefined behavior.
2882 ///
2883 /// Callers that use this function must either pass on the safety invariant
2884 /// or guarantee that the bytes given contain a valid transition table.
2885 /// This guarantee is upheld by the bytes written by `write_to`.
2886 unsafe fn from_bytes_unchecked(
2887 mut slice: &'a [u8],
2888 ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
2889 let slice_start = slice.as_ptr() as usize;
2890
2891 let (count, nr) = bytes::try_read_u32_as_usize(slice, "state count")?;
2892 slice = &slice[nr..];
2893
2894 let (stride2, nr) = bytes::try_read_u32_as_usize(slice, "stride2")?;
2895 slice = &slice[nr..];
2896
2897 let (classes, nr) = ByteClasses::from_bytes(slice)?;
2898 slice = &slice[nr..];
2899
2900 // The alphabet length (determined by the byte class map) cannot be
2901 // bigger than the stride (total space used by each DFA state).
2902 if stride2 > 9 {
2903 return Err(DeserializeError::generic(
2904 "dense DFA has invalid stride2 (too big)",
2905 ));
2906 }
2907 // It also cannot be zero, since even a DFA that never matches anything
2908 // has a non-zero number of states with at least two equivalence
2909 // classes: one for all 256 byte values and another for the EOI
2910 // sentinel.
2911 if stride2 < 1 {
2912 return Err(DeserializeError::generic(
2913 "dense DFA has invalid stride2 (too small)",
2914 ));
2915 }
2916 // This is OK since 1 <= stride2 <= 9.
2917 let stride =
2918 1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap();
2919 if classes.alphabet_len() > stride {
2920 return Err(DeserializeError::generic(
2921 "alphabet size cannot be bigger than transition table stride",
2922 ));
2923 }
2924
2925 let trans_count =
2926 bytes::shl(count, stride2, "dense table transition count")?;
2927 let table_bytes_len = bytes::mul(
2928 trans_count,
2929 StateID::SIZE,
2930 "dense table state byte count",
2931 )?;
2932 bytes::check_slice_len(slice, table_bytes_len, "transition table")?;
2933 bytes::check_alignment::<StateID>(slice)?;
2934 let table_bytes = &slice[..table_bytes_len];
2935 slice = &slice[table_bytes_len..];
2936 // SAFETY: Since StateID is always representable as a u32, all we need
2937 // to do is ensure that we have the proper length and alignment. We've
2938 // checked both above, so the cast below is safe.
2939 //
2940 // N.B. This is the only not-safe code in this function, so we mark
2941 // it explicitly to call it out, even though it is technically
2942 // superfluous.
2943 #[allow(unused_unsafe)]
2944 let table = unsafe {
2945 core::slice::from_raw_parts(
2946 table_bytes.as_ptr() as *const u32,
2947 trans_count,
2948 )
2949 };
2950 let tt = TransitionTable { table, classes, stride2 };
2951 Ok((tt, slice.as_ptr() as usize - slice_start))
2952 }
2953}
2954
2955#[cfg(feature = "alloc")]
2956impl TransitionTable<Vec<u32>> {
2957 /// Create a minimal transition table with just two states: a dead state
2958 /// and a quit state. The alphabet length and stride of the transition
2959 /// table is determined by the given set of equivalence classes.
2960 fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> {
2961 let mut tt = TransitionTable {
2962 table: vec![],
2963 classes,
2964 stride2: classes.stride2(),
2965 };
2966 // Two states, regardless of alphabet size, can always fit into u32.
2967 tt.add_empty_state().unwrap(); // dead state
2968 tt.add_empty_state().unwrap(); // quit state
2969 tt
2970 }
2971
2972 /// Set a transition in this table. Both the `from` and `to` states must
2973 /// already exist, otherwise this panics. `unit` should correspond to the
2974 /// transition out of `from` to set to `to`.
2975 fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) {
2976 assert!(self.is_valid(from), "invalid 'from' state");
2977 assert!(self.is_valid(to), "invalid 'to' state");
2978 self.table[from.as_usize() + self.classes.get_by_unit(unit)] =
2979 to.as_u32();
2980 }
2981
2982 /// Add an empty state (a state where all transitions lead to a dead state)
2983 /// and return its identifier. The identifier returned is guaranteed to
2984 /// not point to any other existing state.
2985 ///
2986 /// If adding a state would exhaust the state identifier space, then this
2987 /// returns an error.
2988 fn add_empty_state(&mut self) -> Result<StateID, Error> {
2989 // Normally, to get a fresh state identifier, we would just
2990 // take the index of the next state added to the transition
2991 // table. However, we actually perform an optimization here
2992 // that premultiplies state IDs by the stride, such that they
2993 // point immediately at the beginning of their transitions in
2994 // the transition table. This avoids an extra multiplication
2995 // instruction for state lookup at search time.
2996 //
2997 // Premultiplied identifiers means that instead of your matching
2998 // loop looking something like this:
2999 //
3000 // state = dfa.start
3001 // for byte in haystack:
3002 // next = dfa.transitions[state * stride + byte]
3003 // if dfa.is_match(next):
3004 // return true
3005 // return false
3006 //
3007 // it can instead look like this:
3008 //
3009 // state = dfa.start
3010 // for byte in haystack:
3011 // next = dfa.transitions[state + byte]
3012 // if dfa.is_match(next):
3013 // return true
3014 // return false
3015 //
3016 // In other words, we save a multiplication instruction in the
3017 // critical path. This turns out to be a decent performance win.
3018 // The cost of using premultiplied state ids is that they can
3019 // require a bigger state id representation. (And they also make
3020 // the code a bit more complex, especially during minimization and
3021 // when reshuffling states, as one needs to convert back and forth
3022 // between state IDs and state indices.)
3023 //
3024 // To do this, we simply take the index of the state into the
3025 // entire transition table, rather than the index of the state
3026 // itself. e.g., If the stride is 64, then the ID of the 3rd state
3027 // is 192, not 2.
3028 let next = self.table.len();
3029 let id = StateID::new(next).map_err(|_| Error::too_many_states())?;
3030 self.table.extend(iter::repeat(0).take(self.stride()));
3031 Ok(id)
3032 }
3033
3034 /// Swap the two states given in this transition table.
3035 ///
3036 /// This routine does not do anything to check the correctness of this
3037 /// swap. Callers must ensure that other states pointing to id1 and id2 are
3038 /// updated appropriately.
3039 ///
3040 /// Both id1 and id2 must point to valid states, otherwise this panics.
3041 fn swap(&mut self, id1: StateID, id2: StateID) {
3042 assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1);
3043 assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2);
3044 // We only need to swap the parts of the state that are used. So if the
3045 // stride is 64, but the alphabet length is only 33, then we save a lot
3046 // of work.
3047 for b in 0..self.classes.alphabet_len() {
3048 self.table.swap(id1.as_usize() + b, id2.as_usize() + b);
3049 }
3050 }
3051
3052 /// Truncate the states in this transition table to the given count.
3053 ///
3054 /// This routine does not do anything to check the correctness of this
3055 /// truncation. Callers must ensure that other states pointing to truncated
3056 /// states are updated appropriately.
3057 fn truncate(&mut self, count: usize) {
3058 self.table.truncate(count << self.stride2);
3059 }
3060
3061 /// Return a mutable representation of the state corresponding to the given
3062 /// id. This is useful for implementing routines that manipulate DFA states
3063 /// (e.g., swapping states).
3064 fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
3065 let alphabet_len = self.alphabet_len();
3066 let i = id.as_usize();
3067 StateMut {
3068 id,
3069 stride2: self.stride2,
3070 transitions: &mut self.table_mut()[i..i + alphabet_len],
3071 }
3072 }
3073}
3074
3075impl<T: AsRef<[u32]>> TransitionTable<T> {
3076 /// Writes a serialized form of this transition table to the buffer given.
3077 /// If the buffer is too small, then an error is returned. To determine
3078 /// how big the buffer must be, use `write_to_len`.
3079 fn write_to<E: Endian>(
3080 &self,
3081 mut dst: &mut [u8],
3082 ) -> Result<usize, SerializeError> {
3083 let nwrite = self.write_to_len();
3084 if dst.len() < nwrite {
3085 return Err(SerializeError::buffer_too_small("transition table"));
3086 }
3087 dst = &mut dst[..nwrite];
3088
3089 // write state count
3090 // Unwrap is OK since number of states is guaranteed to fit in a u32.
3091 E::write_u32(u32::try_from(self.count()).unwrap(), dst);
3092 dst = &mut dst[size_of::<u32>()..];
3093
3094 // write state stride (as power of 2)
3095 // Unwrap is OK since stride2 is guaranteed to be <= 9.
3096 E::write_u32(u32::try_from(self.stride2).unwrap(), dst);
3097 dst = &mut dst[size_of::<u32>()..];
3098
3099 // write byte class map
3100 let n = self.classes.write_to(dst)?;
3101 dst = &mut dst[n..];
3102
3103 // write actual transitions
3104 for &sid in self.table() {
3105 let n = bytes::write_state_id::<E>(sid, &mut dst);
3106 dst = &mut dst[n..];
3107 }
3108 Ok(nwrite)
3109 }
3110
3111 /// Returns the number of bytes the serialized form of this transition
3112 /// table will use.
3113 fn write_to_len(&self) -> usize {
3114 size_of::<u32>() // state count
3115 + size_of::<u32>() // stride2
3116 + self.classes.write_to_len()
3117 + (self.table().len() * StateID::SIZE)
3118 }
3119
3120 /// Validates that every state ID in this transition table is valid.
3121 ///
3122 /// That is, every state ID can be used to correctly index a state in this
3123 /// table.
3124 fn validate(&self) -> Result<(), DeserializeError> {
3125 for state in self.states() {
3126 for (_, to) in state.transitions() {
3127 if !self.is_valid(to) {
3128 return Err(DeserializeError::generic(
3129 "found invalid state ID in transition table",
3130 ));
3131 }
3132 }
3133 }
3134 Ok(())
3135 }
3136
3137 /// Converts this transition table to a borrowed value.
3138 fn as_ref(&self) -> TransitionTable<&'_ [u32]> {
3139 TransitionTable {
3140 table: self.table.as_ref(),
3141 classes: self.classes.clone(),
3142 stride2: self.stride2,
3143 }
3144 }
3145
3146 /// Converts this transition table to an owned value.
3147 #[cfg(feature = "alloc")]
3148 fn to_owned(&self) -> TransitionTable<Vec<u32>> {
3149 TransitionTable {
3150 table: self.table.as_ref().to_vec(),
3151 classes: self.classes.clone(),
3152 stride2: self.stride2,
3153 }
3154 }
3155
3156 /// Return the state for the given ID. If the given ID is not valid, then
3157 /// this panics.
3158 fn state(&self, id: StateID) -> State<'_> {
3159 assert!(self.is_valid(id));
3160
3161 let i = id.as_usize();
3162 State {
3163 id,
3164 stride2: self.stride2,
3165 transitions: &self.table()[i..i + self.alphabet_len()],
3166 }
3167 }
3168
3169 /// Returns an iterator over all states in this transition table.
3170 ///
3171 /// This iterator yields a tuple for each state. The first element of the
3172 /// tuple corresponds to a state's identifier, and the second element
3173 /// corresponds to the state itself (comprised of its transitions).
3174 fn states(&self) -> StateIter<'_, T> {
3175 StateIter {
3176 tt: self,
3177 it: self.table().chunks(self.stride()).enumerate(),
3178 }
3179 }
3180
3181 /// Convert a state identifier to an index to a state (in the range
3182 /// 0..self.count()).
3183 ///
3184 /// This is useful when using a `Vec<T>` as an efficient map keyed by state
3185 /// to some other information (such as a remapped state ID).
3186 ///
3187 /// If the given ID is not valid, then this may panic or produce an
3188 /// incorrect index.
3189 fn to_index(&self, id: StateID) -> usize {
3190 id.as_usize() >> self.stride2
3191 }
3192
3193 /// Convert an index to a state (in the range 0..self.count()) to an actual
3194 /// state identifier.
3195 ///
3196 /// This is useful when using a `Vec<T>` as an efficient map keyed by state
3197 /// to some other information (such as a remapped state ID).
3198 ///
3199 /// If the given index is not in the specified range, then this may panic
3200 /// or produce an incorrect state ID.
3201 fn from_index(&self, index: usize) -> StateID {
3202 // CORRECTNESS: If the given index is not valid, then it is not
3203 // required for this to panic or return a valid state ID.
3204 StateID::new_unchecked(index << self.stride2)
3205 }
3206
3207 /// Returns the state ID for the state immediately following the one given.
3208 ///
3209 /// This does not check whether the state ID returned is invalid. In fact,
3210 /// if the state ID given is the last state in this DFA, then the state ID
3211 /// returned is guaranteed to be invalid.
3212 #[cfg(feature = "alloc")]
3213 fn next_state_id(&self, id: StateID) -> StateID {
3214 self.from_index(self.to_index(id).checked_add(1).unwrap())
3215 }
3216
3217 /// Returns the state ID for the state immediately preceding the one given.
3218 ///
3219 /// If the dead ID given (which is zero), then this panics.
3220 #[cfg(feature = "alloc")]
3221 fn prev_state_id(&self, id: StateID) -> StateID {
3222 self.from_index(self.to_index(id).checked_sub(1).unwrap())
3223 }
3224
3225 /// Returns the table as a slice of state IDs.
3226 fn table(&self) -> &[StateID] {
3227 let integers = self.table.as_ref();
3228 // SAFETY: This is safe because StateID is guaranteed to be
3229 // representable as a u32.
3230 unsafe {
3231 core::slice::from_raw_parts(
3232 integers.as_ptr() as *const StateID,
3233 integers.len(),
3234 )
3235 }
3236 }
3237
3238 /// Returns the total number of states in this transition table.
3239 ///
3240 /// Note that a DFA always has at least two states: the dead and quit
3241 /// states. In particular, the dead state always has ID 0 and is
3242 /// correspondingly always the first state. The dead state is never a match
3243 /// state.
3244 fn count(&self) -> usize {
3245 self.table().len() >> self.stride2
3246 }
3247
3248 /// Returns the total stride for every state in this DFA. This corresponds
3249 /// to the total number of transitions used by each state in this DFA's
3250 /// transition table.
3251 fn stride(&self) -> usize {
3252 1 << self.stride2
3253 }
3254
3255 /// Returns the total number of elements in the alphabet for this
3256 /// transition table. This is always less than or equal to `self.stride()`.
3257 /// It is only equal when the alphabet length is a power of 2. Otherwise,
3258 /// it is always strictly less.
3259 fn alphabet_len(&self) -> usize {
3260 self.classes.alphabet_len()
3261 }
3262
3263 /// Returns true if and only if the given state ID is valid for this
3264 /// transition table. Validity in this context means that the given ID can
3265 /// be used as a valid offset with `self.stride()` to index this transition
3266 /// table.
3267 fn is_valid(&self, id: StateID) -> bool {
3268 let id = id.as_usize();
3269 id < self.table().len() && id % self.stride() == 0
3270 }
3271
3272 /// Return the memory usage, in bytes, of this transition table.
3273 ///
3274 /// This does not include the size of a `TransitionTable` value itself.
3275 fn memory_usage(&self) -> usize {
3276 self.table().len() * StateID::SIZE
3277 }
3278}
3279
3280#[cfg(feature = "alloc")]
3281impl<T: AsMut<[u32]>> TransitionTable<T> {
3282 /// Returns the table as a slice of state IDs.
3283 fn table_mut(&mut self) -> &mut [StateID] {
3284 let integers = self.table.as_mut();
3285 // SAFETY: This is safe because StateID is guaranteed to be
3286 // representable as a u32.
3287 unsafe {
3288 core::slice::from_raw_parts_mut(
3289 integers.as_mut_ptr() as *mut StateID,
3290 integers.len(),
3291 )
3292 }
3293 }
3294}
3295
3296/// The set of all possible starting states in a DFA.
3297///
3298/// The set of starting states corresponds to the possible choices one can make
3299/// in terms of starting a DFA. That is, before following the first transition,
3300/// you first need to select the state that you start in.
3301///
3302/// Normally, a DFA converted from an NFA that has a single starting state
3303/// would itself just have one starting state. However, our support for look
3304/// around generally requires more starting states. The correct starting state
3305/// is chosen based on certain properties of the position at which we begin
3306/// our search.
3307///
3308/// Before listing those properties, we first must define two terms:
3309///
3310/// * `haystack` - The bytes to execute the search. The search always starts
3311/// at the beginning of `haystack` and ends before or at the end of
3312/// `haystack`.
3313/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack`
3314/// must be contained within `context` such that `context` is at least as big
3315/// as `haystack`.
3316///
3317/// This split is crucial for dealing with look-around. For example, consider
3318/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This
3319/// regex should _not_ match the haystack since `bar` does not appear at the
3320/// beginning of the input. Similarly, the regex `\Bbar\B` should match the
3321/// haystack because `bar` is not surrounded by word boundaries. But a search
3322/// that does not take context into account would not permit `\B` to match
3323/// since the beginning of any string matches a word boundary. Similarly, a
3324/// search that does not take context into account when searching `^bar$` in
3325/// the haystack `bar` would produce a match when it shouldn't.
3326///
3327/// Thus, it follows that the starting state is chosen based on the following
3328/// criteria, derived from the position at which the search starts in the
3329/// `context` (corresponding to the start of `haystack`):
3330///
3331/// 1. If the search starts at the beginning of `context`, then the `Text`
3332/// start state is used. (Since `^` corresponds to
3333/// `hir::Anchor::StartText`.)
3334/// 2. If the search starts at a position immediately following a line
3335/// terminator, then the `Line` start state is used. (Since `(?m:^)`
3336/// corresponds to `hir::Anchor::StartLine`.)
3337/// 3. If the search starts at a position immediately following a byte
3338/// classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
3339/// start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
3340/// 4. Otherwise, if the search starts at a position immediately following
3341/// a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`),
3342/// then the `NonWordByte` start state is used. (Since `(?-u:\B)`
3343/// corresponds to a not-word-boundary.)
3344///
3345/// (N.B. Unicode word boundaries are not supported by the DFA because they
3346/// require multi-byte look-around and this is difficult to support in a DFA.)
3347///
3348/// To further complicate things, we also support constructing individual
3349/// anchored start states for each pattern in the DFA. (Which is required to
3350/// implement overlapping regexes correctly, but is also generally useful.)
3351/// Thus, when individual start states for each pattern are enabled, then the
3352/// total number of start states represented is `4 + (4 * #patterns)`, where
3353/// the 4 comes from each of the 4 possibilities above. The first 4 represents
3354/// the starting states for the entire DFA, which support searching for
3355/// multiple patterns simultaneously (possibly unanchored).
3356///
3357/// If individual start states are disabled, then this will only store 4
3358/// start states. Typically, individual start states are only enabled when
3359/// constructing the reverse DFA for regex matching. But they are also useful
3360/// for building DFAs that can search for a specific pattern or even to support
3361/// both anchored and unanchored searches with the same DFA.
3362///
3363/// Note though that while the start table always has either `4` or
3364/// `4 + (4 * #patterns)` starting state *ids*, the total number of states
3365/// might be considerably smaller. That is, many of the IDs may be duplicative.
3366/// (For example, if a regex doesn't have a `\b` sub-pattern, then there's no
3367/// reason to generate a unique starting state for handling word boundaries.
3368/// Similarly for start/end anchors.)
3369#[derive(Clone)]
3370pub(crate) struct StartTable<T> {
3371 /// The initial start state IDs.
3372 ///
3373 /// In practice, T is either `Vec<u32>` or `&[u32]`.
3374 ///
3375 /// The first `stride` (currently always 4) entries always correspond to
3376 /// the start states for the entire DFA. After that, there are
3377 /// `stride * patterns` state IDs, where `patterns` may be zero in the
3378 /// case of a DFA with no patterns or in the case where the DFA was built
3379 /// without enabling starting states for each pattern.
3380 table: T,
3381 /// The number of starting state IDs per pattern.
3382 stride: usize,
3383 /// The total number of patterns for which starting states are encoded.
3384 /// This may be zero for non-empty DFAs when the DFA was built without
3385 /// start states for each pattern. Thus, one cannot use this field to
3386 /// say how many patterns are in the DFA in all cases. It is specific to
3387 /// how many patterns are represented in this start table.
3388 patterns: usize,
3389}
3390
3391#[cfg(feature = "alloc")]
3392impl StartTable<Vec<u32>> {
3393 /// Create a valid set of start states all pointing to the dead state.
3394 ///
3395 /// When the corresponding DFA is constructed with start states for each
3396 /// pattern, then `patterns` should be the number of patterns. Otherwise,
3397 /// it should be zero.
3398 ///
3399 /// If the total table size could exceed the allocatable limit, then this
3400 /// returns an error. In practice, this is unlikely to be able to occur,
3401 /// since it's likely that allocation would have failed long before it got
3402 /// to this point.
3403 fn dead(patterns: usize) -> Result<StartTable<Vec<u32>>, Error> {
3404 assert!(patterns <= PatternID::LIMIT);
3405 let stride = Start::count();
3406 let pattern_starts_len = match stride.checked_mul(patterns) {
3407 Some(x) => x,
3408 None => return Err(Error::too_many_start_states()),
3409 };
3410 let table_len = match stride.checked_add(pattern_starts_len) {
3411 Some(x) => x,
3412 None => return Err(Error::too_many_start_states()),
3413 };
3414 if table_len > core::isize::MAX as usize {
3415 return Err(Error::too_many_start_states());
3416 }
3417 let table = vec![DEAD.as_u32(); table_len];
3418 Ok(StartTable { table, stride, patterns })
3419 }
3420}
3421
3422impl<'a> StartTable<&'a [u32]> {
3423 /// Deserialize a table of start state IDs starting at the beginning of
3424 /// `slice`. Upon success, return the total number of bytes read along with
3425 /// the table of starting state IDs.
3426 ///
3427 /// If there was a problem deserializing any part of the starting IDs,
3428 /// then this returns an error. Notably, if the given slice does not have
3429 /// the same alignment as `StateID`, then this will return an error (among
3430 /// other possible errors).
3431 ///
3432 /// This is guaranteed to execute in constant time.
3433 ///
3434 /// # Safety
3435 ///
3436 /// This routine is not safe because it does not check the valdity of the
3437 /// starting state IDs themselves. In particular, the number of starting
3438 /// IDs can be of variable length, so it's possible that checking their
3439 /// validity cannot be done in constant time. An invalid starting state
3440 /// ID is not safe because other code may rely on the starting IDs being
3441 /// correct (such as explicit bounds check elision). Therefore, an invalid
3442 /// start ID can lead to undefined behavior.
3443 ///
3444 /// Callers that use this function must either pass on the safety invariant
3445 /// or guarantee that the bytes given contain valid starting state IDs.
3446 /// This guarantee is upheld by the bytes written by `write_to`.
3447 unsafe fn from_bytes_unchecked(
3448 mut slice: &'a [u8],
3449 ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> {
3450 let slice_start = slice.as_ptr() as usize;
3451
3452 let (stride, nr) =
3453 bytes::try_read_u32_as_usize(slice, "start table stride")?;
3454 slice = &slice[nr..];
3455
3456 let (patterns, nr) =
3457 bytes::try_read_u32_as_usize(slice, "start table patterns")?;
3458 slice = &slice[nr..];
3459
3460 if stride != Start::count() {
3461 return Err(DeserializeError::generic(
3462 "invalid starting table stride",
3463 ));
3464 }
3465 if patterns > PatternID::LIMIT {
3466 return Err(DeserializeError::generic(
3467 "invalid number of patterns",
3468 ));
3469 }
3470 let pattern_table_size =
3471 bytes::mul(stride, patterns, "invalid pattern count")?;
3472 // Our start states always start with a single stride of start states
3473 // for the entire automaton which permit it to match any pattern. What
3474 // follows it are an optional set of start states for each pattern.
3475 let start_state_count = bytes::add(
3476 stride,
3477 pattern_table_size,
3478 "invalid 'any' pattern starts size",
3479 )?;
3480 let table_bytes_len = bytes::mul(
3481 start_state_count,
3482 StateID::SIZE,
3483 "pattern table bytes length",
3484 )?;
3485 bytes::check_slice_len(slice, table_bytes_len, "start ID table")?;
3486 bytes::check_alignment::<StateID>(slice)?;
3487 let table_bytes = &slice[..table_bytes_len];
3488 slice = &slice[table_bytes_len..];
3489 // SAFETY: Since StateID is always representable as a u32, all we need
3490 // to do is ensure that we have the proper length and alignment. We've
3491 // checked both above, so the cast below is safe.
3492 //
3493 // N.B. This is the only not-safe code in this function, so we mark
3494 // it explicitly to call it out, even though it is technically
3495 // superfluous.
3496 #[allow(unused_unsafe)]
3497 let table = unsafe {
3498 core::slice::from_raw_parts(
3499 table_bytes.as_ptr() as *const u32,
3500 start_state_count,
3501 )
3502 };
3503 let st = StartTable { table, stride, patterns };
3504 Ok((st, slice.as_ptr() as usize - slice_start))
3505 }
3506}
3507
3508impl<T: AsRef<[u32]>> StartTable<T> {
3509 /// Writes a serialized form of this start table to the buffer given. If
3510 /// the buffer is too small, then an error is returned. To determine how
3511 /// big the buffer must be, use `write_to_len`.
3512 fn write_to<E: Endian>(
3513 &self,
3514 mut dst: &mut [u8],
3515 ) -> Result<usize, SerializeError> {
3516 let nwrite = self.write_to_len();
3517 if dst.len() < nwrite {
3518 return Err(SerializeError::buffer_too_small(
3519 "starting table ids",
3520 ));
3521 }
3522 dst = &mut dst[..nwrite];
3523
3524 // write stride
3525 // Unwrap is OK since the stride is always 4 (currently).
3526 E::write_u32(u32::try_from(self.stride).unwrap(), dst);
3527 dst = &mut dst[size_of::<u32>()..];
3528 // write pattern count
3529 // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
3530 E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
3531 dst = &mut dst[size_of::<u32>()..];
3532 // write start IDs
3533 for &sid in self.table() {
3534 let n = bytes::write_state_id::<E>(sid, &mut dst);
3535 dst = &mut dst[n..];
3536 }
3537 Ok(nwrite)
3538 }
3539
3540 /// Returns the number of bytes the serialized form of this start ID table
3541 /// will use.
3542 fn write_to_len(&self) -> usize {
3543 size_of::<u32>() // stride
3544 + size_of::<u32>() // # patterns
3545 + (self.table().len() * StateID::SIZE)
3546 }
3547
3548 /// Validates that every state ID in this start table is valid by checking
3549 /// it against the given transition table (which must be for the same DFA).
3550 ///
3551 /// That is, every state ID can be used to correctly index a state.
3552 fn validate(
3553 &self,
3554 tt: &TransitionTable<T>,
3555 ) -> Result<(), DeserializeError> {
3556 for &id in self.table() {
3557 if !tt.is_valid(id) {
3558 return Err(DeserializeError::generic(
3559 "found invalid starting state ID",
3560 ));
3561 }
3562 }
3563 Ok(())
3564 }
3565
3566 /// Converts this start list to a borrowed value.
3567 fn as_ref(&self) -> StartTable<&'_ [u32]> {
3568 StartTable {
3569 table: self.table.as_ref(),
3570 stride: self.stride,
3571 patterns: self.patterns,
3572 }
3573 }
3574
3575 /// Converts this start list to an owned value.
3576 #[cfg(feature = "alloc")]
3577 fn to_owned(&self) -> StartTable<Vec<u32>> {
3578 StartTable {
3579 table: self.table.as_ref().to_vec(),
3580 stride: self.stride,
3581 patterns: self.patterns,
3582 }
3583 }
3584
3585 /// Return the start state for the given start index and pattern ID. If the
3586 /// pattern ID is None, then the corresponding start state for the entire
3587 /// DFA is returned. If the pattern ID is not None, then the corresponding
3588 /// starting state for the given pattern is returned. If this start table
3589 /// does not have individual starting states for each pattern, then this
3590 /// panics.
3591 fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
3592 let start_index = index.as_usize();
3593 let index = match pattern_id {
3594 None => start_index,
3595 Some(pid) => {
3596 let pid = pid.as_usize();
3597 assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
3598 self.stride + (self.stride * pid) + start_index
3599 }
3600 };
3601 self.table()[index]
3602 }
3603
3604 /// Returns an iterator over all start state IDs in this table.
3605 ///
3606 /// Each item is a triple of: start state ID, the start state type and the
3607 /// pattern ID (if any).
3608 fn iter(&self) -> StartStateIter<'_> {
3609 StartStateIter { st: self.as_ref(), i: 0 }
3610 }
3611
3612 /// Returns the table as a slice of state IDs.
3613 fn table(&self) -> &[StateID] {
3614 let integers = self.table.as_ref();
3615 // SAFETY: This is safe because StateID is guaranteed to be
3616 // representable as a u32.
3617 unsafe {
3618 core::slice::from_raw_parts(
3619 integers.as_ptr() as *const StateID,
3620 integers.len(),
3621 )
3622 }
3623 }
3624
3625 /// Return the memory usage, in bytes, of this start list.
3626 ///
3627 /// This does not include the size of a `StartList` value itself.
3628 fn memory_usage(&self) -> usize {
3629 self.table().len() * StateID::SIZE
3630 }
3631}
3632
3633#[cfg(feature = "alloc")]
3634impl<T: AsMut<[u32]>> StartTable<T> {
3635 /// Set the start state for the given index and pattern.
3636 ///
3637 /// If the pattern ID or state ID are not valid, then this will panic.
3638 fn set_start(
3639 &mut self,
3640 index: Start,
3641 pattern_id: Option<PatternID>,
3642 id: StateID,
3643 ) {
3644 let start_index = index.as_usize();
3645 let index = match pattern_id {
3646 None => start_index,
3647 Some(pid) => self
3648 .stride
3649 .checked_mul(pid.as_usize())
3650 .unwrap()
3651 .checked_add(self.stride)
3652 .unwrap()
3653 .checked_add(start_index)
3654 .unwrap(),
3655 };
3656 self.table_mut()[index] = id;
3657 }
3658
3659 /// Returns the table as a mutable slice of state IDs.
3660 fn table_mut(&mut self) -> &mut [StateID] {
3661 let integers = self.table.as_mut();
3662 // SAFETY: This is safe because StateID is guaranteed to be
3663 // representable as a u32.
3664 unsafe {
3665 core::slice::from_raw_parts_mut(
3666 integers.as_mut_ptr() as *mut StateID,
3667 integers.len(),
3668 )
3669 }
3670 }
3671}
3672
3673/// An iterator over start state IDs.
3674///
3675/// This iterator yields a triple of start state ID, the start state type
3676/// and the pattern ID (if any). The pattern ID is None for start states
3677/// corresponding to the entire DFA and non-None for start states corresponding
3678/// to a specific pattern. The latter only occurs when the DFA is compiled with
3679/// start states for each pattern.
3680pub(crate) struct StartStateIter<'a> {
3681 st: StartTable<&'a [u32]>,
3682 i: usize,
3683}
3684
3685impl<'a> Iterator for StartStateIter<'a> {
3686 type Item = (StateID, Start, Option<PatternID>);
3687
3688 fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
3689 let i: usize = self.i;
3690 let table: &[StateID] = self.st.table();
3691 if i >= table.len() {
3692 return None;
3693 }
3694 self.i += 1;
3695
3696 // This unwrap is okay since the stride of the starting state table
3697 // must always match the number of start state types.
3698 let start_type: Start = Start::from_usize(i % self.st.stride).unwrap();
3699 let pid: Option = if i < self.st.stride {
3700 None
3701 } else {
3702 Some(
3703 PatternID::new((i - self.st.stride) / self.st.stride).unwrap(),
3704 )
3705 };
3706 Some((table[i], start_type, pid))
3707 }
3708}
3709
3710/// This type represents that patterns that should be reported whenever a DFA
3711/// enters a match state. This structure exists to support DFAs that search for
3712/// matches for multiple regexes.
3713///
3714/// This structure relies on the fact that all match states in a DFA occur
3715/// contiguously in the DFA's transition table. (See dfa/special.rs for a more
3716/// detailed breakdown of the representation.) Namely, when a match occurs, we
3717/// know its state ID. Since we know the start and end of the contiguous region
3718/// of match states, we can use that to compute the position at which the match
3719/// state occurs. That in turn is used as an offset into this structure.
3720#[derive(Clone, Debug)]
3721struct MatchStates<T> {
3722 /// Slices is a flattened sequence of pairs, where each pair points to a
3723 /// sub-slice of pattern_ids. The first element of the pair is an offset
3724 /// into pattern_ids and the second element of the pair is the number
3725 /// of 32-bit pattern IDs starting at that position. That is, each pair
3726 /// corresponds to a single DFA match state and its corresponding match
3727 /// IDs. The number of pairs always corresponds to the number of distinct
3728 /// DFA match states.
3729 ///
3730 /// In practice, T is either Vec<u32> or &[u32].
3731 slices: T,
3732 /// A flattened sequence of pattern IDs for each DFA match state. The only
3733 /// way to correctly read this sequence is indirectly via `slices`.
3734 ///
3735 /// In practice, T is either Vec<u32> or &[u32].
3736 pattern_ids: T,
3737 /// The total number of unique patterns represented by these match states.
3738 patterns: usize,
3739}
3740
3741impl<'a> MatchStates<&'a [u32]> {
3742 unsafe fn from_bytes_unchecked(
3743 mut slice: &'a [u8],
3744 ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> {
3745 let slice_start = slice.as_ptr() as usize;
3746
3747 // Read the total number of match states.
3748 let (count, nr) =
3749 bytes::try_read_u32_as_usize(slice, "match state count")?;
3750 slice = &slice[nr..];
3751
3752 // Read the slice start/length pairs.
3753 let pair_count = bytes::mul(2, count, "match state offset pairs")?;
3754 let slices_bytes_len = bytes::mul(
3755 pair_count,
3756 PatternID::SIZE,
3757 "match state slice offset byte length",
3758 )?;
3759 bytes::check_slice_len(slice, slices_bytes_len, "match state slices")?;
3760 bytes::check_alignment::<PatternID>(slice)?;
3761 let slices_bytes = &slice[..slices_bytes_len];
3762 slice = &slice[slices_bytes_len..];
3763 // SAFETY: Since PatternID is always representable as a u32, all we
3764 // need to do is ensure that we have the proper length and alignment.
3765 // We've checked both above, so the cast below is safe.
3766 //
3767 // N.B. This is one of the few not-safe snippets in this function, so
3768 // we mark it explicitly to call it out, even though it is technically
3769 // superfluous.
3770 #[allow(unused_unsafe)]
3771 let slices = unsafe {
3772 core::slice::from_raw_parts(
3773 slices_bytes.as_ptr() as *const u32,
3774 pair_count,
3775 )
3776 };
3777
3778 // Read the total number of unique pattern IDs (which is always 1 more
3779 // than the maximum pattern ID in this automaton, since pattern IDs are
3780 // handed out contiguously starting at 0).
3781 let (patterns, nr) =
3782 bytes::try_read_u32_as_usize(slice, "pattern count")?;
3783 slice = &slice[nr..];
3784
3785 // Now read the pattern ID count. We don't need to store this
3786 // explicitly, but we need it to know how many pattern IDs to read.
3787 let (idcount, nr) =
3788 bytes::try_read_u32_as_usize(slice, "pattern ID count")?;
3789 slice = &slice[nr..];
3790
3791 // Read the actual pattern IDs.
3792 let pattern_ids_len =
3793 bytes::mul(idcount, PatternID::SIZE, "pattern ID byte length")?;
3794 bytes::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
3795 bytes::check_alignment::<PatternID>(slice)?;
3796 let pattern_ids_bytes = &slice[..pattern_ids_len];
3797 slice = &slice[pattern_ids_len..];
3798 // SAFETY: Since PatternID is always representable as a u32, all we
3799 // need to do is ensure that we have the proper length and alignment.
3800 // We've checked both above, so the cast below is safe.
3801 //
3802 // N.B. This is one of the few not-safe snippets in this function, so
3803 // we mark it explicitly to call it out, even though it is technically
3804 // superfluous.
3805 #[allow(unused_unsafe)]
3806 let pattern_ids = unsafe {
3807 core::slice::from_raw_parts(
3808 pattern_ids_bytes.as_ptr() as *const u32,
3809 idcount,
3810 )
3811 };
3812
3813 let ms = MatchStates { slices, pattern_ids, patterns };
3814 Ok((ms, slice.as_ptr() as usize - slice_start))
3815 }
3816}
3817
3818#[cfg(feature = "alloc")]
3819impl MatchStates<Vec<u32>> {
3820 fn empty(pattern_count: usize) -> MatchStates<Vec<u32>> {
3821 assert!(pattern_count <= PatternID::LIMIT);
3822 MatchStates {
3823 slices: vec![],
3824 pattern_ids: vec![],
3825 patterns: pattern_count,
3826 }
3827 }
3828
3829 fn new(
3830 matches: &BTreeMap<StateID, Vec<PatternID>>,
3831 pattern_count: usize,
3832 ) -> Result<MatchStates<Vec<u32>>, Error> {
3833 let mut m = MatchStates::empty(pattern_count);
3834 for (_, pids) in matches.iter() {
3835 let start = PatternID::new(m.pattern_ids.len())
3836 .map_err(|_| Error::too_many_match_pattern_ids())?;
3837 m.slices.push(start.as_u32());
3838 // This is always correct since the number of patterns in a single
3839 // match state can never exceed maximum number of allowable
3840 // patterns. Why? Because a pattern can only appear once in a
3841 // particular match state, by construction. (And since our pattern
3842 // ID limit is one less than u32::MAX, we're guaranteed that the
3843 // length fits in a u32.)
3844 m.slices.push(u32::try_from(pids.len()).unwrap());
3845 for &pid in pids {
3846 m.pattern_ids.push(pid.as_u32());
3847 }
3848 }
3849 m.patterns = pattern_count;
3850 Ok(m)
3851 }
3852
3853 fn new_with_map(
3854 &self,
3855 matches: &BTreeMap<StateID, Vec<PatternID>>,
3856 ) -> Result<MatchStates<Vec<u32>>, Error> {
3857 MatchStates::new(matches, self.patterns)
3858 }
3859}
3860
3861impl<T: AsRef<[u32]>> MatchStates<T> {
3862 /// Writes a serialized form of these match states to the buffer given. If
3863 /// the buffer is too small, then an error is returned. To determine how
3864 /// big the buffer must be, use `write_to_len`.
3865 fn write_to<E: Endian>(
3866 &self,
3867 mut dst: &mut [u8],
3868 ) -> Result<usize, SerializeError> {
3869 let nwrite = self.write_to_len();
3870 if dst.len() < nwrite {
3871 return Err(SerializeError::buffer_too_small("match states"));
3872 }
3873 dst = &mut dst[..nwrite];
3874
3875 // write state ID count
3876 // Unwrap is OK since number of states is guaranteed to fit in a u32.
3877 E::write_u32(u32::try_from(self.count()).unwrap(), dst);
3878 dst = &mut dst[size_of::<u32>()..];
3879
3880 // write slice offset pairs
3881 for &pid in self.slices() {
3882 let n = bytes::write_pattern_id::<E>(pid, &mut dst);
3883 dst = &mut dst[n..];
3884 }
3885
3886 // write unique pattern ID count
3887 // Unwrap is OK since number of patterns is guaranteed to fit in a u32.
3888 E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
3889 dst = &mut dst[size_of::<u32>()..];
3890
3891 // write pattern ID count
3892 // Unwrap is OK since we check at construction (and deserialization)
3893 // that the number of patterns is representable as a u32.
3894 E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst);
3895 dst = &mut dst[size_of::<u32>()..];
3896
3897 // write pattern IDs
3898 for &pid in self.pattern_ids() {
3899 let n = bytes::write_pattern_id::<E>(pid, &mut dst);
3900 dst = &mut dst[n..];
3901 }
3902
3903 Ok(nwrite)
3904 }
3905
3906 /// Returns the number of bytes the serialized form of this transition
3907 /// table will use.
3908 fn write_to_len(&self) -> usize {
3909 size_of::<u32>() // match state count
3910 + (self.slices().len() * PatternID::SIZE)
3911 + size_of::<u32>() // unique pattern ID count
3912 + size_of::<u32>() // pattern ID count
3913 + (self.pattern_ids().len() * PatternID::SIZE)
3914 }
3915
3916 /// Valides that the match state info is itself internally consistent and
3917 /// consistent with the recorded match state region in the given DFA.
3918 fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
3919 if self.count() != dfa.special.match_len(dfa.stride()) {
3920 return Err(DeserializeError::generic(
3921 "match state count mismatch",
3922 ));
3923 }
3924 for si in 0..self.count() {
3925 let start = self.slices()[si * 2].as_usize();
3926 let len = self.slices()[si * 2 + 1].as_usize();
3927 if start >= self.pattern_ids().len() {
3928 return Err(DeserializeError::generic(
3929 "invalid pattern ID start offset",
3930 ));
3931 }
3932 if start + len > self.pattern_ids().len() {
3933 return Err(DeserializeError::generic(
3934 "invalid pattern ID length",
3935 ));
3936 }
3937 for mi in 0..len {
3938 let pid = self.pattern_id(si, mi);
3939 if pid.as_usize() >= self.patterns {
3940 return Err(DeserializeError::generic(
3941 "invalid pattern ID",
3942 ));
3943 }
3944 }
3945 }
3946 Ok(())
3947 }
3948
3949 /// Converts these match states back into their map form. This is useful
3950 /// when shuffling states, as the normal MatchStates representation is not
3951 /// amenable to easy state swapping. But with this map, to swap id1 and
3952 /// id2, all you need to do is:
3953 ///
3954 /// if let Some(pids) = map.remove(&id1) {
3955 /// map.insert(id2, pids);
3956 /// }
3957 ///
3958 /// Once shuffling is done, use MatchStates::new to convert back.
3959 #[cfg(feature = "alloc")]
3960 fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
3961 let mut map = BTreeMap::new();
3962 for i in 0..self.count() {
3963 let mut pids = vec![];
3964 for j in 0..self.pattern_len(i) {
3965 pids.push(self.pattern_id(i, j));
3966 }
3967 map.insert(self.match_state_id(dfa, i), pids);
3968 }
3969 map
3970 }
3971
3972 /// Converts these match states to a borrowed value.
3973 fn as_ref(&self) -> MatchStates<&'_ [u32]> {
3974 MatchStates {
3975 slices: self.slices.as_ref(),
3976 pattern_ids: self.pattern_ids.as_ref(),
3977 patterns: self.patterns,
3978 }
3979 }
3980
3981 /// Converts these match states to an owned value.
3982 #[cfg(feature = "alloc")]
3983 fn to_owned(&self) -> MatchStates<Vec<u32>> {
3984 MatchStates {
3985 slices: self.slices.as_ref().to_vec(),
3986 pattern_ids: self.pattern_ids.as_ref().to_vec(),
3987 patterns: self.patterns,
3988 }
3989 }
3990
3991 /// Returns the match state ID given the match state index. (Where the
3992 /// first match state corresponds to index 0.)
3993 ///
3994 /// This panics if there is no match state at the given index.
3995 fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID {
3996 assert!(dfa.special.matches(), "no match states to index");
3997 // This is one of the places where we rely on the fact that match
3998 // states are contiguous in the transition table. Namely, that the
3999 // first match state ID always corresponds to dfa.special.min_start.
4000 // From there, since we know the stride, we can compute the ID of any
4001 // match state given its index.
4002 let stride2 = u32::try_from(dfa.stride2()).unwrap();
4003 let offset = index.checked_shl(stride2).unwrap();
4004 let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap();
4005 let sid = StateID::new(id).unwrap();
4006 assert!(dfa.is_match_state(sid));
4007 sid
4008 }
4009
4010 /// Returns the pattern ID at the given match index for the given match
4011 /// state.
4012 ///
4013 /// The match state index is the state index minus the state index of the
4014 /// first match state in the DFA.
4015 ///
4016 /// The match index is the index of the pattern ID for the given state.
4017 /// The index must be less than `self.pattern_len(state_index)`.
4018 fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
4019 self.pattern_id_slice(state_index)[match_index]
4020 }
4021
4022 /// Returns the number of patterns in the given match state.
4023 ///
4024 /// The match state index is the state index minus the state index of the
4025 /// first match state in the DFA.
4026 fn pattern_len(&self, state_index: usize) -> usize {
4027 self.slices()[state_index * 2 + 1].as_usize()
4028 }
4029
4030 /// Returns all of the pattern IDs for the given match state index.
4031 ///
4032 /// The match state index is the state index minus the state index of the
4033 /// first match state in the DFA.
4034 fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] {
4035 let start = self.slices()[state_index * 2].as_usize();
4036 let len = self.pattern_len(state_index);
4037 &self.pattern_ids()[start..start + len]
4038 }
4039
4040 /// Returns the pattern ID offset slice of u32 as a slice of PatternID.
4041 fn slices(&self) -> &[PatternID] {
4042 let integers = self.slices.as_ref();
4043 // SAFETY: This is safe because PatternID is guaranteed to be
4044 // representable as a u32.
4045 unsafe {
4046 core::slice::from_raw_parts(
4047 integers.as_ptr() as *const PatternID,
4048 integers.len(),
4049 )
4050 }
4051 }
4052
4053 /// Returns the total number of match states.
4054 fn count(&self) -> usize {
4055 assert_eq!(0, self.slices().len() % 2);
4056 self.slices().len() / 2
4057 }
4058
4059 /// Returns the pattern ID slice of u32 as a slice of PatternID.
4060 fn pattern_ids(&self) -> &[PatternID] {
4061 let integers = self.pattern_ids.as_ref();
4062 // SAFETY: This is safe because PatternID is guaranteed to be
4063 // representable as a u32.
4064 unsafe {
4065 core::slice::from_raw_parts(
4066 integers.as_ptr() as *const PatternID,
4067 integers.len(),
4068 )
4069 }
4070 }
4071
4072 /// Return the memory usage, in bytes, of these match pairs.
4073 fn memory_usage(&self) -> usize {
4074 (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE
4075 }
4076}
4077
4078/// An iterator over all states in a DFA.
4079///
4080/// This iterator yields a tuple for each state. The first element of the
4081/// tuple corresponds to a state's identifier, and the second element
4082/// corresponds to the state itself (comprised of its transitions).
4083///
4084/// `'a` corresponding to the lifetime of original DFA, `T` corresponds to
4085/// the type of the transition table itself.
4086pub(crate) struct StateIter<'a, T> {
4087 tt: &'a TransitionTable<T>,
4088 it: iter::Enumerate<slice::Chunks<'a, StateID>>,
4089}
4090
4091impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> {
4092 type Item = State<'a>;
4093
4094 fn next(&mut self) -> Option<State<'a>> {
4095 self.it.next().map(|(index: usize, _)| {
4096 let id: StateID = self.tt.from_index(index);
4097 self.tt.state(id)
4098 })
4099 }
4100}
4101
4102/// An immutable representation of a single DFA state.
4103///
4104/// `'a` correspondings to the lifetime of a DFA's transition table.
4105pub(crate) struct State<'a> {
4106 id: StateID,
4107 stride2: usize,
4108 transitions: &'a [StateID],
4109}
4110
4111impl<'a> State<'a> {
4112 /// Return an iterator over all transitions in this state. This yields
4113 /// a number of transitions equivalent to the alphabet length of the
4114 /// corresponding DFA.
4115 ///
4116 /// Each transition is represented by a tuple. The first element is
4117 /// the input byte for that transition and the second element is the
4118 /// transitions itself.
4119 pub(crate) fn transitions(&self) -> StateTransitionIter<'_> {
4120 StateTransitionIter {
4121 len: self.transitions.len(),
4122 it: self.transitions.iter().enumerate(),
4123 }
4124 }
4125
4126 /// Return an iterator over a sparse representation of the transitions in
4127 /// this state. Only non-dead transitions are returned.
4128 ///
4129 /// The "sparse" representation in this case corresponds to a sequence of
4130 /// triples. The first two elements of the triple comprise an inclusive
4131 /// byte range while the last element corresponds to the transition taken
4132 /// for all bytes in the range.
4133 ///
4134 /// This is somewhat more condensed than the classical sparse
4135 /// representation (where you have an element for every non-dead
4136 /// transition), but in practice, checking if a byte is in a range is very
4137 /// cheap and using ranges tends to conserve quite a bit more space.
4138 pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> {
4139 StateSparseTransitionIter { dense: self.transitions(), cur: None }
4140 }
4141
4142 /// Returns the identifier for this state.
4143 pub(crate) fn id(&self) -> StateID {
4144 self.id
4145 }
4146
4147 /// Analyzes this state to determine whether it can be accelerated. If so,
4148 /// it returns an accelerator that contains at least one byte.
4149 #[cfg(feature = "alloc")]
4150 fn accelerate(&self, classes: &ByteClasses) -> Option<Accel> {
4151 // We just try to add bytes to our accelerator. Once adding fails
4152 // (because we've added too many bytes), then give up.
4153 let mut accel = Accel::new();
4154 for (class, id) in self.transitions() {
4155 if id == self.id() {
4156 continue;
4157 }
4158 for unit in classes.elements(class) {
4159 if let Some(byte) = unit.as_u8() {
4160 if !accel.add(byte) {
4161 return None;
4162 }
4163 }
4164 }
4165 }
4166 if accel.is_empty() {
4167 None
4168 } else {
4169 Some(accel)
4170 }
4171 }
4172}
4173
4174impl<'a> fmt::Debug for State<'a> {
4175 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
4176 for (i: usize, (start: Unit, end: Unit, id: StateID)) in self.sparse_transitions().enumerate() {
4177 let index: usize = if f.alternate() {
4178 id.as_usize()
4179 } else {
4180 id.as_usize() >> self.stride2
4181 };
4182 if i > 0 {
4183 write!(f, ", ")?;
4184 }
4185 if start == end {
4186 write!(f, "{:?} => {:?}", start, index)?;
4187 } else {
4188 write!(f, "{:?}-{:?} => {:?}", start, end, index)?;
4189 }
4190 }
4191 Ok(())
4192 }
4193}
4194
4195/// A mutable representation of a single DFA state.
4196///
4197/// `'a` correspondings to the lifetime of a DFA's transition table.
4198#[cfg(feature = "alloc")]
4199pub(crate) struct StateMut<'a> {
4200 id: StateID,
4201 stride2: usize,
4202 transitions: &'a mut [StateID],
4203}
4204
4205#[cfg(feature = "alloc")]
4206impl<'a> StateMut<'a> {
4207 /// Return an iterator over all transitions in this state. This yields
4208 /// a number of transitions equivalent to the alphabet length of the
4209 /// corresponding DFA.
4210 ///
4211 /// Each transition is represented by a tuple. The first element is the
4212 /// input byte for that transition and the second element is a mutable
4213 /// reference to the transition itself.
4214 pub(crate) fn iter_mut(&mut self) -> StateTransitionIterMut<'_> {
4215 StateTransitionIterMut {
4216 len: self.transitions.len(),
4217 it: self.transitions.iter_mut().enumerate(),
4218 }
4219 }
4220}
4221
4222#[cfg(feature = "alloc")]
4223impl<'a> fmt::Debug for StateMut<'a> {
4224 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
4225 fmt::Debug::fmt(
4226 &State {
4227 id: self.id,
4228 stride2: self.stride2,
4229 transitions: self.transitions,
4230 },
4231 f,
4232 )
4233 }
4234}
4235
4236/// An iterator over all transitions in a single DFA state. This yields
4237/// a number of transitions equivalent to the alphabet length of the
4238/// corresponding DFA.
4239///
4240/// Each transition is represented by a tuple. The first element is the input
4241/// byte for that transition and the second element is the transition itself.
4242#[derive(Debug)]
4243pub(crate) struct StateTransitionIter<'a> {
4244 len: usize,
4245 it: iter::Enumerate<slice::Iter<'a, StateID>>,
4246}
4247
4248impl<'a> Iterator for StateTransitionIter<'a> {
4249 type Item = (alphabet::Unit, StateID);
4250
4251 fn next(&mut self) -> Option<(alphabet::Unit, StateID)> {
4252 self.it.next().map(|(i: usize, &id: StateID)| {
4253 let unit: Unit = if i + 1 == self.len {
4254 alphabet::Unit::eoi(num_byte_equiv_classes:i)
4255 } else {
4256 let b: u8 = u8::try_from(i)
4257 .expect(msg:"raw byte alphabet is never exceeded");
4258 alphabet::Unit::u8(byte:b)
4259 };
4260 (unit, id)
4261 })
4262 }
4263}
4264
4265/// A mutable iterator over all transitions in a DFA state.
4266///
4267/// Each transition is represented by a tuple. The first element is the
4268/// input byte for that transition and the second element is a mutable
4269/// reference to the transition itself.
4270#[cfg(feature = "alloc")]
4271#[derive(Debug)]
4272pub(crate) struct StateTransitionIterMut<'a> {
4273 len: usize,
4274 it: iter::Enumerate<slice::IterMut<'a, StateID>>,
4275}
4276
4277#[cfg(feature = "alloc")]
4278impl<'a> Iterator for StateTransitionIterMut<'a> {
4279 type Item = (alphabet::Unit, &'a mut StateID);
4280
4281 fn next(&mut self) -> Option<(alphabet::Unit, &'a mut StateID)> {
4282 self.it.next().map(|(i, id)| {
4283 let unit = if i + 1 == self.len {
4284 alphabet::Unit::eoi(i)
4285 } else {
4286 let b = u8::try_from(i)
4287 .expect("raw byte alphabet is never exceeded");
4288 alphabet::Unit::u8(b)
4289 };
4290 (unit, id)
4291 })
4292 }
4293}
4294
4295/// An iterator over all non-DEAD transitions in a single DFA state using a
4296/// sparse representation.
4297///
4298/// Each transition is represented by a triple. The first two elements of the
4299/// triple comprise an inclusive byte range while the last element corresponds
4300/// to the transition taken for all bytes in the range.
4301///
4302/// As a convenience, this always returns `alphabet::Unit` values of the same
4303/// type. That is, you'll never get a (byte, EOI) or a (EOI, byte). Only (byte,
4304/// byte) and (EOI, EOI) values are yielded.
4305#[derive(Debug)]
4306pub(crate) struct StateSparseTransitionIter<'a> {
4307 dense: StateTransitionIter<'a>,
4308 cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>,
4309}
4310
4311impl<'a> Iterator for StateSparseTransitionIter<'a> {
4312 type Item = (alphabet::Unit, alphabet::Unit, StateID);
4313
4314 fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> {
4315 while let Some((unit, next)) = self.dense.next() {
4316 let (prev_start, prev_end, prev_next) = match self.cur {
4317 Some(t) => t,
4318 None => {
4319 self.cur = Some((unit, unit, next));
4320 continue;
4321 }
4322 };
4323 if prev_next == next && !unit.is_eoi() {
4324 self.cur = Some((prev_start, unit, prev_next));
4325 } else {
4326 self.cur = Some((unit, unit, next));
4327 if prev_next != DEAD {
4328 return Some((prev_start, prev_end, prev_next));
4329 }
4330 }
4331 }
4332 if let Some((start, end, next)) = self.cur.take() {
4333 if next != DEAD {
4334 return Some((start, end, next));
4335 }
4336 }
4337 None
4338 }
4339}
4340
4341/// An iterator over pattern IDs for a single match state.
4342#[derive(Debug)]
4343pub(crate) struct PatternIDIter<'a>(slice::Iter<'a, PatternID>);
4344
4345impl<'a> Iterator for PatternIDIter<'a> {
4346 type Item = PatternID;
4347
4348 fn next(&mut self) -> Option<PatternID> {
4349 self.0.next().copied()
4350 }
4351}
4352
4353/// Remapper is an abstraction the manages the remapping of state IDs in a
4354/// dense DFA. This is useful when one wants to shuffle states into different
4355/// positions in the DFA.
4356///
4357/// One of the key complexities this manages is the ability to correctly move
4358/// one state multiple times.
4359///
4360/// Once shuffling is complete, `remap` should be called, which will rewrite
4361/// all pertinent transitions to updated state IDs.
4362#[cfg(feature = "alloc")]
4363#[derive(Debug)]
4364struct Remapper {
4365 /// A map from the index of a state to its pre-multiplied identifier.
4366 ///
4367 /// When a state is swapped with another, then their corresponding
4368 /// locations in this map are also swapped. Thus, its new position will
4369 /// still point to its old pre-multiplied StateID.
4370 ///
4371 /// While there is a bit more to it, this then allows us to rewrite the
4372 /// state IDs in a DFA's transition table in a single pass. This is done
4373 /// by iterating over every ID in this map, then iterating over each
4374 /// transition for the state at that ID and re-mapping the transition from
4375 /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
4376 /// in this map where `old_id` *started*, and set it to where it ended up
4377 /// after all swaps have been completed.
4378 map: Vec<StateID>,
4379}
4380
4381#[cfg(feature = "alloc")]
4382impl Remapper {
4383 fn from_dfa(dfa: &OwnedDFA) -> Remapper {
4384 Remapper {
4385 map: (0..dfa.state_count()).map(|i| dfa.from_index(i)).collect(),
4386 }
4387 }
4388
4389 fn swap(&mut self, dfa: &mut OwnedDFA, id1: StateID, id2: StateID) {
4390 dfa.swap_states(id1, id2);
4391 self.map.swap(dfa.to_index(id1), dfa.to_index(id2));
4392 }
4393
4394 fn remap(mut self, dfa: &mut OwnedDFA) {
4395 // Update the map to account for states that have been swapped
4396 // multiple times. For example, if (A, C) and (C, G) are swapped, then
4397 // transitions previously pointing to A should now point to G. But if
4398 // we don't update our map, they will erroneously be set to C. All we
4399 // do is follow the swaps in our map until we see our original state
4400 // ID.
4401 let oldmap = self.map.clone();
4402 for i in 0..dfa.state_count() {
4403 let cur_id = dfa.from_index(i);
4404 let mut new = oldmap[i];
4405 if cur_id == new {
4406 continue;
4407 }
4408 loop {
4409 let id = oldmap[dfa.to_index(new)];
4410 if cur_id == id {
4411 self.map[i] = new;
4412 break;
4413 }
4414 new = id;
4415 }
4416 }
4417
4418 // To work around the borrow checker for converting state IDs to
4419 // indices. We cannot borrow self while mutably iterating over a
4420 // state's transitions. Otherwise, we'd just use dfa.to_index(..).
4421 let stride2 = dfa.stride2();
4422 let to_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
4423
4424 // Now that we've finished shuffling, we need to remap all of our
4425 // transitions. We don't need to handle re-mapping accelerated states
4426 // since `accels` is only populated after shuffling.
4427 for &id in self.map.iter() {
4428 for (_, next_id) in dfa.state_mut(id).iter_mut() {
4429 *next_id = self.map[to_index(*next_id)];
4430 }
4431 }
4432 for start_id in dfa.st.table_mut().iter_mut() {
4433 *start_id = self.map[to_index(*start_id)];
4434 }
4435 }
4436}
4437
4438#[cfg(all(test, feature = "alloc"))]
4439mod tests {
4440 use super::*;
4441
4442 #[test]
4443 fn errors_with_unicode_word_boundary() {
4444 let pattern = r"\b";
4445 assert!(Builder::new().build(pattern).is_err());
4446 }
4447
4448 #[test]
4449 fn roundtrip_never_match() {
4450 let dfa = DFA::never_match().unwrap();
4451 let (buf, _) = dfa.to_bytes_native_endian();
4452 let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
4453
4454 assert_eq!(None, dfa.find_leftmost_fwd(b"foo12345").unwrap());
4455 }
4456
4457 #[test]
4458 fn roundtrip_always_match() {
4459 use crate::HalfMatch;
4460
4461 let dfa = DFA::always_match().unwrap();
4462 let (buf, _) = dfa.to_bytes_native_endian();
4463 let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0;
4464
4465 assert_eq!(
4466 Some(HalfMatch::must(0, 0)),
4467 dfa.find_leftmost_fwd(b"foo12345").unwrap()
4468 );
4469 }
4470}
4471