1#[cfg(feature = "std")]
2use dense::{self, DenseDFA};
3use dfa::DFA;
4#[cfg(feature = "std")]
5use error::Result;
6#[cfg(feature = "std")]
7use sparse::SparseDFA;
8#[cfg(feature = "std")]
9use state_id::StateID;
10
11/// A regular expression that uses deterministic finite automata for fast
12/// searching.
13///
14/// A regular expression is comprised of two DFAs, a "forward" DFA and a
15/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
16/// match while the reverse DFA is responsible for detecting the start of a
17/// match. Thus, in order to find the bounds of any given match, a forward
18/// search must first be run followed by a reverse search. A match found by
19/// the forward DFA guarantees that the reverse DFA will also find a match.
20///
21/// The type of the DFA used by a `Regex` corresponds to the `D` type
22/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
23/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
24/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
25/// search faster, while sparse DFAs use less memory but search more slowly.
26///
27/// By default, a regex's DFA type parameter is set to
28/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
29/// most convenient type that gives the best search performance.
30///
31/// # Sparse DFAs
32///
33/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
34/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
35/// enough to build corresponding sparse DFAs, and then build a regex from
36/// them:
37///
38/// ```
39/// use regex_automata::Regex;
40///
41/// # fn example() -> Result<(), regex_automata::Error> {
42/// // First, build a regex that uses dense DFAs.
43/// let dense_re = Regex::new("foo[0-9]+")?;
44///
45/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
46/// let fwd = dense_re.forward().to_sparse()?;
47/// let rev = dense_re.reverse().to_sparse()?;
48///
49/// // Third, build a new regex from the constituent sparse DFAs.
50/// let sparse_re = Regex::from_dfas(fwd, rev);
51///
52/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
53/// assert_eq!(true, sparse_re.is_match(b"foo123"));
54/// # Ok(()) }; example().unwrap()
55/// ```
56#[cfg(feature = "std")]
57#[derive(Clone, Debug)]
58pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
59 forward: D,
60 reverse: D,
61}
62
63/// A regular expression that uses deterministic finite automata for fast
64/// searching.
65///
66/// A regular expression is comprised of two DFAs, a "forward" DFA and a
67/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
68/// match while the reverse DFA is responsible for detecting the start of a
69/// match. Thus, in order to find the bounds of any given match, a forward
70/// search must first be run followed by a reverse search. A match found by
71/// the forward DFA guarantees that the reverse DFA will also find a match.
72///
73/// The type of the DFA used by a `Regex` corresponds to the `D` type
74/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
75/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
76/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
77/// search faster, while sparse DFAs use less memory but search more slowly.
78///
79/// When using this crate without the standard library, the `Regex` type has
80/// no default type parameter.
81///
82/// # Sparse DFAs
83///
84/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
85/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
86/// enough to build corresponding sparse DFAs, and then build a regex from
87/// them:
88///
89/// ```
90/// use regex_automata::Regex;
91///
92/// # fn example() -> Result<(), regex_automata::Error> {
93/// // First, build a regex that uses dense DFAs.
94/// let dense_re = Regex::new("foo[0-9]+")?;
95///
96/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
97/// let fwd = dense_re.forward().to_sparse()?;
98/// let rev = dense_re.reverse().to_sparse()?;
99///
100/// // Third, build a new regex from the constituent sparse DFAs.
101/// let sparse_re = Regex::from_dfas(fwd, rev);
102///
103/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
104/// assert_eq!(true, sparse_re.is_match(b"foo123"));
105/// # Ok(()) }; example().unwrap()
106/// ```
107#[cfg(not(feature = "std"))]
108#[derive(Clone, Debug)]
109pub struct Regex<D> {
110 forward: D,
111 reverse: D,
112}
113
114#[cfg(feature = "std")]
115impl Regex {
116 /// Parse the given regular expression using a default configuration and
117 /// return the corresponding regex.
118 ///
119 /// The default configuration uses `usize` for state IDs, premultiplies
120 /// them and reduces the alphabet size by splitting bytes into equivalence
121 /// classes. The underlying DFAs are *not* minimized.
122 ///
123 /// If you want a non-default configuration, then use the
124 /// [`RegexBuilder`](struct.RegexBuilder.html)
125 /// to set your own configuration.
126 ///
127 /// # Example
128 ///
129 /// ```
130 /// use regex_automata::Regex;
131 ///
132 /// # fn example() -> Result<(), regex_automata::Error> {
133 /// let re = Regex::new("foo[0-9]+bar")?;
134 /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
135 /// # Ok(()) }; example().unwrap()
136 /// ```
137 pub fn new(pattern: &str) -> Result<Regex> {
138 RegexBuilder::new().build(pattern)
139 }
140}
141
142#[cfg(feature = "std")]
143impl Regex<SparseDFA<Vec<u8>, usize>> {
144 /// Parse the given regular expression using a default configuration and
145 /// return the corresponding regex using sparse DFAs.
146 ///
147 /// The default configuration uses `usize` for state IDs, reduces the
148 /// alphabet size by splitting bytes into equivalence classes. The
149 /// underlying DFAs are *not* minimized.
150 ///
151 /// If you want a non-default configuration, then use the
152 /// [`RegexBuilder`](struct.RegexBuilder.html)
153 /// to set your own configuration.
154 ///
155 /// # Example
156 ///
157 /// ```
158 /// use regex_automata::Regex;
159 ///
160 /// # fn example() -> Result<(), regex_automata::Error> {
161 /// let re = Regex::new_sparse("foo[0-9]+bar")?;
162 /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
163 /// # Ok(()) }; example().unwrap()
164 /// ```
165 pub fn new_sparse(
166 pattern: &str,
167 ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
168 RegexBuilder::new().build_sparse(pattern)
169 }
170}
171
172impl<D: DFA> Regex<D> {
173 /// Returns true if and only if the given bytes match.
174 ///
175 /// This routine may short circuit if it knows that scanning future input
176 /// will never lead to a different result. In particular, if the underlying
177 /// DFA enters a match state or a dead state, then this routine will return
178 /// `true` or `false`, respectively, without inspecting any future input.
179 ///
180 /// # Example
181 ///
182 /// ```
183 /// use regex_automata::Regex;
184 ///
185 /// # fn example() -> Result<(), regex_automata::Error> {
186 /// let re = Regex::new("foo[0-9]+bar")?;
187 /// assert_eq!(true, re.is_match(b"foo12345bar"));
188 /// assert_eq!(false, re.is_match(b"foobar"));
189 /// # Ok(()) }; example().unwrap()
190 /// ```
191 pub fn is_match(&self, input: &[u8]) -> bool {
192 self.is_match_at(input, 0)
193 }
194
195 /// Returns the first position at which a match is found.
196 ///
197 /// This routine stops scanning input in precisely the same circumstances
198 /// as `is_match`. The key difference is that this routine returns the
199 /// position at which it stopped scanning input if and only if a match
200 /// was found. If no match is found, then `None` is returned.
201 ///
202 /// # Example
203 ///
204 /// ```
205 /// use regex_automata::Regex;
206 ///
207 /// # fn example() -> Result<(), regex_automata::Error> {
208 /// let re = Regex::new("foo[0-9]+")?;
209 /// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
210 ///
211 /// // Normally, the end of the leftmost first match here would be 3,
212 /// // but the shortest match semantics detect a match earlier.
213 /// let re = Regex::new("abc|a")?;
214 /// assert_eq!(Some(1), re.shortest_match(b"abc"));
215 /// # Ok(()) }; example().unwrap()
216 /// ```
217 pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
218 self.shortest_match_at(input, 0)
219 }
220
221 /// Returns the start and end offset of the leftmost first match. If no
222 /// match exists, then `None` is returned.
223 ///
224 /// The "leftmost first" match corresponds to the match with the smallest
225 /// starting offset, but where the end offset is determined by preferring
226 /// earlier branches in the original regular expression. For example,
227 /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will
228 /// match `Samwise` in `Samwise`.
229 ///
230 /// Generally speaking, the "leftmost first" match is how most backtracking
231 /// regular expressions tend to work. This is in contrast to POSIX-style
232 /// regular expressions that yield "leftmost longest" matches. Namely,
233 /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
234 /// leftmost longest semantics.
235 ///
236 /// # Example
237 ///
238 /// ```
239 /// use regex_automata::Regex;
240 ///
241 /// # fn example() -> Result<(), regex_automata::Error> {
242 /// let re = Regex::new("foo[0-9]+")?;
243 /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
244 ///
245 /// // Even though a match is found after reading the first byte (`a`),
246 /// // the leftmost first match semantics demand that we find the earliest
247 /// // match that prefers earlier parts of the pattern over latter parts.
248 /// let re = Regex::new("abc|a")?;
249 /// assert_eq!(Some((0, 3)), re.find(b"abc"));
250 /// # Ok(()) }; example().unwrap()
251 /// ```
252 pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
253 self.find_at(input, 0)
254 }
255
256 /// Returns the same as `is_match`, but starts the search at the given
257 /// offset.
258 ///
259 /// The significance of the starting point is that it takes the surrounding
260 /// context into consideration. For example, if the DFA is anchored, then
261 /// a match can only occur when `start == 0`.
262 pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
263 self.forward().is_match_at(input, start)
264 }
265
266 /// Returns the same as `shortest_match`, but starts the search at the
267 /// given offset.
268 ///
269 /// The significance of the starting point is that it takes the surrounding
270 /// context into consideration. For example, if the DFA is anchored, then
271 /// a match can only occur when `start == 0`.
272 pub fn shortest_match_at(
273 &self,
274 input: &[u8],
275 start: usize,
276 ) -> Option<usize> {
277 self.forward().shortest_match_at(input, start)
278 }
279
280 /// Returns the same as `find`, but starts the search at the given
281 /// offset.
282 ///
283 /// The significance of the starting point is that it takes the surrounding
284 /// context into consideration. For example, if the DFA is anchored, then
285 /// a match can only occur when `start == 0`.
286 pub fn find_at(
287 &self,
288 input: &[u8],
289 start: usize,
290 ) -> Option<(usize, usize)> {
291 let end = match self.forward().find_at(input, start) {
292 None => return None,
293 Some(end) => end,
294 };
295 let start = self
296 .reverse()
297 .rfind(&input[start..end])
298 .map(|i| start + i)
299 .expect("reverse search must match if forward search does");
300 Some((start, end))
301 }
302
303 /// Returns an iterator over all non-overlapping leftmost first matches
304 /// in the given bytes. If no match exists, then the iterator yields no
305 /// elements.
306 ///
307 /// Note that if the regex can match the empty string, then it is
308 /// possible for the iterator to yield a zero-width match at a location
309 /// that is not a valid UTF-8 boundary (for example, between the code units
310 /// of a UTF-8 encoded codepoint). This can happen regardless of whether
311 /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
312 /// was enabled or not.
313 ///
314 /// # Example
315 ///
316 /// ```
317 /// use regex_automata::Regex;
318 ///
319 /// # fn example() -> Result<(), regex_automata::Error> {
320 /// let re = Regex::new("foo[0-9]+")?;
321 /// let text = b"foo1 foo12 foo123";
322 /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
323 /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
324 /// # Ok(()) }; example().unwrap()
325 /// ```
326 pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
327 Matches::new(self, input)
328 }
329
330 /// Build a new regex from its constituent forward and reverse DFAs.
331 ///
332 /// This is useful when deserializing a regex from some arbitrary
333 /// memory region. This is also useful for building regexes from other
334 /// types of DFAs.
335 ///
336 /// # Example
337 ///
338 /// This example is a bit a contrived. The usual use of these methods
339 /// would involve serializing `initial_re` somewhere and then deserializing
340 /// it later to build a regex.
341 ///
342 /// ```
343 /// use regex_automata::Regex;
344 ///
345 /// # fn example() -> Result<(), regex_automata::Error> {
346 /// let initial_re = Regex::new("foo[0-9]+")?;
347 /// assert_eq!(true, initial_re.is_match(b"foo123"));
348 ///
349 /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
350 /// let re = Regex::from_dfas(fwd, rev);
351 /// assert_eq!(true, re.is_match(b"foo123"));
352 /// # Ok(()) }; example().unwrap()
353 /// ```
354 ///
355 /// This example shows how you might build smaller DFAs, and then use those
356 /// smaller DFAs to build a new regex.
357 ///
358 /// ```
359 /// use regex_automata::Regex;
360 ///
361 /// # fn example() -> Result<(), regex_automata::Error> {
362 /// let initial_re = Regex::new("foo[0-9]+")?;
363 /// assert_eq!(true, initial_re.is_match(b"foo123"));
364 ///
365 /// let fwd = initial_re.forward().to_u16()?;
366 /// let rev = initial_re.reverse().to_u16()?;
367 /// let re = Regex::from_dfas(fwd, rev);
368 /// assert_eq!(true, re.is_match(b"foo123"));
369 /// # Ok(()) }; example().unwrap()
370 /// ```
371 ///
372 /// This example shows how to build a `Regex` that uses sparse DFAs instead
373 /// of dense DFAs:
374 ///
375 /// ```
376 /// use regex_automata::Regex;
377 ///
378 /// # fn example() -> Result<(), regex_automata::Error> {
379 /// let initial_re = Regex::new("foo[0-9]+")?;
380 /// assert_eq!(true, initial_re.is_match(b"foo123"));
381 ///
382 /// let fwd = initial_re.forward().to_sparse()?;
383 /// let rev = initial_re.reverse().to_sparse()?;
384 /// let re = Regex::from_dfas(fwd, rev);
385 /// assert_eq!(true, re.is_match(b"foo123"));
386 /// # Ok(()) }; example().unwrap()
387 /// ```
388 pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
389 Regex { forward, reverse }
390 }
391
392 /// Return the underlying DFA responsible for forward matching.
393 pub fn forward(&self) -> &D {
394 &self.forward
395 }
396
397 /// Return the underlying DFA responsible for reverse matching.
398 pub fn reverse(&self) -> &D {
399 &self.reverse
400 }
401}
402
403/// An iterator over all non-overlapping matches for a particular search.
404///
405/// The iterator yields a `(usize, usize)` value until no more matches could be
406/// found. The first `usize` is the start of the match (inclusive) while the
407/// second `usize` is the end of the match (exclusive).
408///
409/// `S` is the type used to represent state identifiers in the underlying
410/// regex. The lifetime variables are as follows:
411///
412/// * `'r` is the lifetime of the regular expression value itself.
413/// * `'t` is the lifetime of the text being searched.
414#[derive(Clone, Debug)]
415pub struct Matches<'r, 't, D: DFA + 'r> {
416 re: &'r Regex<D>,
417 text: &'t [u8],
418 last_end: usize,
419 last_match: Option<usize>,
420}
421
422impl<'r, 't, D: DFA> Matches<'r, 't, D> {
423 fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
424 Matches { re, text, last_end: 0, last_match: None }
425 }
426}
427
428impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
429 type Item = (usize, usize);
430
431 fn next(&mut self) -> Option<(usize, usize)> {
432 if self.last_end > self.text.len() {
433 return None;
434 }
435 let (s, e) = match self.re.find_at(self.text, self.last_end) {
436 None => return None,
437 Some((s, e)) => (s, e),
438 };
439 if s == e {
440 // This is an empty match. To ensure we make progress, start
441 // the next search at the smallest possible starting position
442 // of the next match following this one.
443 self.last_end = e + 1;
444 // Don't accept empty matches immediately following a match.
445 // Just move on to the next match.
446 if Some(e) == self.last_match {
447 return self.next();
448 }
449 } else {
450 self.last_end = e;
451 }
452 self.last_match = Some(e);
453 Some((s, e))
454 }
455}
456
457/// A builder for a regex based on deterministic finite automatons.
458///
459/// This builder permits configuring several aspects of the construction
460/// process such as case insensitivity, Unicode support and various options
461/// that impact the size of the underlying DFAs. In some cases, options (like
462/// performing DFA minimization) can come with a substantial additional cost.
463///
464/// This builder generally constructs two DFAs, where one is responsible for
465/// finding the end of a match and the other is responsible for finding the
466/// start of a match. If you only need to detect whether something matched,
467/// or only the end of a match, then you should use a
468/// [`dense::Builder`](dense/struct.Builder.html)
469/// to construct a single DFA, which is cheaper than building two DFAs.
470#[cfg(feature = "std")]
471#[derive(Clone, Debug)]
472pub struct RegexBuilder {
473 dfa: dense::Builder,
474}
475
476#[cfg(feature = "std")]
477impl RegexBuilder {
478 /// Create a new regex builder with the default configuration.
479 pub fn new() -> RegexBuilder {
480 RegexBuilder { dfa: dense::Builder::new() }
481 }
482
483 /// Build a regex from the given pattern.
484 ///
485 /// If there was a problem parsing or compiling the pattern, then an error
486 /// is returned.
487 pub fn build(&self, pattern: &str) -> Result<Regex> {
488 self.build_with_size::<usize>(pattern)
489 }
490
491 /// Build a regex from the given pattern using sparse DFAs.
492 ///
493 /// If there was a problem parsing or compiling the pattern, then an error
494 /// is returned.
495 pub fn build_sparse(
496 &self,
497 pattern: &str,
498 ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
499 self.build_with_size_sparse::<usize>(pattern)
500 }
501
502 /// Build a regex from the given pattern using a specific representation
503 /// for the underlying DFA state IDs.
504 ///
505 /// If there was a problem parsing or compiling the pattern, then an error
506 /// is returned.
507 ///
508 /// The representation of state IDs is determined by the `S` type
509 /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
510 /// or `usize`, where `usize` is the default used for `build`. The purpose
511 /// of specifying a representation for state IDs is to reduce the memory
512 /// footprint of the underlying DFAs.
513 ///
514 /// When using this routine, the chosen state ID representation will be
515 /// used throughout determinization and minimization, if minimization was
516 /// requested. Even if the minimized DFAs can fit into the chosen state ID
517 /// representation but the initial determinized DFA cannot, then this will
518 /// still return an error. To get a minimized DFA with a smaller state ID
519 /// representation, first build it with a bigger state ID representation,
520 /// and then shrink the sizes of the DFAs using one of its conversion
521 /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
522 /// Finally, reconstitute the regex via
523 /// [`Regex::from_dfa`](struct.Regex.html#method.from_dfa).
524 pub fn build_with_size<S: StateID>(
525 &self,
526 pattern: &str,
527 ) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
528 let forward = self.dfa.build_with_size(pattern)?;
529 let reverse = self
530 .dfa
531 .clone()
532 .anchored(true)
533 .reverse(true)
534 .longest_match(true)
535 .build_with_size(pattern)?;
536 Ok(Regex::from_dfas(forward, reverse))
537 }
538
539 /// Build a regex from the given pattern using a specific representation
540 /// for the underlying DFA state IDs using sparse DFAs.
541 pub fn build_with_size_sparse<S: StateID>(
542 &self,
543 pattern: &str,
544 ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
545 let re = self.build_with_size(pattern)?;
546 let fwd = re.forward().to_sparse()?;
547 let rev = re.reverse().to_sparse()?;
548 Ok(Regex::from_dfas(fwd, rev))
549 }
550
551 /// Set whether matching must be anchored at the beginning of the input.
552 ///
553 /// When enabled, a match must begin at the start of the input. When
554 /// disabled, the regex will act as if the pattern started with a `.*?`,
555 /// which enables a match to appear anywhere.
556 ///
557 /// By default this is disabled.
558 pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
559 self.dfa.anchored(yes);
560 self
561 }
562
563 /// Enable or disable the case insensitive flag by default.
564 ///
565 /// By default this is disabled. It may alternatively be selectively
566 /// enabled in the regular expression itself via the `i` flag.
567 pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
568 self.dfa.case_insensitive(yes);
569 self
570 }
571
572 /// Enable verbose mode in the regular expression.
573 ///
574 /// When enabled, verbose mode permits insigificant whitespace in many
575 /// places in the regular expression, as well as comments. Comments are
576 /// started using `#` and continue until the end of the line.
577 ///
578 /// By default, this is disabled. It may be selectively enabled in the
579 /// regular expression by using the `x` flag regardless of this setting.
580 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
581 self.dfa.ignore_whitespace(yes);
582 self
583 }
584
585 /// Enable or disable the "dot matches any character" flag by default.
586 ///
587 /// By default this is disabled. It may alternatively be selectively
588 /// enabled in the regular expression itself via the `s` flag.
589 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
590 self.dfa.dot_matches_new_line(yes);
591 self
592 }
593
594 /// Enable or disable the "swap greed" flag by default.
595 ///
596 /// By default this is disabled. It may alternatively be selectively
597 /// enabled in the regular expression itself via the `U` flag.
598 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
599 self.dfa.swap_greed(yes);
600 self
601 }
602
603 /// Enable or disable the Unicode flag (`u`) by default.
604 ///
605 /// By default this is **enabled**. It may alternatively be selectively
606 /// disabled in the regular expression itself via the `u` flag.
607 ///
608 /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
609 /// default), a regular expression will fail to parse if Unicode mode is
610 /// disabled and a sub-expression could possibly match invalid UTF-8.
611 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
612 self.dfa.unicode(yes);
613 self
614 }
615
616 /// When enabled, the builder will permit the construction of a regular
617 /// expression that may match invalid UTF-8.
618 ///
619 /// When disabled (the default), the builder is guaranteed to produce a
620 /// regex that will only ever match valid UTF-8 (otherwise, the builder
621 /// will return an error).
622 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
623 self.dfa.allow_invalid_utf8(yes);
624 self
625 }
626
627 /// Set the nesting limit used for the regular expression parser.
628 ///
629 /// The nesting limit controls how deep the abstract syntax tree is allowed
630 /// to be. If the AST exceeds the given limit (e.g., with too many nested
631 /// groups), then an error is returned by the parser.
632 ///
633 /// The purpose of this limit is to act as a heuristic to prevent stack
634 /// overflow when building a finite automaton from a regular expression's
635 /// abstract syntax tree. In particular, construction currently uses
636 /// recursion. In the future, the implementation may stop using recursion
637 /// and this option will no longer be necessary.
638 ///
639 /// This limit is not checked until the entire AST is parsed. Therefore,
640 /// if callers want to put a limit on the amount of heap space used, then
641 /// they should impose a limit on the length, in bytes, of the concrete
642 /// pattern string. In particular, this is viable since the parser will
643 /// limit itself to heap space proportional to the lenth of the pattern
644 /// string.
645 ///
646 /// Note that a nest limit of `0` will return a nest limit error for most
647 /// patterns but not all. For example, a nest limit of `0` permits `a` but
648 /// not `ab`, since `ab` requires a concatenation AST item, which results
649 /// in a nest depth of `1`. In general, a nest limit is not something that
650 /// manifests in an obvious way in the concrete syntax, therefore, it
651 /// should not be used in a granular way.
652 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
653 self.dfa.nest_limit(limit);
654 self
655 }
656
657 /// Minimize the underlying DFAs.
658 ///
659 /// When enabled, the DFAs powering the resulting regex will be minimized
660 /// such that it is as small as possible.
661 ///
662 /// Whether one enables minimization or not depends on the types of costs
663 /// you're willing to pay and how much you care about its benefits. In
664 /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
665 /// space, where `n` is the number of DFA states and `k` is the alphabet
666 /// size. In practice, minimization can be quite costly in terms of both
667 /// space and time, so it should only be done if you're willing to wait
668 /// longer to produce a DFA. In general, you might want a minimal DFA in
669 /// the following circumstances:
670 ///
671 /// 1. You would like to optimize for the size of the automaton. This can
672 /// manifest in one of two ways. Firstly, if you're converting the
673 /// DFA into Rust code (or a table embedded in the code), then a minimal
674 /// DFA will translate into a corresponding reduction in code size, and
675 /// thus, also the final compiled binary size. Secondly, if you are
676 /// building many DFAs and putting them on the heap, you'll be able to
677 /// fit more if they are smaller. Note though that building a minimal
678 /// DFA itself requires additional space; you only realize the space
679 /// savings once the minimal DFA is constructed (at which point, the
680 /// space used for minimization is freed).
681 /// 2. You've observed that a smaller DFA results in faster match
682 /// performance. Naively, this isn't guaranteed since there is no
683 /// inherent difference between matching with a bigger-than-minimal
684 /// DFA and a minimal DFA. However, a smaller DFA may make use of your
685 /// CPU's cache more efficiently.
686 /// 3. You are trying to establish an equivalence between regular
687 /// languages. The standard method for this is to build a minimal DFA
688 /// for each language and then compare them. If the DFAs are equivalent
689 /// (up to state renaming), then the languages are equivalent.
690 ///
691 /// This option is disabled by default.
692 pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
693 self.dfa.minimize(yes);
694 self
695 }
696
697 /// Premultiply state identifiers in the underlying DFA transition tables.
698 ///
699 /// When enabled, state identifiers are premultiplied to point to their
700 /// corresponding row in the DFA's transition table. That is, given the
701 /// `i`th state, its corresponding premultiplied identifier is `i * k`
702 /// where `k` is the alphabet size of the DFA. (The alphabet size is at
703 /// most 256, but is in practice smaller if byte classes is enabled.)
704 ///
705 /// When state identifiers are not premultiplied, then the identifier of
706 /// the `i`th state is `i`.
707 ///
708 /// The advantage of premultiplying state identifiers is that is saves
709 /// a multiplication instruction per byte when searching with the DFA.
710 /// This has been observed to lead to a 20% performance benefit in
711 /// micro-benchmarks.
712 ///
713 /// The primary disadvantage of premultiplying state identifiers is
714 /// that they require a larger integer size to represent. For example,
715 /// if your DFA has 200 states, then its premultiplied form requires
716 /// 16 bits to represent every possible state identifier, where as its
717 /// non-premultiplied form only requires 8 bits.
718 ///
719 /// This option is enabled by default.
720 pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
721 self.dfa.premultiply(yes);
722 self
723 }
724
725 /// Shrink the size of the underlying DFA alphabet by mapping bytes to
726 /// their equivalence classes.
727 ///
728 /// When enabled, each DFA will use a map from all possible bytes to their
729 /// corresponding equivalence class. Each equivalence class represents a
730 /// set of bytes that does not discriminate between a match and a non-match
731 /// in the DFA. For example, the pattern `[ab]+` has at least two
732 /// equivalence classes: a set containing `a` and `b` and a set containing
733 /// every byte except for `a` and `b`. `a` and `b` are in the same
734 /// equivalence classes because they never discriminate between a match
735 /// and a non-match.
736 ///
737 /// The advantage of this map is that the size of the transition table can
738 /// be reduced drastically from `#states * 256 * sizeof(id)` to
739 /// `#states * k * sizeof(id)` where `k` is the number of equivalence
740 /// classes. As a result, total space usage can decrease substantially.
741 /// Moreover, since a smaller alphabet is used, compilation becomes faster
742 /// as well.
743 ///
744 /// The disadvantage of this map is that every byte searched must be
745 /// passed through this map before it can be used to determine the next
746 /// transition. This has a small match time performance cost.
747 ///
748 /// This option is enabled by default.
749 pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
750 self.dfa.byte_classes(yes);
751 self
752 }
753
754 /// Apply best effort heuristics to shrink the NFA at the expense of more
755 /// time/memory.
756 ///
757 /// This may be exposed in the future, but for now is exported for use in
758 /// the `regex-automata-debug` tool.
759 #[doc(hidden)]
760 pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
761 self.dfa.shrink(yes);
762 self
763 }
764}
765
766#[cfg(feature = "std")]
767impl Default for RegexBuilder {
768 fn default() -> RegexBuilder {
769 RegexBuilder::new()
770 }
771}
772