1#![allow(warnings)]
2
3// This module defines an internal builder that encapsulates all interaction
4// with meta::Regex construction, and then 4 public API builders that wrap
5// around it. The docs are essentially repeated on each of the 4 public
6// builders, with tweaks to the examples as needed.
7//
8// The reason why there are so many builders is partially because of a misstep
9// in the initial API design: the builder constructor takes in the pattern
10// strings instead of using the `build` method to accept the pattern strings.
11// This means `new` has a different signature for each builder. It probably
// would have been nicer to use one builder with `fn new()`, and then add
13// `build(pat)` and `build_many(pats)` constructors.
14//
15// The other reason is because I think the `bytes` module should probably
16// have its own builder type. That way, it is completely isolated from the
17// top-level API.
18//
19// If I could do it again, I'd probably have a `regex::Builder` and a
20// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
21// `build_many`) methods for constructing a single pattern `Regex` and a
22// multi-pattern `RegexSet`, respectively.
23
24use alloc::{
25 string::{String, ToString},
26 sync::Arc,
27 vec,
28 vec::Vec,
29};
30
31use regex_automata::{
32 meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
33};
34
35use crate::error::Error;
36
37/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
38/// `bytes::RegexSet`.
39///
40/// This is essentially the implementation of the four different builder types
41/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
42/// and `bytes::RegexSetBuilder`.
43#[derive(Clone, Debug)]
44struct Builder {
45 pats: Vec<String>,
46 metac: meta::Config,
47 syntaxc: syntax::Config,
48}
49
50impl Default for Builder {
51 fn default() -> Builder {
52 let metac = meta::Config::new()
53 .nfa_size_limit(Some(10 * (1 << 20)))
54 .hybrid_cache_capacity(2 * (1 << 20));
55 Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
56 }
57}
58
59impl Builder {
60 fn new<I, S>(patterns: I) -> Builder
61 where
62 S: AsRef<str>,
63 I: IntoIterator<Item = S>,
64 {
65 let mut b = Builder::default();
66 b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string()));
67 b
68 }
69
70 fn build_one_string(&self) -> Result<crate::Regex, Error> {
71 assert_eq!(1, self.pats.len());
72 let metac = self
73 .metac
74 .clone()
75 .match_kind(MatchKind::LeftmostFirst)
76 .utf8_empty(true);
77 let syntaxc = self.syntaxc.clone().utf8(true);
78 let pattern = Arc::from(self.pats[0].as_str());
79 meta::Builder::new()
80 .configure(metac)
81 .syntax(syntaxc)
82 .build(&pattern)
83 .map(|meta| crate::Regex { meta, pattern })
84 .map_err(Error::from_meta_build_error)
85 }
86
87 fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
88 assert_eq!(1, self.pats.len());
89 let metac = self
90 .metac
91 .clone()
92 .match_kind(MatchKind::LeftmostFirst)
93 .utf8_empty(false);
94 let syntaxc = self.syntaxc.clone().utf8(false);
95 let pattern = Arc::from(self.pats[0].as_str());
96 meta::Builder::new()
97 .configure(metac)
98 .syntax(syntaxc)
99 .build(&pattern)
100 .map(|meta| crate::bytes::Regex { meta, pattern })
101 .map_err(Error::from_meta_build_error)
102 }
103
104 fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
105 let metac = self
106 .metac
107 .clone()
108 .match_kind(MatchKind::All)
109 .utf8_empty(true)
110 .which_captures(WhichCaptures::None);
111 let syntaxc = self.syntaxc.clone().utf8(true);
112 let patterns = Arc::from(self.pats.as_slice());
113 meta::Builder::new()
114 .configure(metac)
115 .syntax(syntaxc)
116 .build_many(&patterns)
117 .map(|meta| crate::RegexSet { meta, patterns })
118 .map_err(Error::from_meta_build_error)
119 }
120
121 fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
122 let metac = self
123 .metac
124 .clone()
125 .match_kind(MatchKind::All)
126 .utf8_empty(false)
127 .which_captures(WhichCaptures::None);
128 let syntaxc = self.syntaxc.clone().utf8(false);
129 let patterns = Arc::from(self.pats.as_slice());
130 meta::Builder::new()
131 .configure(metac)
132 .syntax(syntaxc)
133 .build_many(&patterns)
134 .map(|meta| crate::bytes::RegexSet { meta, patterns })
135 .map_err(Error::from_meta_build_error)
136 }
137
138 fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
139 self.syntaxc = self.syntaxc.case_insensitive(yes);
140 self
141 }
142
143 fn multi_line(&mut self, yes: bool) -> &mut Builder {
144 self.syntaxc = self.syntaxc.multi_line(yes);
145 self
146 }
147
148 fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
149 self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
150 self
151 }
152
153 fn crlf(&mut self, yes: bool) -> &mut Builder {
154 self.syntaxc = self.syntaxc.crlf(yes);
155 self
156 }
157
158 fn line_terminator(&mut self, byte: u8) -> &mut Builder {
159 self.metac = self.metac.clone().line_terminator(byte);
160 self.syntaxc = self.syntaxc.line_terminator(byte);
161 self
162 }
163
164 fn swap_greed(&mut self, yes: bool) -> &mut Builder {
165 self.syntaxc = self.syntaxc.swap_greed(yes);
166 self
167 }
168
169 fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
170 self.syntaxc = self.syntaxc.ignore_whitespace(yes);
171 self
172 }
173
174 fn unicode(&mut self, yes: bool) -> &mut Builder {
175 self.syntaxc = self.syntaxc.unicode(yes);
176 self
177 }
178
179 fn octal(&mut self, yes: bool) -> &mut Builder {
180 self.syntaxc = self.syntaxc.octal(yes);
181 self
182 }
183
184 fn size_limit(&mut self, limit: usize) -> &mut Builder {
185 self.metac = self.metac.clone().nfa_size_limit(Some(limit));
186 self
187 }
188
189 fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
190 self.metac = self.metac.clone().hybrid_cache_capacity(limit);
191 self
192 }
193
194 fn nest_limit(&mut self, limit: u32) -> &mut Builder {
195 self.syntaxc = self.syntaxc.nest_limit(limit);
196 self
197 }
198}
199
200pub(crate) mod string {
201 use crate::{error::Error, Regex, RegexSet};
202
203 use super::Builder;
204
205 /// A configurable builder for a [`Regex`].
206 ///
207 /// This builder can be used to programmatically set flags such as `i`
208 /// (case insensitive) and `x` (for verbose mode). This builder can also be
209 /// used to configure things like the line terminator and a size limit on
210 /// the compiled regular expression.
211 #[derive(Clone, Debug)]
212 pub struct RegexBuilder {
213 builder: Builder,
214 }
215
216 impl RegexBuilder {
217 /// Create a new builder with a default configuration for the given
218 /// pattern.
219 ///
220 /// If the pattern is invalid or exceeds the configured size limits,
221 /// then an error will be returned when [`RegexBuilder::build`] is
222 /// called.
223 pub fn new(pattern: &str) -> RegexBuilder {
224 RegexBuilder { builder: Builder::new([pattern]) }
225 }
226
227 /// Compiles the pattern given to `RegexBuilder::new` with the
228 /// configuration set on this builder.
229 ///
230 /// If the pattern isn't a valid regex or if a configured size limit
231 /// was exceeded, then an error is returned.
232 pub fn build(&self) -> Result<Regex, Error> {
233 self.builder.build_one_string()
234 }
235
236 /// This configures Unicode mode for the entire pattern.
237 ///
238 /// Enabling Unicode mode does a number of things:
239 ///
240 /// * Most fundamentally, it causes the fundamental atom of matching
241 /// to be a single codepoint. When Unicode mode is disabled, it's a
242 /// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
244 /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
245 /// * Case insensitive matching uses Unicode simple case folding rules.
246 /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
247 /// available.
248 /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
249 /// `\d`.
250 /// * The word boundary assertions, `\b` and `\B`, use the Unicode
251 /// definition of a word character.
252 ///
253 /// Note that if Unicode mode is disabled, then the regex will fail to
254 /// compile if it could match invalid UTF-8. For example, when Unicode
255 /// mode is disabled, then since `.` matches any byte (except for
256 /// `\n`), then it can match invalid UTF-8 and thus building a regex
257 /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
258 /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
259 /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
260 /// and so it is not allowed. This restriction can be lifted only by
261 /// using a [`bytes::Regex`](crate::bytes::Regex).
262 ///
263 /// For more details on the Unicode support in this crate, see the
264 /// [Unicode section](crate#unicode) in this crate's top-level
265 /// documentation.
266 ///
267 /// The default for this is `true`.
268 ///
269 /// # Example
270 ///
271 /// ```
272 /// use regex::RegexBuilder;
273 ///
274 /// let re = RegexBuilder::new(r"\w")
275 /// .unicode(false)
276 /// .build()
277 /// .unwrap();
278 /// // Normally greek letters would be included in \w, but since
279 /// // Unicode mode is disabled, it only matches ASCII letters.
280 /// assert!(!re.is_match("δ"));
281 ///
282 /// let re = RegexBuilder::new(r"s")
283 /// .case_insensitive(true)
284 /// .unicode(false)
285 /// .build()
286 /// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ"));
292 /// ```
293 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
294 self.builder.unicode(yes);
295 self
296 }
297
298 /// This configures whether to enable case insensitive matching for the
299 /// entire pattern.
300 ///
301 /// This setting can also be configured using the inline flag `i`
302 /// in the pattern. For example, `(?i:foo)` matches `foo` case
303 /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
304 ///
305 /// The default for this is `false`.
306 ///
307 /// # Example
308 ///
309 /// ```
310 /// use regex::RegexBuilder;
311 ///
312 /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
313 /// .case_insensitive(true)
314 /// .build()
315 /// .unwrap();
316 /// assert!(re.is_match("FoObarQuUx"));
317 /// // Even though case insensitive matching is enabled in the builder,
318 /// // it can be locally disabled within the pattern. In this case,
319 /// // `bar` is matched case sensitively.
320 /// assert!(!re.is_match("fooBARquux"));
321 /// ```
322 pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
323 self.builder.case_insensitive(yes);
324 self
325 }
326
327 /// This configures multi-line mode for the entire pattern.
328 ///
329 /// Enabling multi-line mode changes the behavior of the `^` and `$`
330 /// anchor assertions. Instead of only matching at the beginning and
331 /// end of a haystack, respectively, multi-line mode causes them to
332 /// match at the beginning and end of a line *in addition* to the
333 /// beginning and end of a haystack. More precisely, `^` will match at
334 /// the position immediately following a `\n` and `$` will match at the
335 /// position immediately preceding a `\n`.
336 ///
337 /// The behavior of this option can be impacted by other settings too:
338 ///
339 /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
340 /// to any ASCII byte.
341 /// * The [`RegexBuilder::crlf`] option changes the line terminator to
342 /// be either `\r` or `\n`, but never at the position between a `\r`
343 /// and `\n`.
344 ///
345 /// This setting can also be configured using the inline flag `m` in
346 /// the pattern.
347 ///
348 /// The default for this is `false`.
349 ///
350 /// # Example
351 ///
352 /// ```
353 /// use regex::RegexBuilder;
354 ///
355 /// let re = RegexBuilder::new(r"^foo$")
356 /// .multi_line(true)
357 /// .build()
358 /// .unwrap();
359 /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range()));
360 /// ```
361 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
362 self.builder.multi_line(yes);
363 self
364 }
365
366 /// This configures dot-matches-new-line mode for the entire pattern.
367 ///
368 /// Perhaps surprisingly, the default behavior for `.` is not to match
369 /// any character, but rather, to match any character except for the
370 /// line terminator (which is `\n` by default). When this mode is
371 /// enabled, the behavior changes such that `.` truly matches any
372 /// character.
373 ///
374 /// This setting can also be configured using the inline flag `s` in
375 /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
376 /// regexes.
377 ///
378 /// The default for this is `false`.
379 ///
380 /// # Example
381 ///
382 /// ```
383 /// use regex::RegexBuilder;
384 ///
385 /// let re = RegexBuilder::new(r"foo.bar")
386 /// .dot_matches_new_line(true)
387 /// .build()
388 /// .unwrap();
389 /// let hay = "foo\nbar";
390 /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str()));
391 /// ```
392 pub fn dot_matches_new_line(
393 &mut self,
394 yes: bool,
395 ) -> &mut RegexBuilder {
396 self.builder.dot_matches_new_line(yes);
397 self
398 }
399
400 /// This configures CRLF mode for the entire pattern.
401 ///
402 /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
403 /// short) and `\n` ("line feed" or LF for short) are treated as line
404 /// terminators. This results in the following:
405 ///
406 /// * Unless dot-matches-new-line mode is enabled, `.` will now match
407 /// any character except for `\n` and `\r`.
408 /// * When multi-line mode is enabled, `^` will match immediately
409 /// following a `\n` or a `\r`. Similarly, `$` will match immediately
410 /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
411 /// between `\r` and `\n`.
412 ///
413 /// This setting can also be configured using the inline flag `R` in
414 /// the pattern.
415 ///
416 /// The default for this is `false`.
417 ///
418 /// # Example
419 ///
420 /// ```
421 /// use regex::RegexBuilder;
422 ///
423 /// let re = RegexBuilder::new(r"^foo$")
424 /// .multi_line(true)
425 /// .crlf(true)
426 /// .build()
427 /// .unwrap();
428 /// let hay = "\r\nfoo\r\n";
429 /// // If CRLF mode weren't enabled here, then '$' wouldn't match
430 /// // immediately after 'foo', and thus no match would be found.
431 /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str()));
432 /// ```
433 ///
434 /// This example demonstrates that `^` will never match at a position
435 /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
436 /// and a `\n`.)
437 ///
438 /// ```
439 /// use regex::RegexBuilder;
440 ///
441 /// let re = RegexBuilder::new(r"^")
442 /// .multi_line(true)
443 /// .crlf(true)
444 /// .build()
445 /// .unwrap();
446 /// let hay = "\r\n\r\n";
447 /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
448 /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
449 /// ```
450 pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
451 self.builder.crlf(yes);
452 self
453 }
454
455 /// Configures the line terminator to be used by the regex.
456 ///
457 /// The line terminator is relevant in two ways for a particular regex:
458 ///
459 /// * When dot-matches-new-line mode is *not* enabled (the default),
460 /// then `.` will match any character except for the configured line
461 /// terminator.
462 /// * When multi-line mode is enabled (not the default), then `^` and
463 /// `$` will match immediately after and before, respectively, a line
464 /// terminator.
465 ///
466 /// In both cases, if CRLF mode is enabled in a particular context,
467 /// then it takes precedence over any configured line terminator.
468 ///
469 /// This option cannot be configured from within the pattern.
470 ///
471 /// The default line terminator is `\n`.
472 ///
473 /// # Example
474 ///
475 /// This shows how to treat the NUL byte as a line terminator. This can
476 /// be a useful heuristic when searching binary data.
477 ///
478 /// ```
479 /// use regex::RegexBuilder;
480 ///
481 /// let re = RegexBuilder::new(r"^foo$")
482 /// .multi_line(true)
483 /// .line_terminator(b'\x00')
484 /// .build()
485 /// .unwrap();
486 /// let hay = "\x00foo\x00";
487 /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
488 /// ```
489 ///
490 /// This example shows that the behavior of `.` is impacted by this
491 /// setting as well:
492 ///
493 /// ```
494 /// use regex::RegexBuilder;
495 ///
496 /// let re = RegexBuilder::new(r".")
497 /// .line_terminator(b'\x00')
498 /// .build()
499 /// .unwrap();
500 /// assert!(re.is_match("\n"));
501 /// assert!(!re.is_match("\x00"));
502 /// ```
503 ///
504 /// This shows that building a regex will fail if the byte given
505 /// is not ASCII and the pattern could result in matching invalid
506 /// UTF-8. This is because any singular non-ASCII byte is not valid
507 /// UTF-8, and it is not permitted for a [`Regex`] to match invalid
508 /// UTF-8. (It is permissible to use a non-ASCII byte when building a
509 /// [`bytes::Regex`](crate::bytes::Regex).)
510 ///
511 /// ```
512 /// use regex::RegexBuilder;
513 ///
514 /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err());
515 /// // Note that using a non-ASCII byte isn't enough on its own to
516 /// // cause regex compilation to fail. You actually have to make use
517 /// // of it in the regex in a way that leads to matching invalid
518 /// // UTF-8. If you don't, then regex compilation will succeed!
519 /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok());
520 /// ```
521 pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
522 self.builder.line_terminator(byte);
523 self
524 }
525
526 /// This configures swap-greed mode for the entire pattern.
527 ///
528 /// When swap-greed mode is enabled, patterns like `a+` will become
529 /// non-greedy and patterns like `a+?` will become greedy. In other
530 /// words, the meanings of `a+` and `a+?` are switched.
531 ///
532 /// This setting can also be configured using the inline flag `U` in
533 /// the pattern.
534 ///
535 /// The default for this is `false`.
536 ///
537 /// # Example
538 ///
539 /// ```
540 /// use regex::RegexBuilder;
541 ///
542 /// let re = RegexBuilder::new(r"a+")
543 /// .swap_greed(true)
544 /// .build()
545 /// .unwrap();
546 /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str()));
547 /// ```
548 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
549 self.builder.swap_greed(yes);
550 self
551 }
552
553 /// This configures verbose mode for the entire pattern.
554 ///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next new
/// line.
558 ///
559 /// Normally, in most places in a pattern, whitespace is treated
560 /// literally. For example ` +` will match one or more ASCII whitespace
561 /// characters.
562 ///
563 /// When verbose mode is enabled, `\#` can be used to match a literal
564 /// `#` and `\ ` can be used to match a literal ASCII whitespace
565 /// character.
566 ///
567 /// Verbose mode is useful for permitting regexes to be formatted and
568 /// broken up more nicely. This may make them more easily readable.
569 ///
570 /// This setting can also be configured using the inline flag `x` in
571 /// the pattern.
572 ///
573 /// The default for this is `false`.
574 ///
575 /// # Example
576 ///
577 /// ```
578 /// use regex::RegexBuilder;
579 ///
580 /// let pat = r"
581 /// \b
582 /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
583 /// [\s--\n]+ # whitespace should separate names
584 /// (?: # middle name can be an initial!
585 /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
586 /// [\s--\n]+
587 /// )?
588 /// (?<last>\p{Uppercase}\w*)
589 /// \b
590 /// ";
591 /// let re = RegexBuilder::new(pat)
592 /// .ignore_whitespace(true)
593 /// .build()
594 /// .unwrap();
595 ///
596 /// let caps = re.captures("Harry Potter").unwrap();
597 /// assert_eq!("Harry", &caps["first"]);
598 /// assert_eq!("Potter", &caps["last"]);
599 ///
600 /// let caps = re.captures("Harry J. Potter").unwrap();
601 /// assert_eq!("Harry", &caps["first"]);
602 /// // Since a middle name/initial isn't required for an overall match,
603 /// // we can't assume that 'initial' or 'middle' will be populated!
604 /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str()));
605 /// assert_eq!(None, caps.name("middle").map(|m| m.as_str()));
606 /// assert_eq!("Potter", &caps["last"]);
607 ///
608 /// let caps = re.captures("Harry James Potter").unwrap();
609 /// assert_eq!("Harry", &caps["first"]);
610 /// // Since a middle name/initial isn't required for an overall match,
611 /// // we can't assume that 'initial' or 'middle' will be populated!
612 /// assert_eq!(None, caps.name("initial").map(|m| m.as_str()));
613 /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str()));
614 /// assert_eq!("Potter", &caps["last"]);
615 /// ```
616 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
617 self.builder.ignore_whitespace(yes);
618 self
619 }
620
621 /// This configures octal mode for the entire pattern.
622 ///
623 /// Octal syntax is a little-known way of uttering Unicode codepoints
624 /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
625 /// equivalent patterns, where the last example shows octal syntax.
626 ///
627 /// While supporting octal syntax isn't in and of itself a problem,
628 /// it does make good error messages harder. That is, in PCRE based
629 /// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
631 /// backreferences to be supported. Therefore, when octal support
632 /// is disabled, the error message will explicitly mention that
633 /// backreferences aren't supported.
634 ///
635 /// The default for this is `false`.
636 ///
637 /// # Example
638 ///
639 /// ```
640 /// use regex::RegexBuilder;
641 ///
642 /// // Normally this pattern would not compile, with an error message
643 /// // about backreferences not being supported. But with octal mode
644 /// // enabled, octal escape sequences work.
645 /// let re = RegexBuilder::new(r"\141")
646 /// .octal(true)
647 /// .build()
648 /// .unwrap();
649 /// assert!(re.is_match("a"));
650 /// ```
651 pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
652 self.builder.octal(yes);
653 self
654 }
655
656 /// Sets the approximate size limit, in bytes, of the compiled regex.
657 ///
/// This roughly corresponds to the amount of heap memory, in
659 /// bytes, occupied by a single regex. If the regex would otherwise
660 /// approximately exceed this limit, then compiling that regex will
661 /// fail.
662 ///
663 /// The main utility of a method like this is to avoid compiling
664 /// regexes that use an unexpected amount of resources, such as
665 /// time and memory. Even if the memory usage of a large regex is
666 /// acceptable, its search time may not be. Namely, worst case time
667 /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
668 /// `n ~ len(haystack)`. That is, search time depends, in part, on the
669 /// size of the compiled regex. This means that putting a limit on the
670 /// size of the regex limits how much a regex can impact search time.
671 ///
672 /// For more information about regex size limits, see the section on
673 /// [untrusted inputs](crate#untrusted-input) in the top-level crate
674 /// documentation.
675 ///
676 /// The default for this is some reasonable number that permits most
677 /// patterns to compile successfully.
678 ///
679 /// # Example
680 ///
681 /// ```
682 /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
683 /// use regex::RegexBuilder;
684 ///
685 /// // It may surprise you how big some seemingly small patterns can
686 /// // be! Since \w is Unicode aware, this generates a regex that can
687 /// // match approximately 140,000 distinct codepoints.
688 /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
689 /// ```
690 pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
691 self.builder.size_limit(bytes);
692 self
693 }
694
695 /// Set the approximate capacity, in bytes, of the cache of transitions
696 /// used by the lazy DFA.
697 ///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
702 ///
703 /// The downside is that it needs to keep a cache of transitions and
704 /// states that are built while running a search, and this cache
705 /// can fill up. When it fills up, the cache will reset itself. Any
706 /// previously generated states and transitions will then need to be
707 /// re-generated. If this happens too many times, then this library
708 /// will bail out of using the lazy DFA and switch to a different regex
709 /// engine.
710 ///
711 /// If your regex provokes this particular downside of the lazy DFA,
712 /// then it may be beneficial to increase its cache capacity. This will
713 /// potentially reduce the frequency of cache resetting (ideally to
714 /// `0`). While it won't fix all potential performance problems with
715 /// the lazy DFA, increasing the cache capacity does fix some.
716 ///
717 /// There is no easy way to determine, a priori, whether increasing
718 /// this cache capacity will help. In general, the larger your regex,
719 /// the more cache it's likely to use. But that isn't an ironclad rule.
720 /// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
723 /// is likely to fill up, even when it's large and even for smallish
724 /// values of `N`.
725 ///
726 /// If you aren't sure whether this helps or not, it is sensible to
727 /// set this to some arbitrarily large number in testing, such as
728 /// `usize::MAX`. Namely, this represents the amount of capacity that
729 /// *may* be used. It's probably not a good idea to use `usize::MAX` in
730 /// production though, since it implies there are no controls on heap
731 /// memory used by this library during a search. In effect, set it to
732 /// whatever you're willing to allocate for a single regex search.
733 pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
734 self.builder.dfa_size_limit(bytes);
735 self
736 }
737
738 /// Set the nesting limit for this parser.
739 ///
740 /// The nesting limit controls how deep the abstract syntax tree is
741 /// allowed to be. If the AST exceeds the given limit (e.g., with too
742 /// many nested groups), then an error is returned by the parser.
743 ///
744 /// The purpose of this limit is to act as a heuristic to prevent stack
745 /// overflow for consumers that do structural induction on an AST using
746 /// explicit recursion. While this crate never does this (instead using
747 /// constant stack space and moving the call stack to the heap), other
748 /// crates may.
749 ///
750 /// This limit is not checked until the entire AST is parsed.
751 /// Therefore, if callers want to put a limit on the amount of heap
752 /// space used, then they should impose a limit on the length, in
753 /// bytes, of the concrete pattern string. In particular, this is
754 /// viable since this parser implementation will limit itself to heap
755 /// space proportional to the length of the pattern string. See also
756 /// the [untrusted inputs](crate#untrusted-input) section in the
757 /// top-level crate documentation for more information about this.
758 ///
759 /// Note that a nest limit of `0` will return a nest limit error for
760 /// most patterns but not all. For example, a nest limit of `0` permits
761 /// `a` but not `ab`, since `ab` requires an explicit concatenation,
762 /// which results in a nest depth of `1`. In general, a nest limit is
763 /// not something that manifests in an obvious way in the concrete
764 /// syntax, therefore, it should not be used in a granular way.
765 ///
766 /// # Example
767 ///
768 /// ```
769 /// use regex::RegexBuilder;
770 ///
771 /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
772 /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
773 /// ```
774 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
775 self.builder.nest_limit(limit);
776 self
777 }
778 }
779
780 /// A configurable builder for a [`RegexSet`].
781 ///
782 /// This builder can be used to programmatically set flags such as
783 /// `i` (case insensitive) and `x` (for verbose mode). This builder
784 /// can also be used to configure things like the line terminator
785 /// and a size limit on the compiled regular expression.
786 #[derive(Clone, Debug)]
787 pub struct RegexSetBuilder {
788 builder: Builder,
789 }
790
791 impl RegexSetBuilder {
792 /// Create a new builder with a default configuration for the given
793 /// patterns.
794 ///
795 /// If the patterns are invalid or exceed the configured size limits,
796 /// then an error will be returned when [`RegexSetBuilder::build`] is
797 /// called.
798 pub fn new<I, S>(patterns: I) -> RegexSetBuilder
799 where
800 I: IntoIterator<Item = S>,
801 S: AsRef<str>,
802 {
803 RegexSetBuilder { builder: Builder::new(patterns) }
804 }
805
806 /// Compiles the patterns given to `RegexSetBuilder::new` with the
807 /// configuration set on this builder.
808 ///
809 /// If the patterns aren't valid regexes or if a configured size limit
810 /// was exceeded, then an error is returned.
811 pub fn build(&self) -> Result<RegexSet, Error> {
812 self.builder.build_many_string()
813 }
814
/// This configures Unicode mode for all of the patterns.
816 ///
817 /// Enabling Unicode mode does a number of things:
818 ///
819 /// * Most fundamentally, it causes the fundamental atom of matching
820 /// to be a single codepoint. When Unicode mode is disabled, it's a
821 /// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
823 /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
824 /// * Case insensitive matching uses Unicode simple case folding rules.
825 /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
826 /// available.
827 /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
828 /// `\d`.
829 /// * The word boundary assertions, `\b` and `\B`, use the Unicode
830 /// definition of a word character.
831 ///
832 /// Note that if Unicode mode is disabled, then the regex will fail to
833 /// compile if it could match invalid UTF-8. For example, when Unicode
834 /// mode is disabled, then since `.` matches any byte (except for
835 /// `\n`), then it can match invalid UTF-8 and thus building a regex
836 /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
837 /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
838 /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
839 /// and so it is not allowed. This restriction can be lifted only by
840 /// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
841 ///
842 /// For more details on the Unicode support in this crate, see the
843 /// [Unicode section](crate#unicode) in this crate's top-level
844 /// documentation.
845 ///
846 /// The default for this is `true`.
847 ///
848 /// # Example
849 ///
850 /// ```
851 /// use regex::RegexSetBuilder;
852 ///
853 /// let re = RegexSetBuilder::new([r"\w"])
854 /// .unicode(false)
855 /// .build()
856 /// .unwrap();
857 /// // Normally greek letters would be included in \w, but since
858 /// // Unicode mode is disabled, it only matches ASCII letters.
859 /// assert!(!re.is_match("δ"));
860 ///
861 /// let re = RegexSetBuilder::new([r"s"])
862 /// .case_insensitive(true)
863 /// .unicode(false)
864 /// .build()
865 /// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ"));
871 /// ```
872 pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
873 self.builder.unicode(yes);
874 self
875 }
876
877 /// This configures whether to enable case insensitive matching for all
878 /// of the patterns.
879 ///
880 /// This setting can also be configured using the inline flag `i`
881 /// in the pattern. For example, `(?i:foo)` matches `foo` case
882 /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
883 ///
884 /// The default for this is `false`.
885 ///
886 /// # Example
887 ///
888 /// ```
889 /// use regex::RegexSetBuilder;
890 ///
891 /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
892 /// .case_insensitive(true)
893 /// .build()
894 /// .unwrap();
895 /// assert!(re.is_match("FoObarQuUx"));
896 /// // Even though case insensitive matching is enabled in the builder,
897 /// // it can be locally disabled within the pattern. In this case,
898 /// // `bar` is matched case sensitively.
899 /// assert!(!re.is_match("fooBARquux"));
900 /// ```
901 pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
902 self.builder.case_insensitive(yes);
903 self
904 }
905
906 /// This configures multi-line mode for all of the patterns.
907 ///
908 /// Enabling multi-line mode changes the behavior of the `^` and `$`
909 /// anchor assertions. Instead of only matching at the beginning and
910 /// end of a haystack, respectively, multi-line mode causes them to
911 /// match at the beginning and end of a line *in addition* to the
912 /// beginning and end of a haystack. More precisely, `^` will match at
913 /// the position immediately following a `\n` and `$` will match at the
914 /// position immediately preceding a `\n`.
915 ///
916 /// The behavior of this option can be impacted by other settings too:
917 ///
918 /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
919 /// above to any ASCII byte.
920 /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
921 /// to be either `\r` or `\n`, but never at the position between a `\r`
922 /// and `\n`.
923 ///
924 /// This setting can also be configured using the inline flag `m` in
925 /// the pattern.
926 ///
927 /// The default for this is `false`.
928 ///
929 /// # Example
930 ///
931 /// ```
932 /// use regex::RegexSetBuilder;
933 ///
934 /// let re = RegexSetBuilder::new([r"^foo$"])
935 /// .multi_line(true)
936 /// .build()
937 /// .unwrap();
938 /// assert!(re.is_match("\nfoo\n"));
939 /// ```
940 pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
941 self.builder.multi_line(yes);
942 self
943 }
944
/// This configures dot-matches-new-line mode for all of the patterns.
946 ///
947 /// Perhaps surprisingly, the default behavior for `.` is not to match
948 /// any character, but rather, to match any character except for the
949 /// line terminator (which is `\n` by default). When this mode is
950 /// enabled, the behavior changes such that `.` truly matches any
951 /// character.
952 ///
953 /// This setting can also be configured using the inline flag `s` in
954 /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
955 /// regexes.
956 ///
957 /// The default for this is `false`.
958 ///
959 /// # Example
960 ///
961 /// ```
962 /// use regex::RegexSetBuilder;
963 ///
964 /// let re = RegexSetBuilder::new([r"foo.bar"])
965 /// .dot_matches_new_line(true)
966 /// .build()
967 /// .unwrap();
968 /// let hay = "foo\nbar";
969 /// assert!(re.is_match(hay));
970 /// ```
971 pub fn dot_matches_new_line(
972 &mut self,
973 yes: bool,
974 ) -> &mut RegexSetBuilder {
975 self.builder.dot_matches_new_line(yes);
976 self
977 }
978
979 /// This configures CRLF mode for all of the patterns.
980 ///
981 /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
982 /// short) and `\n` ("line feed" or LF for short) are treated as line
983 /// terminators. This results in the following:
984 ///
985 /// * Unless dot-matches-new-line mode is enabled, `.` will now match
986 /// any character except for `\n` and `\r`.
987 /// * When multi-line mode is enabled, `^` will match immediately
988 /// following a `\n` or a `\r`. Similarly, `$` will match immediately
989 /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
990 /// between `\r` and `\n`.
991 ///
992 /// This setting can also be configured using the inline flag `R` in
993 /// the pattern.
994 ///
995 /// The default for this is `false`.
996 ///
997 /// # Example
998 ///
999 /// ```
1000 /// use regex::RegexSetBuilder;
1001 ///
1002 /// let re = RegexSetBuilder::new([r"^foo$"])
1003 /// .multi_line(true)
1004 /// .crlf(true)
1005 /// .build()
1006 /// .unwrap();
1007 /// let hay = "\r\nfoo\r\n";
1008 /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1009 /// // immediately after 'foo', and thus no match would be found.
1010 /// assert!(re.is_match(hay));
1011 /// ```
1012 ///
1013 /// This example demonstrates that `^` will never match at a position
1014 /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1015 /// and a `\n`.)
1016 ///
1017 /// ```
1018 /// use regex::RegexSetBuilder;
1019 ///
1020 /// let re = RegexSetBuilder::new([r"^\n"])
1021 /// .multi_line(true)
1022 /// .crlf(true)
1023 /// .build()
1024 /// .unwrap();
1025 /// assert!(!re.is_match("\r\n"));
1026 /// ```
1027 pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
1028 self.builder.crlf(yes);
1029 self
1030 }
1031
1032 /// Configures the line terminator to be used by the regex.
1033 ///
1034 /// The line terminator is relevant in two ways for a particular regex:
1035 ///
1036 /// * When dot-matches-new-line mode is *not* enabled (the default),
1037 /// then `.` will match any character except for the configured line
1038 /// terminator.
1039 /// * When multi-line mode is enabled (not the default), then `^` and
1040 /// `$` will match immediately after and before, respectively, a line
1041 /// terminator.
1042 ///
1043 /// In both cases, if CRLF mode is enabled in a particular context,
1044 /// then it takes precedence over any configured line terminator.
1045 ///
1046 /// This option cannot be configured from within the pattern.
1047 ///
1048 /// The default line terminator is `\n`.
1049 ///
1050 /// # Example
1051 ///
1052 /// This shows how to treat the NUL byte as a line terminator. This can
1053 /// be a useful heuristic when searching binary data.
1054 ///
1055 /// ```
1056 /// use regex::RegexSetBuilder;
1057 ///
1058 /// let re = RegexSetBuilder::new([r"^foo$"])
1059 /// .multi_line(true)
1060 /// .line_terminator(b'\x00')
1061 /// .build()
1062 /// .unwrap();
1063 /// let hay = "\x00foo\x00";
1064 /// assert!(re.is_match(hay));
1065 /// ```
1066 ///
1067 /// This example shows that the behavior of `.` is impacted by this
1068 /// setting as well:
1069 ///
1070 /// ```
1071 /// use regex::RegexSetBuilder;
1072 ///
1073 /// let re = RegexSetBuilder::new([r"."])
1074 /// .line_terminator(b'\x00')
1075 /// .build()
1076 /// .unwrap();
1077 /// assert!(re.is_match("\n"));
1078 /// assert!(!re.is_match("\x00"));
1079 /// ```
1080 ///
1081 /// This shows that building a regex will fail if the byte given
1082 /// is not ASCII and the pattern could result in matching invalid
1083 /// UTF-8. This is because any singular non-ASCII byte is not valid
1084 /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
1085 /// UTF-8. (It is permissible to use a non-ASCII byte when building a
1086 /// [`bytes::RegexSet`](crate::bytes::RegexSet).)
1087 ///
1088 /// ```
1089 /// use regex::RegexSetBuilder;
1090 ///
1091 /// assert!(
1092 /// RegexSetBuilder::new([r"."])
1093 /// .line_terminator(0x80)
1094 /// .build()
1095 /// .is_err()
1096 /// );
1097 /// // Note that using a non-ASCII byte isn't enough on its own to
1098 /// // cause regex compilation to fail. You actually have to make use
1099 /// // of it in the regex in a way that leads to matching invalid
1100 /// // UTF-8. If you don't, then regex compilation will succeed!
1101 /// assert!(
1102 /// RegexSetBuilder::new([r"a"])
1103 /// .line_terminator(0x80)
1104 /// .build()
1105 /// .is_ok()
1106 /// );
1107 /// ```
1108 pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
1109 self.builder.line_terminator(byte);
1110 self
1111 }
1112
1113 /// This configures swap-greed mode for all of the patterns.
1114 ///
1115 /// When swap-greed mode is enabled, patterns like `a+` will become
1116 /// non-greedy and patterns like `a+?` will become greedy. In other
1117 /// words, the meanings of `a+` and `a+?` are switched.
1118 ///
1119 /// This setting can also be configured using the inline flag `U` in
1120 /// the pattern.
1121 ///
1122 /// Note that this is generally not useful for a `RegexSet` since a
1123 /// `RegexSet` can only report whether a pattern matches or not. Since
1124 /// greediness never impacts whether a match is found or not (only the
1125 /// offsets of the match), it follows that whether parts of a pattern
1126 /// are greedy or not doesn't matter for a `RegexSet`.
1127 ///
1128 /// The default for this is `false`.
1129 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
1130 self.builder.swap_greed(yes);
1131 self
1132 }
1133
1134 /// This configures verbose mode for all of the patterns.
1135 ///
/// When enabled, whitespace will be treated as insignificant in the
1137 /// pattern and `#` can be used to start a comment until the next new
1138 /// line.
1139 ///
1140 /// Normally, in most places in a pattern, whitespace is treated
1141 /// literally. For example ` +` will match one or more ASCII whitespace
1142 /// characters.
1143 ///
1144 /// When verbose mode is enabled, `\#` can be used to match a literal
1145 /// `#` and `\ ` can be used to match a literal ASCII whitespace
1146 /// character.
1147 ///
1148 /// Verbose mode is useful for permitting regexes to be formatted and
1149 /// broken up more nicely. This may make them more easily readable.
1150 ///
1151 /// This setting can also be configured using the inline flag `x` in
1152 /// the pattern.
1153 ///
1154 /// The default for this is `false`.
1155 ///
1156 /// # Example
1157 ///
1158 /// ```
1159 /// use regex::RegexSetBuilder;
1160 ///
1161 /// let pat = r"
1162 /// \b
1163 /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
1164 /// [\s--\n]+ # whitespace should separate names
1165 /// (?: # middle name can be an initial!
1166 /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1167 /// [\s--\n]+
1168 /// )?
1169 /// (?<last>\p{Uppercase}\w*)
1170 /// \b
1171 /// ";
1172 /// let re = RegexSetBuilder::new([pat])
1173 /// .ignore_whitespace(true)
1174 /// .build()
1175 /// .unwrap();
1176 /// assert!(re.is_match("Harry Potter"));
1177 /// assert!(re.is_match("Harry J. Potter"));
1178 /// assert!(re.is_match("Harry James Potter"));
1179 /// assert!(!re.is_match("harry J. Potter"));
1180 /// ```
1181 pub fn ignore_whitespace(
1182 &mut self,
1183 yes: bool,
1184 ) -> &mut RegexSetBuilder {
1185 self.builder.ignore_whitespace(yes);
1186 self
1187 }
1188
1189 /// This configures octal mode for all of the patterns.
1190 ///
1191 /// Octal syntax is a little-known way of uttering Unicode codepoints
1192 /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1193 /// equivalent patterns, where the last example shows octal syntax.
1194 ///
1195 /// While supporting octal syntax isn't in and of itself a problem,
1196 /// it does make good error messages harder. That is, in PCRE based
1197 /// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
1199 /// backreferences to be supported. Therefore, when octal support
1200 /// is disabled, the error message will explicitly mention that
1201 /// backreferences aren't supported.
1202 ///
1203 /// The default for this is `false`.
1204 ///
1205 /// # Example
1206 ///
1207 /// ```
1208 /// use regex::RegexSetBuilder;
1209 ///
1210 /// // Normally this pattern would not compile, with an error message
1211 /// // about backreferences not being supported. But with octal mode
1212 /// // enabled, octal escape sequences work.
1213 /// let re = RegexSetBuilder::new([r"\141"])
1214 /// .octal(true)
1215 /// .build()
1216 /// .unwrap();
1217 /// assert!(re.is_match("a"));
1218 /// ```
1219 pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
1220 self.builder.octal(yes);
1221 self
1222 }
1223
1224 /// Sets the approximate size limit, in bytes, of the compiled regex.
1225 ///
/// This roughly corresponds to the amount of heap memory, in
1227 /// bytes, occupied by a single regex. If the regex would otherwise
1228 /// approximately exceed this limit, then compiling that regex will
1229 /// fail.
1230 ///
1231 /// The main utility of a method like this is to avoid compiling
1232 /// regexes that use an unexpected amount of resources, such as
1233 /// time and memory. Even if the memory usage of a large regex is
1234 /// acceptable, its search time may not be. Namely, worst case time
1235 /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1236 /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1237 /// size of the compiled regex. This means that putting a limit on the
1238 /// size of the regex limits how much a regex can impact search time.
1239 ///
1240 /// For more information about regex size limits, see the section on
1241 /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1242 /// documentation.
1243 ///
1244 /// The default for this is some reasonable number that permits most
1245 /// patterns to compile successfully.
1246 ///
1247 /// # Example
1248 ///
1249 /// ```
1250 /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1251 /// use regex::RegexSetBuilder;
1252 ///
1253 /// // It may surprise you how big some seemingly small patterns can
1254 /// // be! Since \w is Unicode aware, this generates a regex that can
1255 /// // match approximately 140,000 distinct codepoints.
1256 /// assert!(
1257 /// RegexSetBuilder::new([r"\w"])
1258 /// .size_limit(45_000)
1259 /// .build()
1260 /// .is_err()
1261 /// );
1262 /// ```
1263 pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
1264 self.builder.size_limit(bytes);
1265 self
1266 }
1267
1268 /// Set the approximate capacity, in bytes, of the cache of transitions
1269 /// used by the lazy DFA.
1270 ///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
1274 /// downside of taking worst case exponential time to build.
1275 ///
1276 /// The downside is that it needs to keep a cache of transitions and
1277 /// states that are built while running a search, and this cache
1278 /// can fill up. When it fills up, the cache will reset itself. Any
1279 /// previously generated states and transitions will then need to be
1280 /// re-generated. If this happens too many times, then this library
1281 /// will bail out of using the lazy DFA and switch to a different regex
1282 /// engine.
1283 ///
1284 /// If your regex provokes this particular downside of the lazy DFA,
1285 /// then it may be beneficial to increase its cache capacity. This will
1286 /// potentially reduce the frequency of cache resetting (ideally to
1287 /// `0`). While it won't fix all potential performance problems with
1288 /// the lazy DFA, increasing the cache capacity does fix some.
1289 ///
1290 /// There is no easy way to determine, a priori, whether increasing
1291 /// this cache capacity will help. In general, the larger your regex,
1292 /// the more cache it's likely to use. But that isn't an ironclad rule.
1293 /// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
1296 /// is likely to fill up, even when it's large and even for smallish
1297 /// values of `N`.
1298 ///
1299 /// If you aren't sure whether this helps or not, it is sensible to
1300 /// set this to some arbitrarily large number in testing, such as
1301 /// `usize::MAX`. Namely, this represents the amount of capacity that
1302 /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1303 /// production though, since it implies there are no controls on heap
1304 /// memory used by this library during a search. In effect, set it to
1305 /// whatever you're willing to allocate for a single regex search.
1306 pub fn dfa_size_limit(
1307 &mut self,
1308 bytes: usize,
1309 ) -> &mut RegexSetBuilder {
1310 self.builder.dfa_size_limit(bytes);
1311 self
1312 }
1313
1314 /// Set the nesting limit for this parser.
1315 ///
1316 /// The nesting limit controls how deep the abstract syntax tree is
1317 /// allowed to be. If the AST exceeds the given limit (e.g., with too
1318 /// many nested groups), then an error is returned by the parser.
1319 ///
1320 /// The purpose of this limit is to act as a heuristic to prevent stack
1321 /// overflow for consumers that do structural induction on an AST using
1322 /// explicit recursion. While this crate never does this (instead using
1323 /// constant stack space and moving the call stack to the heap), other
1324 /// crates may.
1325 ///
1326 /// This limit is not checked until the entire AST is parsed.
1327 /// Therefore, if callers want to put a limit on the amount of heap
1328 /// space used, then they should impose a limit on the length, in
1329 /// bytes, of the concrete pattern string. In particular, this is
1330 /// viable since this parser implementation will limit itself to heap
1331 /// space proportional to the length of the pattern string. See also
1332 /// the [untrusted inputs](crate#untrusted-input) section in the
1333 /// top-level crate documentation for more information about this.
1334 ///
1335 /// Note that a nest limit of `0` will return a nest limit error for
1336 /// most patterns but not all. For example, a nest limit of `0` permits
1337 /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1338 /// which results in a nest depth of `1`. In general, a nest limit is
1339 /// not something that manifests in an obvious way in the concrete
1340 /// syntax, therefore, it should not be used in a granular way.
1341 ///
1342 /// # Example
1343 ///
1344 /// ```
1345 /// use regex::RegexSetBuilder;
1346 ///
1347 /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
1348 /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
1349 /// ```
1350 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
1351 self.builder.nest_limit(limit);
1352 self
1353 }
1354 }
1355}
1356
1357pub(crate) mod bytes {
1358 use crate::{
1359 bytes::{Regex, RegexSet},
1360 error::Error,
1361 };
1362
1363 use super::Builder;
1364
/// A configurable builder for a [`Regex`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
///
/// Within this module, `Regex` is the byte-oriented `bytes::Regex`.
#[derive(Clone, Debug)]
pub struct RegexBuilder {
    /// The crate-internal builder that the methods on this type forward
    /// to.
    builder: Builder,
}
1375
1376 impl RegexBuilder {
1377 /// Create a new builder with a default configuration for the given
1378 /// pattern.
1379 ///
1380 /// If the pattern is invalid or exceeds the configured size limits,
1381 /// then an error will be returned when [`RegexBuilder::build`] is
1382 /// called.
1383 pub fn new(pattern: &str) -> RegexBuilder {
1384 RegexBuilder { builder: Builder::new([pattern]) }
1385 }
1386
1387 /// Compiles the pattern given to `RegexBuilder::new` with the
1388 /// configuration set on this builder.
1389 ///
1390 /// If the pattern isn't a valid regex or if a configured size limit
1391 /// was exceeded, then an error is returned.
1392 pub fn build(&self) -> Result<Regex, Error> {
1393 self.builder.build_one_bytes()
1394 }
1395
1396 /// This configures Unicode mode for the entire pattern.
1397 ///
1398 /// Enabling Unicode mode does a number of things:
1399 ///
1400 /// * Most fundamentally, it causes the fundamental atom of matching
1401 /// to be a single codepoint. When Unicode mode is disabled, it's a
1402 /// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
1404 /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
1405 /// * Case insensitive matching uses Unicode simple case folding rules.
1406 /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
1407 /// available.
1408 /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
1409 /// `\d`.
1410 /// * The word boundary assertions, `\b` and `\B`, use the Unicode
1411 /// definition of a word character.
1412 ///
1413 /// Note that unlike the top-level `Regex` for searching `&str`, it
1414 /// is permitted to disable Unicode mode even if the resulting pattern
1415 /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
1416 /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
1417 ///
1418 /// For more details on the Unicode support in this crate, see the
1419 /// [Unicode section](crate#unicode) in this crate's top-level
1420 /// documentation.
1421 ///
1422 /// The default for this is `true`.
1423 ///
1424 /// # Example
1425 ///
1426 /// ```
1427 /// use regex::bytes::RegexBuilder;
1428 ///
1429 /// let re = RegexBuilder::new(r"\w")
1430 /// .unicode(false)
1431 /// .build()
1432 /// .unwrap();
1433 /// // Normally greek letters would be included in \w, but since
1434 /// // Unicode mode is disabled, it only matches ASCII letters.
1435 /// assert!(!re.is_match("δ".as_bytes()));
1436 ///
1437 /// let re = RegexBuilder::new(r"s")
1438 /// .case_insensitive(true)
1439 /// .unicode(false)
1440 /// .build()
1441 /// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ".as_bytes()));
1447 /// ```
1448 ///
1449 /// Since this builder is for constructing a [`bytes::Regex`](Regex),
1450 /// one can disable Unicode mode even if it would match invalid UTF-8:
1451 ///
1452 /// ```
1453 /// use regex::bytes::RegexBuilder;
1454 ///
1455 /// let re = RegexBuilder::new(r".")
1456 /// .unicode(false)
1457 /// .build()
1458 /// .unwrap();
/// // With Unicode mode disabled, `.` matches any single byte (except
/// // for `\n`), including bytes that are not valid UTF-8.
1461 /// assert!(re.is_match(b"\xFF"));
1462 /// ```
1463 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
1464 self.builder.unicode(yes);
1465 self
1466 }
1467
1468 /// This configures whether to enable case insensitive matching for the
1469 /// entire pattern.
1470 ///
1471 /// This setting can also be configured using the inline flag `i`
1472 /// in the pattern. For example, `(?i:foo)` matches `foo` case
1473 /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
1474 ///
1475 /// The default for this is `false`.
1476 ///
1477 /// # Example
1478 ///
1479 /// ```
1480 /// use regex::bytes::RegexBuilder;
1481 ///
1482 /// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
1483 /// .case_insensitive(true)
1484 /// .build()
1485 /// .unwrap();
1486 /// assert!(re.is_match(b"FoObarQuUx"));
1487 /// // Even though case insensitive matching is enabled in the builder,
1488 /// // it can be locally disabled within the pattern. In this case,
1489 /// // `bar` is matched case sensitively.
1490 /// assert!(!re.is_match(b"fooBARquux"));
1491 /// ```
1492 pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
1493 self.builder.case_insensitive(yes);
1494 self
1495 }
1496
1497 /// This configures multi-line mode for the entire pattern.
1498 ///
1499 /// Enabling multi-line mode changes the behavior of the `^` and `$`
1500 /// anchor assertions. Instead of only matching at the beginning and
1501 /// end of a haystack, respectively, multi-line mode causes them to
1502 /// match at the beginning and end of a line *in addition* to the
1503 /// beginning and end of a haystack. More precisely, `^` will match at
1504 /// the position immediately following a `\n` and `$` will match at the
1505 /// position immediately preceding a `\n`.
1506 ///
1507 /// The behavior of this option can be impacted by other settings too:
1508 ///
1509 /// * The [`RegexBuilder::line_terminator`] option changes `\n` above
1510 /// to any ASCII byte.
1511 /// * The [`RegexBuilder::crlf`] option changes the line terminator to
1512 /// be either `\r` or `\n`, but never at the position between a `\r`
1513 /// and `\n`.
1514 ///
1515 /// This setting can also be configured using the inline flag `m` in
1516 /// the pattern.
1517 ///
1518 /// The default for this is `false`.
1519 ///
1520 /// # Example
1521 ///
1522 /// ```
1523 /// use regex::bytes::RegexBuilder;
1524 ///
1525 /// let re = RegexBuilder::new(r"^foo$")
1526 /// .multi_line(true)
1527 /// .build()
1528 /// .unwrap();
1529 /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
1530 /// ```
1531 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
1532 self.builder.multi_line(yes);
1533 self
1534 }
1535
1536 /// This configures dot-matches-new-line mode for the entire pattern.
1537 ///
1538 /// Perhaps surprisingly, the default behavior for `.` is not to match
1539 /// any character, but rather, to match any character except for the
1540 /// line terminator (which is `\n` by default). When this mode is
1541 /// enabled, the behavior changes such that `.` truly matches any
1542 /// character.
1543 ///
1544 /// This setting can also be configured using the inline flag `s` in
1545 /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
1546 /// regexes.
1547 ///
1548 /// The default for this is `false`.
1549 ///
1550 /// # Example
1551 ///
1552 /// ```
1553 /// use regex::bytes::RegexBuilder;
1554 ///
1555 /// let re = RegexBuilder::new(r"foo.bar")
1556 /// .dot_matches_new_line(true)
1557 /// .build()
1558 /// .unwrap();
1559 /// let hay = b"foo\nbar";
1560 /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
1561 /// ```
1562 pub fn dot_matches_new_line(
1563 &mut self,
1564 yes: bool,
1565 ) -> &mut RegexBuilder {
1566 self.builder.dot_matches_new_line(yes);
1567 self
1568 }
1569
1570 /// This configures CRLF mode for the entire pattern.
1571 ///
1572 /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
1573 /// short) and `\n` ("line feed" or LF for short) are treated as line
1574 /// terminators. This results in the following:
1575 ///
1576 /// * Unless dot-matches-new-line mode is enabled, `.` will now match
1577 /// any character except for `\n` and `\r`.
1578 /// * When multi-line mode is enabled, `^` will match immediately
1579 /// following a `\n` or a `\r`. Similarly, `$` will match immediately
1580 /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
1581 /// between `\r` and `\n`.
1582 ///
1583 /// This setting can also be configured using the inline flag `R` in
1584 /// the pattern.
1585 ///
1586 /// The default for this is `false`.
1587 ///
1588 /// # Example
1589 ///
1590 /// ```
1591 /// use regex::bytes::RegexBuilder;
1592 ///
1593 /// let re = RegexBuilder::new(r"^foo$")
1594 /// .multi_line(true)
1595 /// .crlf(true)
1596 /// .build()
1597 /// .unwrap();
1598 /// let hay = b"\r\nfoo\r\n";
1599 /// // If CRLF mode weren't enabled here, then '$' wouldn't match
1600 /// // immediately after 'foo', and thus no match would be found.
1601 /// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes()));
1602 /// ```
1603 ///
1604 /// This example demonstrates that `^` will never match at a position
1605 /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1606 /// and a `\n`.)
1607 ///
1608 /// ```
1609 /// use regex::bytes::RegexBuilder;
1610 ///
1611 /// let re = RegexBuilder::new(r"^")
1612 /// .multi_line(true)
1613 /// .crlf(true)
1614 /// .build()
1615 /// .unwrap();
1616 /// let hay = b"\r\n\r\n";
1617 /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect();
1618 /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]);
1619 /// ```
1620 pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
1621 self.builder.crlf(yes);
1622 self
1623 }
1624
1625 /// Configures the line terminator to be used by the regex.
1626 ///
1627 /// The line terminator is relevant in two ways for a particular regex:
1628 ///
1629 /// * When dot-matches-new-line mode is *not* enabled (the default),
1630 /// then `.` will match any character except for the configured line
1631 /// terminator.
1632 /// * When multi-line mode is enabled (not the default), then `^` and
1633 /// `$` will match immediately after and before, respectively, a line
1634 /// terminator.
1635 ///
1636 /// In both cases, if CRLF mode is enabled in a particular context,
1637 /// then it takes precedence over any configured line terminator.
1638 ///
1639 /// This option cannot be configured from within the pattern.
1640 ///
1641 /// The default line terminator is `\n`.
1642 ///
1643 /// # Example
1644 ///
1645 /// This shows how to treat the NUL byte as a line terminator. This can
1646 /// be a useful heuristic when searching binary data.
1647 ///
1648 /// ```
1649 /// use regex::bytes::RegexBuilder;
1650 ///
1651 /// let re = RegexBuilder::new(r"^foo$")
1652 /// .multi_line(true)
1653 /// .line_terminator(b'\x00')
1654 /// .build()
1655 /// .unwrap();
1656 /// let hay = b"\x00foo\x00";
1657 /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
1658 /// ```
1659 ///
1660 /// This example shows that the behavior of `.` is impacted by this
1661 /// setting as well:
1662 ///
1663 /// ```
1664 /// use regex::bytes::RegexBuilder;
1665 ///
1666 /// let re = RegexBuilder::new(r".")
1667 /// .line_terminator(b'\x00')
1668 /// .build()
1669 /// .unwrap();
1670 /// assert!(re.is_match(b"\n"));
1671 /// assert!(!re.is_match(b"\x00"));
1672 /// ```
1673 ///
1674 /// This shows that building a regex will work even when the byte
1675 /// given is not ASCII. This is unlike the top-level `Regex` API where
1676 /// matching invalid UTF-8 is not allowed.
1677 ///
1678 /// Note though that you must disable Unicode mode. This is required
1679 /// because Unicode mode requires matching one codepoint at a time,
1680 /// and there is no way to match a non-ASCII byte as if it were a
1681 /// codepoint.
1682 ///
1683 /// ```
1684 /// use regex::bytes::RegexBuilder;
1685 ///
1686 /// assert!(
1687 /// RegexBuilder::new(r".")
1688 /// .unicode(false)
1689 /// .line_terminator(0x80)
1690 /// .build()
1691 /// .is_ok(),
1692 /// );
1693 /// ```
1694 pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
1695 self.builder.line_terminator(byte);
1696 self
1697 }
1698
1699 /// This configures swap-greed mode for the entire pattern.
1700 ///
1701 /// When swap-greed mode is enabled, patterns like `a+` will become
1702 /// non-greedy and patterns like `a+?` will become greedy. In other
1703 /// words, the meanings of `a+` and `a+?` are switched.
1704 ///
1705 /// This setting can also be configured using the inline flag `U` in
1706 /// the pattern.
1707 ///
1708 /// The default for this is `false`.
1709 ///
1710 /// # Example
1711 ///
1712 /// ```
1713 /// use regex::bytes::RegexBuilder;
1714 ///
1715 /// let re = RegexBuilder::new(r"a+")
1716 /// .swap_greed(true)
1717 /// .build()
1718 /// .unwrap();
1719 /// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
1720 /// ```
1721 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
1722 self.builder.swap_greed(yes);
1723 self
1724 }
1725
1726 /// This configures verbose mode for the entire pattern.
1727 ///
/// When enabled, whitespace will be treated as insignificant in the
1729 /// pattern and `#` can be used to start a comment until the next new
1730 /// line.
1731 ///
1732 /// Normally, in most places in a pattern, whitespace is treated
1733 /// literally. For example ` +` will match one or more ASCII whitespace
1734 /// characters.
1735 ///
1736 /// When verbose mode is enabled, `\#` can be used to match a literal
1737 /// `#` and `\ ` can be used to match a literal ASCII whitespace
1738 /// character.
1739 ///
1740 /// Verbose mode is useful for permitting regexes to be formatted and
1741 /// broken up more nicely. This may make them more easily readable.
1742 ///
1743 /// This setting can also be configured using the inline flag `x` in
1744 /// the pattern.
1745 ///
1746 /// The default for this is `false`.
1747 ///
1748 /// # Example
1749 ///
1750 /// ```
1751 /// use regex::bytes::RegexBuilder;
1752 ///
1753 /// let pat = r"
1754 /// \b
1755 /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
1756 /// [\s--\n]+ # whitespace should separate names
1757 /// (?: # middle name can be an initial!
1758 /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1759 /// [\s--\n]+
1760 /// )?
1761 /// (?<last>\p{Uppercase}\w*)
1762 /// \b
1763 /// ";
1764 /// let re = RegexBuilder::new(pat)
1765 /// .ignore_whitespace(true)
1766 /// .build()
1767 /// .unwrap();
1768 ///
1769 /// let caps = re.captures(b"Harry Potter").unwrap();
1770 /// assert_eq!(&b"Harry"[..], &caps["first"]);
1771 /// assert_eq!(&b"Potter"[..], &caps["last"]);
1772 ///
1773 /// let caps = re.captures(b"Harry J. Potter").unwrap();
1774 /// assert_eq!(&b"Harry"[..], &caps["first"]);
1775 /// // Since a middle name/initial isn't required for an overall match,
1776 /// // we can't assume that 'initial' or 'middle' will be populated!
1777 /// assert_eq!(
1778 /// Some(&b"J"[..]),
1779 /// caps.name("initial").map(|m| m.as_bytes()),
1780 /// );
1781 /// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
1782 /// assert_eq!(&b"Potter"[..], &caps["last"]);
1783 ///
1784 /// let caps = re.captures(b"Harry James Potter").unwrap();
1785 /// assert_eq!(&b"Harry"[..], &caps["first"]);
1786 /// // Since a middle name/initial isn't required for an overall match,
1787 /// // we can't assume that 'initial' or 'middle' will be populated!
1788 /// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
1789 /// assert_eq!(
1790 /// Some(&b"James"[..]),
1791 /// caps.name("middle").map(|m| m.as_bytes()),
1792 /// );
1793 /// assert_eq!(&b"Potter"[..], &caps["last"]);
1794 /// ```
1795 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
1796 self.builder.ignore_whitespace(yes);
1797 self
1798 }
1799
1800 /// This configures octal mode for the entire pattern.
1801 ///
1802 /// Octal syntax is a little-known way of uttering Unicode codepoints
1803 /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1804 /// equivalent patterns, where the last example shows octal syntax.
1805 ///
1806 /// While supporting octal syntax isn't in and of itself a problem,
1807 /// it does make good error messages harder. That is, in PCRE based
1808 /// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
1810 /// backreferences to be supported. Therefore, when octal support
1811 /// is disabled, the error message will explicitly mention that
1812 /// backreferences aren't supported.
1813 ///
1814 /// The default for this is `false`.
1815 ///
1816 /// # Example
1817 ///
1818 /// ```
1819 /// use regex::bytes::RegexBuilder;
1820 ///
1821 /// // Normally this pattern would not compile, with an error message
1822 /// // about backreferences not being supported. But with octal mode
1823 /// // enabled, octal escape sequences work.
1824 /// let re = RegexBuilder::new(r"\141")
1825 /// .octal(true)
1826 /// .build()
1827 /// .unwrap();
1828 /// assert!(re.is_match(b"a"));
1829 /// ```
1830 pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
1831 self.builder.octal(yes);
1832 self
1833 }
1834
1835 /// Sets the approximate size limit, in bytes, of the compiled regex.
1836 ///
/// This roughly corresponds to the amount of heap memory, in
1838 /// bytes, occupied by a single regex. If the regex would otherwise
1839 /// approximately exceed this limit, then compiling that regex will
1840 /// fail.
1841 ///
1842 /// The main utility of a method like this is to avoid compiling
1843 /// regexes that use an unexpected amount of resources, such as
1844 /// time and memory. Even if the memory usage of a large regex is
1845 /// acceptable, its search time may not be. Namely, worst case time
1846 /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1847 /// `n ~ len(haystack)`. That is, search time depends, in part, on the
1848 /// size of the compiled regex. This means that putting a limit on the
1849 /// size of the regex limits how much a regex can impact search time.
1850 ///
1851 /// For more information about regex size limits, see the section on
1852 /// [untrusted inputs](crate#untrusted-input) in the top-level crate
1853 /// documentation.
1854 ///
1855 /// The default for this is some reasonable number that permits most
1856 /// patterns to compile successfully.
1857 ///
1858 /// # Example
1859 ///
1860 /// ```
1861 /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1862 /// use regex::bytes::RegexBuilder;
1863 ///
1864 /// // It may surprise you how big some seemingly small patterns can
1865 /// // be! Since \w is Unicode aware, this generates a regex that can
1866 /// // match approximately 140,000 distinct codepoints.
1867 /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
1868 /// ```
1869 pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1870 self.builder.size_limit(bytes);
1871 self
1872 }
1873
1874 /// Set the approximate capacity, in bytes, of the cache of transitions
1875 /// used by the lazy DFA.
1876 ///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
1880 /// downside of taking worst case exponential time to build.
1881 ///
1882 /// The downside is that it needs to keep a cache of transitions and
1883 /// states that are built while running a search, and this cache
1884 /// can fill up. When it fills up, the cache will reset itself. Any
1885 /// previously generated states and transitions will then need to be
1886 /// re-generated. If this happens too many times, then this library
1887 /// will bail out of using the lazy DFA and switch to a different regex
1888 /// engine.
1889 ///
1890 /// If your regex provokes this particular downside of the lazy DFA,
1891 /// then it may be beneficial to increase its cache capacity. This will
1892 /// potentially reduce the frequency of cache resetting (ideally to
1893 /// `0`). While it won't fix all potential performance problems with
1894 /// the lazy DFA, increasing the cache capacity does fix some.
1895 ///
1896 /// There is no easy way to determine, a priori, whether increasing
1897 /// this cache capacity will help. In general, the larger your regex,
1898 /// the more cache it's likely to use. But that isn't an ironclad rule.
1899 /// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
1902 /// is likely to fill up, even when it's large and even for smallish
1903 /// values of `N`.
1904 ///
1905 /// If you aren't sure whether this helps or not, it is sensible to
1906 /// set this to some arbitrarily large number in testing, such as
1907 /// `usize::MAX`. Namely, this represents the amount of capacity that
1908 /// *may* be used. It's probably not a good idea to use `usize::MAX` in
1909 /// production though, since it implies there are no controls on heap
1910 /// memory used by this library during a search. In effect, set it to
1911 /// whatever you're willing to allocate for a single regex search.
1912 pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1913 self.builder.dfa_size_limit(bytes);
1914 self
1915 }
1916
1917 /// Set the nesting limit for this parser.
1918 ///
1919 /// The nesting limit controls how deep the abstract syntax tree is
1920 /// allowed to be. If the AST exceeds the given limit (e.g., with too
1921 /// many nested groups), then an error is returned by the parser.
1922 ///
1923 /// The purpose of this limit is to act as a heuristic to prevent stack
1924 /// overflow for consumers that do structural induction on an AST using
1925 /// explicit recursion. While this crate never does this (instead using
1926 /// constant stack space and moving the call stack to the heap), other
1927 /// crates may.
1928 ///
1929 /// This limit is not checked until the entire AST is parsed.
1930 /// Therefore, if callers want to put a limit on the amount of heap
1931 /// space used, then they should impose a limit on the length, in
1932 /// bytes, of the concrete pattern string. In particular, this is
1933 /// viable since this parser implementation will limit itself to heap
1934 /// space proportional to the length of the pattern string. See also
1935 /// the [untrusted inputs](crate#untrusted-input) section in the
1936 /// top-level crate documentation for more information about this.
1937 ///
1938 /// Note that a nest limit of `0` will return a nest limit error for
1939 /// most patterns but not all. For example, a nest limit of `0` permits
1940 /// `a` but not `ab`, since `ab` requires an explicit concatenation,
1941 /// which results in a nest depth of `1`. In general, a nest limit is
1942 /// not something that manifests in an obvious way in the concrete
1943 /// syntax, therefore, it should not be used in a granular way.
1944 ///
1945 /// # Example
1946 ///
1947 /// ```
1948 /// use regex::bytes::RegexBuilder;
1949 ///
1950 /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
1951 /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
1952 /// ```
1953 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
1954 self.builder.nest_limit(limit);
1955 self
1956 }
1957 }
1958
/// A configurable builder for a [`RegexSet`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
///
/// The patterns are supplied up front to [`RegexSetBuilder::new`]; the
/// set itself is only compiled when [`RegexSetBuilder::build`] is called.
#[derive(Clone, Debug)]
pub struct RegexSetBuilder {
    // The crate-internal builder that the methods below delegate to.
    builder: Builder,
}
1969
1970 impl RegexSetBuilder {
1971 /// Create a new builder with a default configuration for the given
1972 /// patterns.
1973 ///
1974 /// If the patterns are invalid or exceed the configured size limits,
1975 /// then an error will be returned when [`RegexSetBuilder::build`] is
1976 /// called.
1977 pub fn new<I, S>(patterns: I) -> RegexSetBuilder
1978 where
1979 I: IntoIterator<Item = S>,
1980 S: AsRef<str>,
1981 {
1982 RegexSetBuilder { builder: Builder::new(patterns) }
1983 }
1984
1985 /// Compiles the patterns given to `RegexSetBuilder::new` with the
1986 /// configuration set on this builder.
1987 ///
1988 /// If the patterns aren't valid regexes or if a configured size limit
1989 /// was exceeded, then an error is returned.
1990 pub fn build(&self) -> Result<RegexSet, Error> {
1991 self.builder.build_many_bytes()
1992 }
1993
/// This configures Unicode mode for all of the patterns.
1995 ///
1996 /// Enabling Unicode mode does a number of things:
1997 ///
1998 /// * Most fundamentally, it causes the fundamental atom of matching
1999 /// to be a single codepoint. When Unicode mode is disabled, it's a
2000 /// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
2002 /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
2003 /// * Case insensitive matching uses Unicode simple case folding rules.
2004 /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
2005 /// available.
2006 /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
2007 /// `\d`.
2008 /// * The word boundary assertions, `\b` and `\B`, use the Unicode
2009 /// definition of a word character.
2010 ///
2011 /// Note that unlike the top-level `RegexSet` for searching `&str`,
2012 /// it is permitted to disable Unicode mode even if the resulting
2013 /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
2014 /// a valid pattern for a top-level `RegexSet`, but is valid for a
2015 /// `bytes::RegexSet`.
2016 ///
2017 /// For more details on the Unicode support in this crate, see the
2018 /// [Unicode section](crate#unicode) in this crate's top-level
2019 /// documentation.
2020 ///
2021 /// The default for this is `true`.
2022 ///
2023 /// # Example
2024 ///
2025 /// ```
2026 /// use regex::bytes::RegexSetBuilder;
2027 ///
2028 /// let re = RegexSetBuilder::new([r"\w"])
2029 /// .unicode(false)
2030 /// .build()
2031 /// .unwrap();
2032 /// // Normally greek letters would be included in \w, but since
2033 /// // Unicode mode is disabled, it only matches ASCII letters.
2034 /// assert!(!re.is_match("δ".as_bytes()));
2035 ///
2036 /// let re = RegexSetBuilder::new([r"s"])
2037 /// .case_insensitive(true)
2038 /// .unicode(false)
2039 /// .build()
2040 /// .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ".as_bytes()));
2046 /// ```
2047 ///
2048 /// Since this builder is for constructing a
2049 /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
2050 /// it would match invalid UTF-8:
2051 ///
2052 /// ```
2053 /// use regex::bytes::RegexSetBuilder;
2054 ///
2055 /// let re = RegexSetBuilder::new([r"."])
2056 /// .unicode(false)
2057 /// .build()
2058 /// .unwrap();
/// // Normally `.` matches one Unicode codepoint at a time, but since
/// // Unicode mode is disabled, it matches a single byte instead, even
/// // one that is not valid UTF-8.
2061 /// assert!(re.is_match(b"\xFF"));
2062 /// ```
2063 pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
2064 self.builder.unicode(yes);
2065 self
2066 }
2067
2068 /// This configures whether to enable case insensitive matching for all
2069 /// of the patterns.
2070 ///
2071 /// This setting can also be configured using the inline flag `i`
2072 /// in the pattern. For example, `(?i:foo)` matches `foo` case
2073 /// insensitively while `(?-i:foo)` matches `foo` case sensitively.
2074 ///
2075 /// The default for this is `false`.
2076 ///
2077 /// # Example
2078 ///
2079 /// ```
2080 /// use regex::bytes::RegexSetBuilder;
2081 ///
2082 /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
2083 /// .case_insensitive(true)
2084 /// .build()
2085 /// .unwrap();
2086 /// assert!(re.is_match(b"FoObarQuUx"));
2087 /// // Even though case insensitive matching is enabled in the builder,
2088 /// // it can be locally disabled within the pattern. In this case,
2089 /// // `bar` is matched case sensitively.
2090 /// assert!(!re.is_match(b"fooBARquux"));
2091 /// ```
2092 pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
2093 self.builder.case_insensitive(yes);
2094 self
2095 }
2096
2097 /// This configures multi-line mode for all of the patterns.
2098 ///
2099 /// Enabling multi-line mode changes the behavior of the `^` and `$`
2100 /// anchor assertions. Instead of only matching at the beginning and
2101 /// end of a haystack, respectively, multi-line mode causes them to
2102 /// match at the beginning and end of a line *in addition* to the
2103 /// beginning and end of a haystack. More precisely, `^` will match at
2104 /// the position immediately following a `\n` and `$` will match at the
2105 /// position immediately preceding a `\n`.
2106 ///
2107 /// The behavior of this option can be impacted by other settings too:
2108 ///
2109 /// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
2110 /// above to any ASCII byte.
2111 /// * The [`RegexSetBuilder::crlf`] option changes the line terminator
2112 /// to be either `\r` or `\n`, but never at the position between a `\r`
2113 /// and `\n`.
2114 ///
2115 /// This setting can also be configured using the inline flag `m` in
2116 /// the pattern.
2117 ///
2118 /// The default for this is `false`.
2119 ///
2120 /// # Example
2121 ///
2122 /// ```
2123 /// use regex::bytes::RegexSetBuilder;
2124 ///
2125 /// let re = RegexSetBuilder::new([r"^foo$"])
2126 /// .multi_line(true)
2127 /// .build()
2128 /// .unwrap();
2129 /// assert!(re.is_match(b"\nfoo\n"));
2130 /// ```
2131 pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
2132 self.builder.multi_line(yes);
2133 self
2134 }
2135
/// This configures dot-matches-new-line mode for all of the patterns.
2137 ///
2138 /// Perhaps surprisingly, the default behavior for `.` is not to match
2139 /// any character, but rather, to match any character except for the
2140 /// line terminator (which is `\n` by default). When this mode is
2141 /// enabled, the behavior changes such that `.` truly matches any
2142 /// character.
2143 ///
2144 /// This setting can also be configured using the inline flag `s` in
2145 /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
2146 /// regexes.
2147 ///
2148 /// The default for this is `false`.
2149 ///
2150 /// # Example
2151 ///
2152 /// ```
2153 /// use regex::bytes::RegexSetBuilder;
2154 ///
2155 /// let re = RegexSetBuilder::new([r"foo.bar"])
2156 /// .dot_matches_new_line(true)
2157 /// .build()
2158 /// .unwrap();
2159 /// let hay = b"foo\nbar";
2160 /// assert!(re.is_match(hay));
2161 /// ```
2162 pub fn dot_matches_new_line(
2163 &mut self,
2164 yes: bool,
2165 ) -> &mut RegexSetBuilder {
2166 self.builder.dot_matches_new_line(yes);
2167 self
2168 }
2169
2170 /// This configures CRLF mode for all of the patterns.
2171 ///
2172 /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
2173 /// short) and `\n` ("line feed" or LF for short) are treated as line
2174 /// terminators. This results in the following:
2175 ///
2176 /// * Unless dot-matches-new-line mode is enabled, `.` will now match
2177 /// any character except for `\n` and `\r`.
2178 /// * When multi-line mode is enabled, `^` will match immediately
2179 /// following a `\n` or a `\r`. Similarly, `$` will match immediately
2180 /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
2181 /// between `\r` and `\n`.
2182 ///
2183 /// This setting can also be configured using the inline flag `R` in
2184 /// the pattern.
2185 ///
2186 /// The default for this is `false`.
2187 ///
2188 /// # Example
2189 ///
2190 /// ```
2191 /// use regex::bytes::RegexSetBuilder;
2192 ///
2193 /// let re = RegexSetBuilder::new([r"^foo$"])
2194 /// .multi_line(true)
2195 /// .crlf(true)
2196 /// .build()
2197 /// .unwrap();
2198 /// let hay = b"\r\nfoo\r\n";
2199 /// // If CRLF mode weren't enabled here, then '$' wouldn't match
2200 /// // immediately after 'foo', and thus no match would be found.
2201 /// assert!(re.is_match(hay));
2202 /// ```
2203 ///
2204 /// This example demonstrates that `^` will never match at a position
2205 /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
2206 /// and a `\n`.)
2207 ///
2208 /// ```
2209 /// use regex::bytes::RegexSetBuilder;
2210 ///
2211 /// let re = RegexSetBuilder::new([r"^\n"])
2212 /// .multi_line(true)
2213 /// .crlf(true)
2214 /// .build()
2215 /// .unwrap();
2216 /// assert!(!re.is_match(b"\r\n"));
2217 /// ```
2218 pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
2219 self.builder.crlf(yes);
2220 self
2221 }
2222
2223 /// Configures the line terminator to be used by the regex.
2224 ///
2225 /// The line terminator is relevant in two ways for a particular regex:
2226 ///
2227 /// * When dot-matches-new-line mode is *not* enabled (the default),
2228 /// then `.` will match any character except for the configured line
2229 /// terminator.
2230 /// * When multi-line mode is enabled (not the default), then `^` and
2231 /// `$` will match immediately after and before, respectively, a line
2232 /// terminator.
2233 ///
2234 /// In both cases, if CRLF mode is enabled in a particular context,
2235 /// then it takes precedence over any configured line terminator.
2236 ///
2237 /// This option cannot be configured from within the pattern.
2238 ///
2239 /// The default line terminator is `\n`.
2240 ///
2241 /// # Example
2242 ///
2243 /// This shows how to treat the NUL byte as a line terminator. This can
2244 /// be a useful heuristic when searching binary data.
2245 ///
2246 /// ```
2247 /// use regex::bytes::RegexSetBuilder;
2248 ///
2249 /// let re = RegexSetBuilder::new([r"^foo$"])
2250 /// .multi_line(true)
2251 /// .line_terminator(b'\x00')
2252 /// .build()
2253 /// .unwrap();
2254 /// let hay = b"\x00foo\x00";
2255 /// assert!(re.is_match(hay));
2256 /// ```
2257 ///
2258 /// This example shows that the behavior of `.` is impacted by this
2259 /// setting as well:
2260 ///
2261 /// ```
2262 /// use regex::bytes::RegexSetBuilder;
2263 ///
2264 /// let re = RegexSetBuilder::new([r"."])
2265 /// .line_terminator(b'\x00')
2266 /// .build()
2267 /// .unwrap();
2268 /// assert!(re.is_match(b"\n"));
2269 /// assert!(!re.is_match(b"\x00"));
2270 /// ```
2271 ///
2272 /// This shows that building a regex will work even when the byte given
2273 /// is not ASCII. This is unlike the top-level `RegexSet` API where
2274 /// matching invalid UTF-8 is not allowed.
2275 ///
2276 /// Note though that you must disable Unicode mode. This is required
2277 /// because Unicode mode requires matching one codepoint at a time,
2278 /// and there is no way to match a non-ASCII byte as if it were a
2279 /// codepoint.
2280 ///
2281 /// ```
2282 /// use regex::bytes::RegexSetBuilder;
2283 ///
2284 /// assert!(
2285 /// RegexSetBuilder::new([r"."])
2286 /// .unicode(false)
2287 /// .line_terminator(0x80)
2288 /// .build()
2289 /// .is_ok(),
2290 /// );
2291 /// ```
2292 pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
2293 self.builder.line_terminator(byte);
2294 self
2295 }
2296
2297 /// This configures swap-greed mode for all of the patterns.
2298 ///
2299 /// When swap-greed mode is enabled, patterns like `a+` will become
2300 /// non-greedy and patterns like `a+?` will become greedy. In other
2301 /// words, the meanings of `a+` and `a+?` are switched.
2302 ///
2303 /// This setting can also be configured using the inline flag `U` in
2304 /// the pattern.
2305 ///
2306 /// Note that this is generally not useful for a `RegexSet` since a
2307 /// `RegexSet` can only report whether a pattern matches or not. Since
2308 /// greediness never impacts whether a match is found or not (only the
2309 /// offsets of the match), it follows that whether parts of a pattern
2310 /// are greedy or not doesn't matter for a `RegexSet`.
2311 ///
2312 /// The default for this is `false`.
2313 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
2314 self.builder.swap_greed(yes);
2315 self
2316 }
2317
2318 /// This configures verbose mode for all of the patterns.
2319 ///
/// When enabled, whitespace will be treated as insignificant in the
2321 /// pattern and `#` can be used to start a comment until the next new
2322 /// line.
2323 ///
2324 /// Normally, in most places in a pattern, whitespace is treated
2325 /// literally. For example ` +` will match one or more ASCII whitespace
2326 /// characters.
2327 ///
2328 /// When verbose mode is enabled, `\#` can be used to match a literal
2329 /// `#` and `\ ` can be used to match a literal ASCII whitespace
2330 /// character.
2331 ///
2332 /// Verbose mode is useful for permitting regexes to be formatted and
2333 /// broken up more nicely. This may make them more easily readable.
2334 ///
2335 /// This setting can also be configured using the inline flag `x` in
2336 /// the pattern.
2337 ///
2338 /// The default for this is `false`.
2339 ///
2340 /// # Example
2341 ///
2342 /// ```
2343 /// use regex::bytes::RegexSetBuilder;
2344 ///
2345 /// let pat = r"
2346 /// \b
2347 /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
2348 /// [\s--\n]+ # whitespace should separate names
2349 /// (?: # middle name can be an initial!
2350 /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
2351 /// [\s--\n]+
2352 /// )?
2353 /// (?<last>\p{Uppercase}\w*)
2354 /// \b
2355 /// ";
2356 /// let re = RegexSetBuilder::new([pat])
2357 /// .ignore_whitespace(true)
2358 /// .build()
2359 /// .unwrap();
2360 /// assert!(re.is_match(b"Harry Potter"));
2361 /// assert!(re.is_match(b"Harry J. Potter"));
2362 /// assert!(re.is_match(b"Harry James Potter"));
2363 /// assert!(!re.is_match(b"harry J. Potter"));
2364 /// ```
2365 pub fn ignore_whitespace(
2366 &mut self,
2367 yes: bool,
2368 ) -> &mut RegexSetBuilder {
2369 self.builder.ignore_whitespace(yes);
2370 self
2371 }
2372
2373 /// This configures octal mode for all of the patterns.
2374 ///
2375 /// Octal syntax is a little-known way of uttering Unicode codepoints
2376 /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
2377 /// equivalent patterns, where the last example shows octal syntax.
2378 ///
2379 /// While supporting octal syntax isn't in and of itself a problem,
2380 /// it does make good error messages harder. That is, in PCRE based
2381 /// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
2383 /// backreferences to be supported. Therefore, when octal support
2384 /// is disabled, the error message will explicitly mention that
2385 /// backreferences aren't supported.
2386 ///
2387 /// The default for this is `false`.
2388 ///
2389 /// # Example
2390 ///
2391 /// ```
2392 /// use regex::bytes::RegexSetBuilder;
2393 ///
2394 /// // Normally this pattern would not compile, with an error message
2395 /// // about backreferences not being supported. But with octal mode
2396 /// // enabled, octal escape sequences work.
2397 /// let re = RegexSetBuilder::new([r"\141"])
2398 /// .octal(true)
2399 /// .build()
2400 /// .unwrap();
2401 /// assert!(re.is_match(b"a"));
2402 /// ```
2403 pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
2404 self.builder.octal(yes);
2405 self
2406 }
2407
2408 /// Sets the approximate size limit, in bytes, of the compiled regex.
2409 ///
/// This roughly corresponds to the amount of heap memory, in
2411 /// bytes, occupied by a single regex. If the regex would otherwise
2412 /// approximately exceed this limit, then compiling that regex will
2413 /// fail.
2414 ///
2415 /// The main utility of a method like this is to avoid compiling
2416 /// regexes that use an unexpected amount of resources, such as
2417 /// time and memory. Even if the memory usage of a large regex is
2418 /// acceptable, its search time may not be. Namely, worst case time
2419 /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
2420 /// `n ~ len(haystack)`. That is, search time depends, in part, on the
2421 /// size of the compiled regex. This means that putting a limit on the
2422 /// size of the regex limits how much a regex can impact search time.
2423 ///
2424 /// For more information about regex size limits, see the section on
2425 /// [untrusted inputs](crate#untrusted-input) in the top-level crate
2426 /// documentation.
2427 ///
2428 /// The default for this is some reasonable number that permits most
2429 /// patterns to compile successfully.
2430 ///
2431 /// # Example
2432 ///
2433 /// ```
2434 /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
2435 /// use regex::bytes::RegexSetBuilder;
2436 ///
2437 /// // It may surprise you how big some seemingly small patterns can
2438 /// // be! Since \w is Unicode aware, this generates a regex that can
2439 /// // match approximately 140,000 distinct codepoints.
2440 /// assert!(
2441 /// RegexSetBuilder::new([r"\w"])
2442 /// .size_limit(45_000)
2443 /// .build()
2444 /// .is_err()
2445 /// );
2446 /// ```
2447 pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
2448 self.builder.size_limit(bytes);
2449 self
2450 }
2451
2452 /// Set the approximate capacity, in bytes, of the cache of transitions
2453 /// used by the lazy DFA.
2454 ///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
2458 /// downside of taking worst case exponential time to build.
2459 ///
2460 /// The downside is that it needs to keep a cache of transitions and
2461 /// states that are built while running a search, and this cache
2462 /// can fill up. When it fills up, the cache will reset itself. Any
2463 /// previously generated states and transitions will then need to be
2464 /// re-generated. If this happens too many times, then this library
2465 /// will bail out of using the lazy DFA and switch to a different regex
2466 /// engine.
2467 ///
2468 /// If your regex provokes this particular downside of the lazy DFA,
2469 /// then it may be beneficial to increase its cache capacity. This will
2470 /// potentially reduce the frequency of cache resetting (ideally to
2471 /// `0`). While it won't fix all potential performance problems with
2472 /// the lazy DFA, increasing the cache capacity does fix some.
2473 ///
2474 /// There is no easy way to determine, a priori, whether increasing
2475 /// this cache capacity will help. In general, the larger your regex,
2476 /// the more cache it's likely to use. But that isn't an ironclad rule.
2477 /// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
2480 /// is likely to fill up, even when it's large and even for smallish
2481 /// values of `N`.
2482 ///
2483 /// If you aren't sure whether this helps or not, it is sensible to
2484 /// set this to some arbitrarily large number in testing, such as
2485 /// `usize::MAX`. Namely, this represents the amount of capacity that
2486 /// *may* be used. It's probably not a good idea to use `usize::MAX` in
2487 /// production though, since it implies there are no controls on heap
2488 /// memory used by this library during a search. In effect, set it to
2489 /// whatever you're willing to allocate for a single regex search.
2490 pub fn dfa_size_limit(
2491 &mut self,
2492 bytes: usize,
2493 ) -> &mut RegexSetBuilder {
2494 self.builder.dfa_size_limit(bytes);
2495 self
2496 }
2497
2498 /// Set the nesting limit for this parser.
2499 ///
2500 /// The nesting limit controls how deep the abstract syntax tree is
2501 /// allowed to be. If the AST exceeds the given limit (e.g., with too
2502 /// many nested groups), then an error is returned by the parser.
2503 ///
2504 /// The purpose of this limit is to act as a heuristic to prevent stack
2505 /// overflow for consumers that do structural induction on an AST using
2506 /// explicit recursion. While this crate never does this (instead using
2507 /// constant stack space and moving the call stack to the heap), other
2508 /// crates may.
2509 ///
2510 /// This limit is not checked until the entire AST is parsed.
2511 /// Therefore, if callers want to put a limit on the amount of heap
2512 /// space used, then they should impose a limit on the length, in
2513 /// bytes, of the concrete pattern string. In particular, this is
2514 /// viable since this parser implementation will limit itself to heap
2515 /// space proportional to the length of the pattern string. See also
2516 /// the [untrusted inputs](crate#untrusted-input) section in the
2517 /// top-level crate documentation for more information about this.
2518 ///
2519 /// Note that a nest limit of `0` will return a nest limit error for
2520 /// most patterns but not all. For example, a nest limit of `0` permits
2521 /// `a` but not `ab`, since `ab` requires an explicit concatenation,
2522 /// which results in a nest depth of `1`. In general, a nest limit is
2523 /// not something that manifests in an obvious way in the concrete
2524 /// syntax, therefore, it should not be used in a granular way.
2525 ///
2526 /// # Example
2527 ///
2528 /// ```
2529 /// use regex::bytes::RegexSetBuilder;
2530 ///
2531 /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
2532 /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
2533 /// ```
2534 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
2535 self.builder.nest_limit(limit);
2536 self
2537 }
2538 }
2539}
2540