builders.rs source code [crates/regex-1.10.3/src/builders.rs]

1	#![allow(warnings)]
2
3	// This module defines an internal builder that encapsulates all interaction
4	// with meta::Regex construction, and then 4 public API builders that wrap
5	// around it. The docs are essentially repeated on each of the 4 public
6	// builders, with tweaks to the examples as needed.
7	//
8	// The reason why there are so many builders is partially because of a misstep
9	// in the initial API design: the builder constructor takes in the pattern
10	// strings instead of using the `build` method to accept the pattern strings.
11	// This means `new` has a different signature for each builder. It probably
12	// would have been nicer to use one builder with `fn new()`, and then add
13	// `build(pat)` and `build_many(pats)` constructors.
14	//
15	// The other reason is because I think the `bytes` module should probably
16	// have its own builder type. That way, it is completely isolated from the
17	// top-level API.
18	//
19	// If I could do it again, I'd probably have a `regex::Builder` and a
20	// `regex::bytes::Builder`. Each would have `build` and `build_set` (or
21	// `build_many`) methods for constructing a single pattern `Regex` and a
22	// multi-pattern `RegexSet`, respectively.
23
24	use alloc::{
25	string::{String, ToString},
26	sync::Arc,
27	vec,
28	vec::Vec,
29	};
30
31	use regex_automata::{
32	meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind,
33	};
34
35	use crate::error::Error;
36
37	/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a
38	/// `bytes::RegexSet`.
39	///
40	/// This is essentially the implementation of the four different builder types
41	/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder`
42	/// and `bytes::RegexSetBuilder`.
43	#[derive(Clone, Debug)]
44	struct Builder {
45	pats: Vec<String>,
46	metac: meta::Config,
47	syntaxc: syntax::Config,
48	}
49
50	impl Default for Builder {
51	fn default() -> Builder {
52	let metac: Config = meta::Config::new()
53	.nfa_size_limit(Some(`10` * (`1` << `20`)))
54	.hybrid_cache_capacity(limit:`2` * (`1` << `20`));
55	Builder { pats: vec![], metac, syntaxc: syntax::Config::default() }
56	}
57	}
58
59	impl Builder {
60	fn new<I, S>(patterns: I) -> Builder
61	where
62	S: AsRef<str>,
63	I: IntoIterator<Item = S>,
64	{
65	let mut b = Builder::default();
66	b.pats.extend(patterns.into_iter().map(\|p\| p.as_ref().to_string()));
67	b
68	}
69
70	fn build_one_string(&self) -> Result<crate::Regex, Error> {
71	assert_eq!(`1`, self.pats.len());
72	let metac = self
73	.metac
74	.clone()
75	.match_kind(MatchKind::LeftmostFirst)
76	.utf8_empty(`true`);
77	let syntaxc = self.syntaxc.clone().utf8(`true`);
78	let pattern = Arc::from(self.pats[`0`].as_str());
79	meta::Builder::new()
80	.configure(metac)
81	.syntax(syntaxc)
82	.build(&pattern)
83	.map(\|meta\| crate::Regex { meta, pattern })
84	.map_err(Error::from_meta_build_error)
85	}
86
87	fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> {
88	assert_eq!(`1`, self.pats.len());
89	let metac = self
90	.metac
91	.clone()
92	.match_kind(MatchKind::LeftmostFirst)
93	.utf8_empty(`false`);
94	let syntaxc = self.syntaxc.clone().utf8(`false`);
95	let pattern = Arc::from(self.pats[`0`].as_str());
96	meta::Builder::new()
97	.configure(metac)
98	.syntax(syntaxc)
99	.build(&pattern)
100	.map(\|meta\| crate::bytes::Regex { meta, pattern })
101	.map_err(Error::from_meta_build_error)
102	}
103
104	fn build_many_string(&self) -> Result<crate::RegexSet, Error> {
105	let metac = self
106	.metac
107	.clone()
108	.match_kind(MatchKind::All)
109	.utf8_empty(`true`)
110	.which_captures(WhichCaptures::None);
111	let syntaxc = self.syntaxc.clone().utf8(`true`);
112	let patterns = Arc::from(self.pats.as_slice());
113	meta::Builder::new()
114	.configure(metac)
115	.syntax(syntaxc)
116	.build_many(&patterns)
117	.map(\|meta\| crate::RegexSet { meta, patterns })
118	.map_err(Error::from_meta_build_error)
119	}
120
121	fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> {
122	let metac = self
123	.metac
124	.clone()
125	.match_kind(MatchKind::All)
126	.utf8_empty(`false`)
127	.which_captures(WhichCaptures::None);
128	let syntaxc = self.syntaxc.clone().utf8(`false`);
129	let patterns = Arc::from(self.pats.as_slice());
130	meta::Builder::new()
131	.configure(metac)
132	.syntax(syntaxc)
133	.build_many(&patterns)
134	.map(\|meta\| crate::bytes::RegexSet { meta, patterns })
135	.map_err(Error::from_meta_build_error)
136	}
137
138	fn case_insensitive(&mut self, yes: bool) -> &mut Builder {
139	self.syntaxc = self.syntaxc.case_insensitive(yes);
140	self
141	}
142
143	fn multi_line(&mut self, yes: bool) -> &mut Builder {
144	self.syntaxc = self.syntaxc.multi_line(yes);
145	self
146	}
147
148	fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder {
149	self.syntaxc = self.syntaxc.dot_matches_new_line(yes);
150	self
151	}
152
153	fn crlf(&mut self, yes: bool) -> &mut Builder {
154	self.syntaxc = self.syntaxc.crlf(yes);
155	self
156	}
157
158	fn line_terminator(&mut self, byte: u8) -> &mut Builder {
159	self.metac = self.metac.clone().line_terminator(byte);
160	self.syntaxc = self.syntaxc.line_terminator(byte);
161	self
162	}
163
164	fn swap_greed(&mut self, yes: bool) -> &mut Builder {
165	self.syntaxc = self.syntaxc.swap_greed(yes);
166	self
167	}
168
169	fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder {
170	self.syntaxc = self.syntaxc.ignore_whitespace(yes);
171	self
172	}
173
174	fn unicode(&mut self, yes: bool) -> &mut Builder {
175	self.syntaxc = self.syntaxc.unicode(yes);
176	self
177	}
178
179	fn octal(&mut self, yes: bool) -> &mut Builder {
180	self.syntaxc = self.syntaxc.octal(yes);
181	self
182	}
183
184	fn size_limit(&mut self, limit: usize) -> &mut Builder {
185	self.metac = self.metac.clone().nfa_size_limit(Some(limit));
186	self
187	}
188
189	fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder {
190	self.metac = self.metac.clone().hybrid_cache_capacity(limit);
191	self
192	}
193
194	fn nest_limit(&mut self, limit: u32) -> &mut Builder {
195	self.syntaxc = self.syntaxc.nest_limit(limit);
196	self
197	}
198	}
199
200	pub(crate) mod string {
201	use crate::{error::Error, Regex, RegexSet};
202
203	use super::Builder;
204
205	/// A configurable builder for a [`Regex`].
206	///
207	/// This builder can be used to programmatically set flags such as `i`
208	/// (case insensitive) and `x` (for verbose mode). This builder can also be
209	/// used to configure things like the line terminator and a size limit on
210	/// the compiled regular expression.
211	#[derive(Clone, Debug)]
212	pub struct RegexBuilder {
213	builder: Builder,
214	}
215
216	impl RegexBuilder {
217	/// Create a new builder with a default configuration for the given
218	/// pattern.
219	///
220	/// If the pattern is invalid or exceeds the configured size limits,
221	/// then an error will be returned when [`RegexBuilder::build`] is
222	/// called.
223	pub fn new(pattern: &str) -> RegexBuilder {
224	RegexBuilder { builder: Builder::new([pattern]) }
225	}
226
227	/// Compiles the pattern given to `RegexBuilder::new` with the
228	/// configuration set on this builder.
229	///
230	/// If the pattern isn't a valid regex or if a configured size limit
231	/// was exceeded, then an error is returned.
232	pub fn build(&self) -> Result<Regex, Error> {
233	self.builder.build_one_string()
234	}
235
236	/// This configures Unicode mode for the entire pattern.
237	///
238	/// Enabling Unicode mode does a number of things:
239	///
240	/// Most fundamentally, it causes the fundamental atom of matching*
241	/// to be a single codepoint. When Unicode mode is disabled, it's a
242	/// single byte. For example, when Unicode mode is enabled, `.` will
243	/// match `💩` once, where as it will match 4 times when Unicode mode
244	/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
245	/// Case insensitive matching uses Unicode simple case folding rules.*
246	/// Unicode character classes like `\p{Letter}` and `\p{Greek}` are*
247	/// available.
248	/// Perl character classes are Unicode aware. That is, `\w`, `\s` and*
249	/// `\d`.
250	/// The word boundary assertions, `\b` and `\B`, use the Unicode*
251	/// definition of a word character.
252	///
253	/// Note that if Unicode mode is disabled, then the regex will fail to
254	/// compile if it could match invalid UTF-8. For example, when Unicode
255	/// mode is disabled, then since `.` matches any byte (except for
256	/// `\n`), then it can match invalid UTF-8 and thus building a regex
257	/// from it will fail. Another example is `\w` and `\W`. Since `\w` can
258	/// only match ASCII bytes when Unicode mode is disabled, it's allowed.
259	/// But `\W` can match more than ASCII bytes, including invalid UTF-8,
260	/// and so it is not allowed. This restriction can be lifted only by
261	/// using a [`bytes::Regex`](crate::bytes::Regex).
262	///
263	/// For more details on the Unicode support in this crate, see the
264	/// [Unicode section](crate#unicode) in this crate's top-level
265	/// documentation.
266	///
267	/// The default for this is `true`.
268	///
269	/// # Example
270	///
271	/// ```
272	/// use regex::RegexBuilder;
273	///
274	/// let re = RegexBuilder::new(r"\w")
275	/// .unicode(`false`)
276	/// .build()
277	/// .unwrap();
278	/// // Normally greek letters would be included in \w, but since
279	/// // Unicode mode is disabled, it only matches ASCII letters.
280	/// assert!(!re.is_match("δ"));
281	///
282	/// let re = RegexBuilder::new(r"s")
283	/// .case_insensitive(`true`)
284	/// .unicode(`false`)
285	/// .build()
286	/// .unwrap();
287	/// // Normally 'ſ' is included when searching for 's' case
288	/// // insensitively due to Unicode's simple case folding rules. But
289	/// // when Unicode mode is disabled, only ASCII case insensitive rules
290	/// // are used.
291	/// assert!(!re.is_match("ſ"));
292	/// ```
293	pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
294	self.builder.unicode(yes);
295	self
296	}
297
298	/// This configures whether to enable case insensitive matching for the
299	/// entire pattern.
300	///
301	/// This setting can also be configured using the inline flag `i`
302	/// in the pattern. For example, `(?i:foo)` matches `foo` case
303	/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
304	///
305	/// The default for this is `false`.
306	///
307	/// # Example
308	///
309	/// ```
310	/// use regex::RegexBuilder;
311	///
312	/// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
313	/// .case_insensitive(`true`)
314	/// .build()
315	/// .unwrap();
316	/// assert!(re.is_match("FoObarQuUx"));
317	/// // Even though case insensitive matching is enabled in the builder,
318	/// // it can be locally disabled within the pattern. In this case,
319	/// // `bar` is matched case sensitively.
320	/// assert!(!re.is_match("fooBARquux"));
321	/// ```
322	pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
323	self.builder.case_insensitive(yes);
324	self
325	}
326
327	/// This configures multi-line mode for the entire pattern.
328	///
329	/// Enabling multi-line mode changes the behavior of the `^` and `$`
330	/// anchor assertions. Instead of only matching at the beginning and
331	/// end of a haystack, respectively, multi-line mode causes them to
332	/// match at the beginning and end of a line in addition* to the*
333	/// beginning and end of a haystack. More precisely, `^` will match at
334	/// the position immediately following a `\n` and `$` will match at the
335	/// position immediately preceding a `\n`.
336	///
337	/// The behavior of this option can be impacted by other settings too:
338	///
339	/// The* [`RegexBuilder::line_terminator`] option changes `\n` above
340	/// to any ASCII byte.
341	/// The* [`RegexBuilder::crlf`] option changes the line terminator to
342	/// be either `\r` or `\n`, but never at the position between a `\r`
343	/// and `\n`.
344	///
345	/// This setting can also be configured using the inline flag `m` in
346	/// the pattern.
347	///
348	/// The default for this is `false`.
349	///
350	/// # Example
351	///
352	/// ```
353	/// use regex::RegexBuilder;
354	///
355	/// let re = RegexBuilder::new(r"^foo$")
356	/// .multi_line(`true`)
357	/// .build()
358	/// .unwrap();
359	/// assert_eq!(Some(`1`..`4`), re.find("`\n`foo`\n`").map(\|m\| m.range()));
360	/// ```
361	pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
362	self.builder.multi_line(yes);
363	self
364	}
365
366	/// This configures dot-matches-new-line mode for the entire pattern.
367	///
368	/// Perhaps surprisingly, the default behavior for `.` is not to match
369	/// any character, but rather, to match any character except for the
370	/// line terminator (which is `\n` by default). When this mode is
371	/// enabled, the behavior changes such that `.` truly matches any
372	/// character.
373	///
374	/// This setting can also be configured using the inline flag `s` in
375	/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
376	/// regexes.
377	///
378	/// The default for this is `false`.
379	///
380	/// # Example
381	///
382	/// ```
383	/// use regex::RegexBuilder;
384	///
385	/// let re = RegexBuilder::new(r"foo.bar")
386	/// .dot_matches_new_line(`true`)
387	/// .build()
388	/// .unwrap();
389	/// let hay = "foo`\n`bar";
390	/// assert_eq!(Some("foo`\n`bar"), re.find(hay).map(\|m\| m.as_str()));
391	/// ```
392	pub fn dot_matches_new_line(
393	&mut self,
394	yes: bool,
395	) -> &mut RegexBuilder {
396	self.builder.dot_matches_new_line(yes);
397	self
398	}
399
400	/// This configures CRLF mode for the entire pattern.
401	///
402	/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
403	/// short) and `\n` ("line feed" or LF for short) are treated as line
404	/// terminators. This results in the following:
405	///
406	/// Unless dot-matches-new-line mode is enabled, `.` will now match*
407	/// any character except for `\n` and `\r`.
408	/// When multi-line mode is enabled, `^` will match immediately*
409	/// following a `\n` or a `\r`. Similarly, `$` will match immediately
410	/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
411	/// between `\r` and `\n`.
412	///
413	/// This setting can also be configured using the inline flag `R` in
414	/// the pattern.
415	///
416	/// The default for this is `false`.
417	///
418	/// # Example
419	///
420	/// ```
421	/// use regex::RegexBuilder;
422	///
423	/// let re = RegexBuilder::new(r"^foo$")
424	/// .multi_line(`true`)
425	/// .crlf(`true`)
426	/// .build()
427	/// .unwrap();
428	/// let hay = "`\r\n`foo`\r\n`";
429	/// // If CRLF mode weren't enabled here, then '$' wouldn't match
430	/// // immediately after 'foo', and thus no match would be found.
431	/// assert_eq!(Some("foo"), re.find(hay).map(\|m\| m.as_str()));
432	/// ```
433	///
434	/// This example demonstrates that `^` will never match at a position
435	/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
436	/// and a `\n`.)
437	///
438	/// ```
439	/// use regex::RegexBuilder;
440	///
441	/// let re = RegexBuilder::new(r"^")
442	/// .multi_line(`true`)
443	/// .crlf(`true`)
444	/// .build()
445	/// .unwrap();
446	/// let hay = "`\r\n\r\n`";
447	/// let ranges: Vec<_> = re.find_iter(hay).map(\|m\| m.range()).collect();
448	/// assert_eq!(ranges, vec![`0`..`0`, `2`..`2`, `4`..`4`]);
449	/// ```
450	pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
451	self.builder.crlf(yes);
452	self
453	}
454
455	/// Configures the line terminator to be used by the regex.
456	///
457	/// The line terminator is relevant in two ways for a particular regex:
458	///
459	/// When dot-matches-new-line mode is not enabled (the default),*
460	/// then `.` will match any character except for the configured line
461	/// terminator.
462	/// When multi-line mode is enabled (not the default), then `^` and*
463	/// `$` will match immediately after and before, respectively, a line
464	/// terminator.
465	///
466	/// In both cases, if CRLF mode is enabled in a particular context,
467	/// then it takes precedence over any configured line terminator.
468	///
469	/// This option cannot be configured from within the pattern.
470	///
471	/// The default line terminator is `\n`.
472	///
473	/// # Example
474	///
475	/// This shows how to treat the NUL byte as a line terminator. This can
476	/// be a useful heuristic when searching binary data.
477	///
478	/// ```
479	/// use regex::RegexBuilder;
480	///
481	/// let re = RegexBuilder::new(r"^foo$")
482	/// .multi_line(`true`)
483	/// .line_terminator(b'`\x00`')
484	/// .build()
485	/// .unwrap();
486	/// let hay = "`\x00`foo`\x00`";
487	/// assert_eq!(Some(`1`..`4`), re.find(hay).map(\|m\| m.range()));
488	/// ```
489	///
490	/// This example shows that the behavior of `.` is impacted by this
491	/// setting as well:
492	///
493	/// ```
494	/// use regex::RegexBuilder;
495	///
496	/// let re = RegexBuilder::new(r".")
497	/// .line_terminator(b'`\x00`')
498	/// .build()
499	/// .unwrap();
500	/// assert!(re.is_match("`\n`"));
501	/// assert!(!re.is_match("`\x00`"));
502	/// ```
503	///
504	/// This shows that building a regex will fail if the byte given
505	/// is not ASCII and the pattern could result in matching invalid
506	/// UTF-8. This is because any singular non-ASCII byte is not valid
507	/// UTF-8, and it is not permitted for a [`Regex`] to match invalid
508	/// UTF-8. (It is permissible to use a non-ASCII byte when building a
509	/// [`bytes::Regex`](crate::bytes::Regex).)
510	///
511	/// ```
512	/// use regex::RegexBuilder;
513	///
514	/// assert!(RegexBuilder::new(r".").line_terminator(`0x80`).build().is_err());
515	/// // Note that using a non-ASCII byte isn't enough on its own to
516	/// // cause regex compilation to fail. You actually have to make use
517	/// // of it in the regex in a way that leads to matching invalid
518	/// // UTF-8. If you don't, then regex compilation will succeed!
519	/// assert!(RegexBuilder::new(r"a").line_terminator(`0x80`).build().is_ok());
520	/// ```
521	pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
522	self.builder.line_terminator(byte);
523	self
524	}
525
526	/// This configures swap-greed mode for the entire pattern.
527	///
528	/// When swap-greed mode is enabled, patterns like `a+` will become
529	/// non-greedy and patterns like `a+?` will become greedy. In other
530	/// words, the meanings of `a+` and `a+?` are switched.
531	///
532	/// This setting can also be configured using the inline flag `U` in
533	/// the pattern.
534	///
535	/// The default for this is `false`.
536	///
537	/// # Example
538	///
539	/// ```
540	/// use regex::RegexBuilder;
541	///
542	/// let re = RegexBuilder::new(r"a+")
543	/// .swap_greed(`true`)
544	/// .build()
545	/// .unwrap();
546	/// assert_eq!(Some("a"), re.find("aaa").map(\|m\| m.as_str()));
547	/// ```
548	pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
549	self.builder.swap_greed(yes);
550	self
551	}
552
553	/// This configures verbose mode for the entire pattern.
554	///
555	/// When enabled, whitespace will treated as insignifcant in the
556	/// pattern and `#` can be used to start a comment until the next new
557	/// line.
558	///
559	/// Normally, in most places in a pattern, whitespace is treated
560	/// literally. For example ` +` will match one or more ASCII whitespace
561	/// characters.
562	///
563	/// When verbose mode is enabled, `\#` can be used to match a literal
564	/// `#` and `\ ` can be used to match a literal ASCII whitespace
565	/// character.
566	///
567	/// Verbose mode is useful for permitting regexes to be formatted and
568	/// broken up more nicely. This may make them more easily readable.
569	///
570	/// This setting can also be configured using the inline flag `x` in
571	/// the pattern.
572	///
573	/// The default for this is `false`.
574	///
575	/// # Example
576	///
577	/// ```
578	/// use regex::RegexBuilder;
579	///
580	/// let pat = r"
581	/// \b
582	/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
583	/// [\s--\n]+ # whitespace should separate names
584	/// (?: # middle name can be an initial!
585	/// (?:(?<initial>\p{Uppercase})\.\|(?<middle>\p{Uppercase}\w*))
586	/// [\s--\n]+
587	/// )?
588	/// (?<last>\p{Uppercase}\w*)
589	/// \b
590	/// ";
591	/// let re = RegexBuilder::new(pat)
592	/// .ignore_whitespace(`true`)
593	/// .build()
594	/// .unwrap();
595	///
596	/// let caps = re.captures("Harry Potter").unwrap();
597	/// assert_eq!("Harry", &caps["first"]);
598	/// assert_eq!("Potter", &caps["last"]);
599	///
600	/// let caps = re.captures("Harry J. Potter").unwrap();
601	/// assert_eq!("Harry", &caps["first"]);
602	/// // Since a middle name/initial isn't required for an overall match,
603	/// // we can't assume that 'initial' or 'middle' will be populated!
604	/// assert_eq!(Some("J"), caps.name("initial").map(\|m\| m.as_str()));
605	/// assert_eq!(None, caps.name("middle").map(\|m\| m.as_str()));
606	/// assert_eq!("Potter", &caps["last"]);
607	///
608	/// let caps = re.captures("Harry James Potter").unwrap();
609	/// assert_eq!("Harry", &caps["first"]);
610	/// // Since a middle name/initial isn't required for an overall match,
611	/// // we can't assume that 'initial' or 'middle' will be populated!
612	/// assert_eq!(None, caps.name("initial").map(\|m\| m.as_str()));
613	/// assert_eq!(Some("James"), caps.name("middle").map(\|m\| m.as_str()));
614	/// assert_eq!("Potter", &caps["last"]);
615	/// ```
616	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
617	self.builder.ignore_whitespace(yes);
618	self
619	}
620
621	/// This configures octal mode for the entire pattern.
622	///
623	/// Octal syntax is a little-known way of uttering Unicode codepoints
624	/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
625	/// equivalent patterns, where the last example shows octal syntax.
626	///
627	/// While supporting octal syntax isn't in and of itself a problem,
628	/// it does make good error messages harder. That is, in PCRE based
629	/// regex engines, syntax like `\1` invokes a backreference, which is
630	/// explicitly unsupported this library. However, many users expect
631	/// backreferences to be supported. Therefore, when octal support
632	/// is disabled, the error message will explicitly mention that
633	/// backreferences aren't supported.
634	///
635	/// The default for this is `false`.
636	///
637	/// # Example
638	///
639	/// ```
640	/// use regex::RegexBuilder;
641	///
642	/// // Normally this pattern would not compile, with an error message
643	/// // about backreferences not being supported. But with octal mode
644	/// // enabled, octal escape sequences work.
645	/// let re = RegexBuilder::new(r"\141")
646	/// .octal(`true`)
647	/// .build()
648	/// .unwrap();
649	/// assert!(re.is_match("a"));
650	/// ```
651	pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
652	self.builder.octal(yes);
653	self
654	}
655
656	/// Sets the approximate size limit, in bytes, of the compiled regex.
657	///
658	/// This roughly corresponds to the number of heap memory, in
659	/// bytes, occupied by a single regex. If the regex would otherwise
660	/// approximately exceed this limit, then compiling that regex will
661	/// fail.
662	///
663	/// The main utility of a method like this is to avoid compiling
664	/// regexes that use an unexpected amount of resources, such as
665	/// time and memory. Even if the memory usage of a large regex is
666	/// acceptable, its search time may not be. Namely, worst case time
667	/// complexity for search is `O(m n)`, where `m ~ len(pattern)` and*
668	/// `n ~ len(haystack)`. That is, search time depends, in part, on the
669	/// size of the compiled regex. This means that putting a limit on the
670	/// size of the regex limits how much a regex can impact search time.
671	///
672	/// For more information about regex size limits, see the section on
673	/// [untrusted inputs](crate#untrusted-input) in the top-level crate
674	/// documentation.
675	///
676	/// The default for this is some reasonable number that permits most
677	/// patterns to compile successfully.
678	///
679	/// # Example
680	///
681	/// ```
682	/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
683	/// use regex::RegexBuilder;
684	///
685	/// // It may surprise you how big some seemingly small patterns can
686	/// // be! Since \w is Unicode aware, this generates a regex that can
687	/// // match approximately 140,000 distinct codepoints.
688	/// assert!(RegexBuilder::new(r"\w").size_limit(`45_000`).build().is_err());
689	/// ```
690	pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
691	self.builder.size_limit(bytes);
692	self
693	}
694
695	/// Set the approximate capacity, in bytes, of the cache of transitions
696	/// used by the lazy DFA.
697	///
698	/// While the lazy DFA isn't always used, in tends to be the most
699	/// commonly use regex engine in default configurations. It tends to
700	/// adopt the performance profile of a fully build DFA, but without the
701	/// downside of taking worst case exponential time to build.
702	///
703	/// The downside is that it needs to keep a cache of transitions and
704	/// states that are built while running a search, and this cache
705	/// can fill up. When it fills up, the cache will reset itself. Any
706	/// previously generated states and transitions will then need to be
707	/// re-generated. If this happens too many times, then this library
708	/// will bail out of using the lazy DFA and switch to a different regex
709	/// engine.
710	///
711	/// If your regex provokes this particular downside of the lazy DFA,
712	/// then it may be beneficial to increase its cache capacity. This will
713	/// potentially reduce the frequency of cache resetting (ideally to
714	/// `0`). While it won't fix all potential performance problems with
715	/// the lazy DFA, increasing the cache capacity does fix some.
716	///
717	/// There is no easy way to determine, a priori, whether increasing
718	/// this cache capacity will help. In general, the larger your regex,
719	/// the more cache it's likely to use. But that isn't an ironclad rule.
720	/// For example, a regex like `[01]1[01]{N}` would normally produce a*
721	/// fully build DFA that is exponential in size with respect to `N`.
722	/// The lazy DFA will prevent exponential space blow-up, but it cache
723	/// is likely to fill up, even when it's large and even for smallish
724	/// values of `N`.
725	///
726	/// If you aren't sure whether this helps or not, it is sensible to
727	/// set this to some arbitrarily large number in testing, such as
728	/// `usize::MAX`. Namely, this represents the amount of capacity that
729	/// may* be used. It's probably not a good idea to use `usize::MAX` in*
730	/// production though, since it implies there are no controls on heap
731	/// memory used by this library during a search. In effect, set it to
732	/// whatever you're willing to allocate for a single regex search.
733	pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
734	self.builder.dfa_size_limit(bytes);
735	self
736	}
737
738	/// Set the nesting limit for this parser.
739	///
740	/// The nesting limit controls how deep the abstract syntax tree is
741	/// allowed to be. If the AST exceeds the given limit (e.g., with too
742	/// many nested groups), then an error is returned by the parser.
743	///
744	/// The purpose of this limit is to act as a heuristic to prevent stack
745	/// overflow for consumers that do structural induction on an AST using
746	/// explicit recursion. While this crate never does this (instead using
747	/// constant stack space and moving the call stack to the heap), other
748	/// crates may.
749	///
750	/// This limit is not checked until the entire AST is parsed.
751	/// Therefore, if callers want to put a limit on the amount of heap
752	/// space used, then they should impose a limit on the length, in
753	/// bytes, of the concrete pattern string. In particular, this is
754	/// viable since this parser implementation will limit itself to heap
755	/// space proportional to the length of the pattern string. See also
756	/// the [untrusted inputs](crate#untrusted-input) section in the
757	/// top-level crate documentation for more information about this.
758	///
759	/// Note that a nest limit of `0` will return a nest limit error for
760	/// most patterns but not all. For example, a nest limit of `0` permits
761	/// `a` but not `ab`, since `ab` requires an explicit concatenation,
762	/// which results in a nest depth of `1`. In general, a nest limit is
763	/// not something that manifests in an obvious way in the concrete
764	/// syntax, therefore, it should not be used in a granular way.
765	///
766	/// # Example
767	///
768	/// ```
769	/// use regex::RegexBuilder;
770	///
771	/// assert!(RegexBuilder::new(r"a").nest_limit(`0`).build().is_ok());
772	/// assert!(RegexBuilder::new(r"ab").nest_limit(`0`).build().is_err());
773	/// ```
774	pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
775	self.builder.nest_limit(limit);
776	self
777	}
778	}
779
780	/// A configurable builder for a [`RegexSet`].
781	///
782	/// This builder can be used to programmatically set flags such as
783	/// `i` (case insensitive) and `x` (for verbose mode). This builder
784	/// can also be used to configure things like the line terminator
785	/// and a size limit on the compiled regular expression.
786	#[derive(Clone, Debug)]
787	pub struct RegexSetBuilder {
788	builder: Builder,
789	}
790
791	impl RegexSetBuilder {
792	/// Create a new builder with a default configuration for the given
793	/// patterns.
794	///
795	/// If the patterns are invalid or exceed the configured size limits,
796	/// then an error will be returned when [`RegexSetBuilder::build`] is
797	/// called.
798	pub fn new<I, S>(patterns: I) -> RegexSetBuilder
799	where
800	I: IntoIterator<Item = S>,
801	S: AsRef<str>,
802	{
803	RegexSetBuilder { builder: Builder::new(patterns) }
804	}
805
806	/// Compiles the patterns given to `RegexSetBuilder::new` with the
807	/// configuration set on this builder.
808	///
809	/// If the patterns aren't valid regexes or if a configured size limit
810	/// was exceeded, then an error is returned.
811	pub fn build(&self) -> Result<RegexSet, Error> {
812	self.builder.build_many_string()
813	}
814
815	/// This configures Unicode mode for the all of the patterns.
816	///
817	/// Enabling Unicode mode does a number of things:
818	///
819	/// Most fundamentally, it causes the fundamental atom of matching*
820	/// to be a single codepoint. When Unicode mode is disabled, it's a
821	/// single byte. For example, when Unicode mode is enabled, `.` will
822	/// match `💩` once, where as it will match 4 times when Unicode mode
823	/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
824	/// Case insensitive matching uses Unicode simple case folding rules.*
825	/// Unicode character classes like `\p{Letter}` and `\p{Greek}` are*
826	/// available.
827	/// Perl character classes are Unicode aware. That is, `\w`, `\s` and*
828	/// `\d`.
829	/// The word boundary assertions, `\b` and `\B`, use the Unicode*
830	/// definition of a word character.
831	///
832	/// Note that if Unicode mode is disabled, then the regex will fail to
833	/// compile if it could match invalid UTF-8. For example, when Unicode
834	/// mode is disabled, then since `.` matches any byte (except for
835	/// `\n`), then it can match invalid UTF-8 and thus building a regex
836	/// from it will fail. Another example is `\w` and `\W`. Since `\w` can
837	/// only match ASCII bytes when Unicode mode is disabled, it's allowed.
838	/// But `\W` can match more than ASCII bytes, including invalid UTF-8,
839	/// and so it is not allowed. This restriction can be lifted only by
840	/// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
841	///
842	/// For more details on the Unicode support in this crate, see the
843	/// [Unicode section](crate#unicode) in this crate's top-level
844	/// documentation.
845	///
846	/// The default for this is `true`.
847	///
848	/// # Example
849	///
850	/// ```
851	/// use regex::RegexSetBuilder;
852	///
853	/// let re = RegexSetBuilder::new([r"\w"])
854	/// .unicode(`false`)
855	/// .build()
856	/// .unwrap();
857	/// // Normally greek letters would be included in \w, but since
858	/// // Unicode mode is disabled, it only matches ASCII letters.
859	/// assert!(!re.is_match("δ"));
860	///
861	/// let re = RegexSetBuilder::new([r"s"])
862	/// .case_insensitive(`true`)
863	/// .unicode(`false`)
864	/// .build()
865	/// .unwrap();
866	/// // Normally 'ſ' is included when searching for 's' case
867	/// // insensitively due to Unicode's simple case folding rules. But
868	/// // when Unicode mode is disabled, only ASCII case insensitive rules
869	/// // are used.
870	/// assert!(!re.is_match("ſ"));
871	/// ```
872	pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
873	self.builder.unicode(yes);
874	self
875	}
876
877	/// This configures whether to enable case insensitive matching for all
878	/// of the patterns.
879	///
880	/// This setting can also be configured using the inline flag `i`
881	/// in the pattern. For example, `(?i:foo)` matches `foo` case
882	/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
883	///
884	/// The default for this is `false`.
885	///
886	/// # Example
887	///
888	/// ```
889	/// use regex::RegexSetBuilder;
890	///
891	/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
892	/// .case_insensitive(`true`)
893	/// .build()
894	/// .unwrap();
895	/// assert!(re.is_match("FoObarQuUx"));
896	/// // Even though case insensitive matching is enabled in the builder,
897	/// // it can be locally disabled within the pattern. In this case,
898	/// // `bar` is matched case sensitively.
899	/// assert!(!re.is_match("fooBARquux"));
900	/// ```
901	pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
902	self.builder.case_insensitive(yes);
903	self
904	}
905
906	/// This configures multi-line mode for all of the patterns.
907	///
908	/// Enabling multi-line mode changes the behavior of the `^` and `$`
909	/// anchor assertions. Instead of only matching at the beginning and
910	/// end of a haystack, respectively, multi-line mode causes them to
911	/// match at the beginning and end of a line in addition* to the*
912	/// beginning and end of a haystack. More precisely, `^` will match at
913	/// the position immediately following a `\n` and `$` will match at the
914	/// position immediately preceding a `\n`.
915	///
916	/// The behavior of this option can be impacted by other settings too:
917	///
918	/// The* [`RegexSetBuilder::line_terminator`] option changes `\n`
919	/// above to any ASCII byte.
920	/// The* [`RegexSetBuilder::crlf`] option changes the line terminator
921	/// to be either `\r` or `\n`, but never at the position between a `\r`
922	/// and `\n`.
923	///
924	/// This setting can also be configured using the inline flag `m` in
925	/// the pattern.
926	///
927	/// The default for this is `false`.
928	///
929	/// # Example
930	///
931	/// ```
932	/// use regex::RegexSetBuilder;
933	///
934	/// let re = RegexSetBuilder::new([r"^foo$"])
935	/// .multi_line(`true`)
936	/// .build()
937	/// .unwrap();
938	/// assert!(re.is_match("`\n`foo`\n`"));
939	/// ```
940	pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
941	self.builder.multi_line(yes);
942	self
943	}
944
945	/// This configures dot-matches-new-line mode for the entire pattern.
946	///
947	/// Perhaps surprisingly, the default behavior for `.` is not to match
948	/// any character, but rather, to match any character except for the
949	/// line terminator (which is `\n` by default). When this mode is
950	/// enabled, the behavior changes such that `.` truly matches any
951	/// character.
952	///
953	/// This setting can also be configured using the inline flag `s` in
954	/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
955	/// regexes.
956	///
957	/// The default for this is `false`.
958	///
959	/// # Example
960	///
961	/// ```
962	/// use regex::RegexSetBuilder;
963	///
964	/// let re = RegexSetBuilder::new([r"foo.bar"])
965	/// .dot_matches_new_line(`true`)
966	/// .build()
967	/// .unwrap();
968	/// let hay = "foo`\n`bar";
969	/// assert!(re.is_match(hay));
970	/// ```
971	pub fn dot_matches_new_line(
972	&mut self,
973	yes: bool,
974	) -> &mut RegexSetBuilder {
975	self.builder.dot_matches_new_line(yes);
976	self
977	}
978
979	/// This configures CRLF mode for all of the patterns.
980	///
981	/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
982	/// short) and `\n` ("line feed" or LF for short) are treated as line
983	/// terminators. This results in the following:
984	///
985	/// Unless dot-matches-new-line mode is enabled, `.` will now match*
986	/// any character except for `\n` and `\r`.
987	/// When multi-line mode is enabled, `^` will match immediately*
988	/// following a `\n` or a `\r`. Similarly, `$` will match immediately
989	/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
990	/// between `\r` and `\n`.
991	///
992	/// This setting can also be configured using the inline flag `R` in
993	/// the pattern.
994	///
995	/// The default for this is `false`.
996	///
997	/// # Example
998	///
999	/// ```
1000	/// use regex::RegexSetBuilder;
1001	///
1002	/// let re = RegexSetBuilder::new([r"^foo$"])
1003	/// .multi_line(`true`)
1004	/// .crlf(`true`)
1005	/// .build()
1006	/// .unwrap();
1007	/// let hay = "`\r\n`foo`\r\n`";
1008	/// // If CRLF mode weren't enabled here, then '$' wouldn't match
1009	/// // immediately after 'foo', and thus no match would be found.
1010	/// assert!(re.is_match(hay));
1011	/// ```
1012	///
1013	/// This example demonstrates that `^` will never match at a position
1014	/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1015	/// and a `\n`.)
1016	///
1017	/// ```
1018	/// use regex::RegexSetBuilder;
1019	///
1020	/// let re = RegexSetBuilder::new([r"^\n"])
1021	/// .multi_line(`true`)
1022	/// .crlf(`true`)
1023	/// .build()
1024	/// .unwrap();
1025	/// assert!(!re.is_match("`\r\n`"));
1026	/// ```
1027	pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
1028	self.builder.crlf(yes);
1029	self
1030	}
1031
1032	/// Configures the line terminator to be used by the regex.
1033	///
1034	/// The line terminator is relevant in two ways for a particular regex:
1035	///
1036	/// When dot-matches-new-line mode is not enabled (the default),*
1037	/// then `.` will match any character except for the configured line
1038	/// terminator.
1039	/// When multi-line mode is enabled (not the default), then `^` and*
1040	/// `$` will match immediately after and before, respectively, a line
1041	/// terminator.
1042	///
1043	/// In both cases, if CRLF mode is enabled in a particular context,
1044	/// then it takes precedence over any configured line terminator.
1045	///
1046	/// This option cannot be configured from within the pattern.
1047	///
1048	/// The default line terminator is `\n`.
1049	///
1050	/// # Example
1051	///
1052	/// This shows how to treat the NUL byte as a line terminator. This can
1053	/// be a useful heuristic when searching binary data.
1054	///
1055	/// ```
1056	/// use regex::RegexSetBuilder;
1057	///
1058	/// let re = RegexSetBuilder::new([r"^foo$"])
1059	/// .multi_line(`true`)
1060	/// .line_terminator(b'`\x00`')
1061	/// .build()
1062	/// .unwrap();
1063	/// let hay = "`\x00`foo`\x00`";
1064	/// assert!(re.is_match(hay));
1065	/// ```
1066	///
1067	/// This example shows that the behavior of `.` is impacted by this
1068	/// setting as well:
1069	///
1070	/// ```
1071	/// use regex::RegexSetBuilder;
1072	///
1073	/// let re = RegexSetBuilder::new([r"."])
1074	/// .line_terminator(b'`\x00`')
1075	/// .build()
1076	/// .unwrap();
1077	/// assert!(re.is_match("`\n`"));
1078	/// assert!(!re.is_match("`\x00`"));
1079	/// ```
1080	///
1081	/// This shows that building a regex will fail if the byte given
1082	/// is not ASCII and the pattern could result in matching invalid
1083	/// UTF-8. This is because any singular non-ASCII byte is not valid
1084	/// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
1085	/// UTF-8. (It is permissible to use a non-ASCII byte when building a
1086	/// [`bytes::RegexSet`](crate::bytes::RegexSet).)
1087	///
1088	/// ```
1089	/// use regex::RegexSetBuilder;
1090	///
1091	/// assert!(
1092	/// RegexSetBuilder::new([r"."])
1093	/// .line_terminator(`0x80`)
1094	/// .build()
1095	/// .is_err()
1096	/// );
1097	/// // Note that using a non-ASCII byte isn't enough on its own to
1098	/// // cause regex compilation to fail. You actually have to make use
1099	/// // of it in the regex in a way that leads to matching invalid
1100	/// // UTF-8. If you don't, then regex compilation will succeed!
1101	/// assert!(
1102	/// RegexSetBuilder::new([r"a"])
1103	/// .line_terminator(`0x80`)
1104	/// .build()
1105	/// .is_ok()
1106	/// );
1107	/// ```
1108	pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
1109	self.builder.line_terminator(byte);
1110	self
1111	}
1112
1113	/// This configures swap-greed mode for all of the patterns.
1114	///
1115	/// When swap-greed mode is enabled, patterns like `a+` will become
1116	/// non-greedy and patterns like `a+?` will become greedy. In other
1117	/// words, the meanings of `a+` and `a+?` are switched.
1118	///
1119	/// This setting can also be configured using the inline flag `U` in
1120	/// the pattern.
1121	///
1122	/// Note that this is generally not useful for a `RegexSet` since a
1123	/// `RegexSet` can only report whether a pattern matches or not. Since
1124	/// greediness never impacts whether a match is found or not (only the
1125	/// offsets of the match), it follows that whether parts of a pattern
1126	/// are greedy or not doesn't matter for a `RegexSet`.
1127	///
1128	/// The default for this is `false`.
1129	pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
1130	self.builder.swap_greed(yes);
1131	self
1132	}
1133
1134	/// This configures verbose mode for all of the patterns.
1135	///
1136	/// When enabled, whitespace will treated as insignifcant in the
1137	/// pattern and `#` can be used to start a comment until the next new
1138	/// line.
1139	///
1140	/// Normally, in most places in a pattern, whitespace is treated
1141	/// literally. For example ` +` will match one or more ASCII whitespace
1142	/// characters.
1143	///
1144	/// When verbose mode is enabled, `\#` can be used to match a literal
1145	/// `#` and `\ ` can be used to match a literal ASCII whitespace
1146	/// character.
1147	///
1148	/// Verbose mode is useful for permitting regexes to be formatted and
1149	/// broken up more nicely. This may make them more easily readable.
1150	///
1151	/// This setting can also be configured using the inline flag `x` in
1152	/// the pattern.
1153	///
1154	/// The default for this is `false`.
1155	///
1156	/// # Example
1157	///
1158	/// ```
1159	/// use regex::RegexSetBuilder;
1160	///
1161	/// let pat = r"
1162	/// \b
1163	/// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
1164	/// [\s--\n]+ # whitespace should separate names
1165	/// (?: # middle name can be an initial!
1166	/// (?:(?<initial>\p{Uppercase})\.\|(?<middle>\p{Uppercase}\w*))
1167	/// [\s--\n]+
1168	/// )?
1169	/// (?<last>\p{Uppercase}\w*)
1170	/// \b
1171	/// ";
1172	/// let re = RegexSetBuilder::new([pat])
1173	/// .ignore_whitespace(`true`)
1174	/// .build()
1175	/// .unwrap();
1176	/// assert!(re.is_match("Harry Potter"));
1177	/// assert!(re.is_match("Harry J. Potter"));
1178	/// assert!(re.is_match("Harry James Potter"));
1179	/// assert!(!re.is_match("harry J. Potter"));
1180	/// ```
1181	pub fn ignore_whitespace(
1182	&mut self,
1183	yes: bool,
1184	) -> &mut RegexSetBuilder {
1185	self.builder.ignore_whitespace(yes);
1186	self
1187	}
1188
1189	/// This configures octal mode for all of the patterns.
1190	///
1191	/// Octal syntax is a little-known way of uttering Unicode codepoints
1192	/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1193	/// equivalent patterns, where the last example shows octal syntax.
1194	///
1195	/// While supporting octal syntax isn't in and of itself a problem,
1196	/// it does make good error messages harder. That is, in PCRE based
1197	/// regex engines, syntax like `\1` invokes a backreference, which is
1198	/// explicitly unsupported this library. However, many users expect
1199	/// backreferences to be supported. Therefore, when octal support
1200	/// is disabled, the error message will explicitly mention that
1201	/// backreferences aren't supported.
1202	///
1203	/// The default for this is `false`.
1204	///
1205	/// # Example
1206	///
1207	/// ```
1208	/// use regex::RegexSetBuilder;
1209	///
1210	/// // Normally this pattern would not compile, with an error message
1211	/// // about backreferences not being supported. But with octal mode
1212	/// // enabled, octal escape sequences work.
1213	/// let re = RegexSetBuilder::new([r"\141"])
1214	/// .octal(`true`)
1215	/// .build()
1216	/// .unwrap();
1217	/// assert!(re.is_match("a"));
1218	/// ```
1219	pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
1220	self.builder.octal(yes);
1221	self
1222	}
1223
1224	/// Sets the approximate size limit, in bytes, of the compiled regex.
1225	///
1226	/// This roughly corresponds to the number of heap memory, in
1227	/// bytes, occupied by a single regex. If the regex would otherwise
1228	/// approximately exceed this limit, then compiling that regex will
1229	/// fail.
1230	///
1231	/// The main utility of a method like this is to avoid compiling
1232	/// regexes that use an unexpected amount of resources, such as
1233	/// time and memory. Even if the memory usage of a large regex is
1234	/// acceptable, its search time may not be. Namely, worst case time
1235	/// complexity for search is `O(m n)`, where `m ~ len(pattern)` and*
1236	/// `n ~ len(haystack)`. That is, search time depends, in part, on the
1237	/// size of the compiled regex. This means that putting a limit on the
1238	/// size of the regex limits how much a regex can impact search time.
1239	///
1240	/// For more information about regex size limits, see the section on
1241	/// [untrusted inputs](crate#untrusted-input) in the top-level crate
1242	/// documentation.
1243	///
1244	/// The default for this is some reasonable number that permits most
1245	/// patterns to compile successfully.
1246	///
1247	/// # Example
1248	///
1249	/// ```
1250	/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1251	/// use regex::RegexSetBuilder;
1252	///
1253	/// // It may surprise you how big some seemingly small patterns can
1254	/// // be! Since \w is Unicode aware, this generates a regex that can
1255	/// // match approximately 140,000 distinct codepoints.
1256	/// assert!(
1257	/// RegexSetBuilder::new([r"\w"])
1258	/// .size_limit(`45_000`)
1259	/// .build()
1260	/// .is_err()
1261	/// );
1262	/// ```
1263	pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
1264	self.builder.size_limit(bytes);
1265	self
1266	}
1267
1268	/// Set the approximate capacity, in bytes, of the cache of transitions
1269	/// used by the lazy DFA.
1270	///
1271	/// While the lazy DFA isn't always used, in tends to be the most
1272	/// commonly use regex engine in default configurations. It tends to
1273	/// adopt the performance profile of a fully build DFA, but without the
1274	/// downside of taking worst case exponential time to build.
1275	///
1276	/// The downside is that it needs to keep a cache of transitions and
1277	/// states that are built while running a search, and this cache
1278	/// can fill up. When it fills up, the cache will reset itself. Any
1279	/// previously generated states and transitions will then need to be
1280	/// re-generated. If this happens too many times, then this library
1281	/// will bail out of using the lazy DFA and switch to a different regex
1282	/// engine.
1283	///
1284	/// If your regex provokes this particular downside of the lazy DFA,
1285	/// then it may be beneficial to increase its cache capacity. This will
1286	/// potentially reduce the frequency of cache resetting (ideally to
1287	/// `0`). While it won't fix all potential performance problems with
1288	/// the lazy DFA, increasing the cache capacity does fix some.
1289	///
1290	/// There is no easy way to determine, a priori, whether increasing
1291	/// this cache capacity will help. In general, the larger your regex,
1292	/// the more cache it's likely to use. But that isn't an ironclad rule.
1293	/// For example, a regex like `[01]1[01]{N}` would normally produce a*
1294	/// fully build DFA that is exponential in size with respect to `N`.
1295	/// The lazy DFA will prevent exponential space blow-up, but it cache
1296	/// is likely to fill up, even when it's large and even for smallish
1297	/// values of `N`.
1298	///
1299	/// If you aren't sure whether this helps or not, it is sensible to
1300	/// set this to some arbitrarily large number in testing, such as
1301	/// `usize::MAX`. Namely, this represents the amount of capacity that
1302	/// may* be used. It's probably not a good idea to use `usize::MAX` in*
1303	/// production though, since it implies there are no controls on heap
1304	/// memory used by this library during a search. In effect, set it to
1305	/// whatever you're willing to allocate for a single regex search.
1306	pub fn dfa_size_limit(
1307	&mut self,
1308	bytes: usize,
1309	) -> &mut RegexSetBuilder {
1310	self.builder.dfa_size_limit(bytes);
1311	self
1312	}
1313
1314	/// Set the nesting limit for this parser.
1315	///
1316	/// The nesting limit controls how deep the abstract syntax tree is
1317	/// allowed to be. If the AST exceeds the given limit (e.g., with too
1318	/// many nested groups), then an error is returned by the parser.
1319	///
1320	/// The purpose of this limit is to act as a heuristic to prevent stack
1321	/// overflow for consumers that do structural induction on an AST using
1322	/// explicit recursion. While this crate never does this (instead using
1323	/// constant stack space and moving the call stack to the heap), other
1324	/// crates may.
1325	///
1326	/// This limit is not checked until the entire AST is parsed.
1327	/// Therefore, if callers want to put a limit on the amount of heap
1328	/// space used, then they should impose a limit on the length, in
1329	/// bytes, of the concrete pattern string. In particular, this is
1330	/// viable since this parser implementation will limit itself to heap
1331	/// space proportional to the length of the pattern string. See also
1332	/// the [untrusted inputs](crate#untrusted-input) section in the
1333	/// top-level crate documentation for more information about this.
1334	///
1335	/// Note that a nest limit of `0` will return a nest limit error for
1336	/// most patterns but not all. For example, a nest limit of `0` permits
1337	/// `a` but not `ab`, since `ab` requires an explicit concatenation,
1338	/// which results in a nest depth of `1`. In general, a nest limit is
1339	/// not something that manifests in an obvious way in the concrete
1340	/// syntax, therefore, it should not be used in a granular way.
1341	///
1342	/// # Example
1343	///
1344	/// ```
1345	/// use regex::RegexSetBuilder;
1346	///
1347	/// assert!(RegexSetBuilder::new([r"a"]).nest_limit(`0`).build().is_ok());
1348	/// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(`0`).build().is_err());
1349	/// ```
1350	pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
1351	self.builder.nest_limit(limit);
1352	self
1353	}
1354	}
1355	}
1356
1357	pub(crate) mod bytes {
1358	use crate::{
1359	bytes::{Regex, RegexSet},
1360	error::Error,
1361	};
1362
1363	use super::Builder;
1364
/// A configurable builder for a [`Regex`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
///
/// Within this module, [`Regex`] refers to the `bytes` flavor of `Regex`
/// (`crate::bytes::Regex`).
#[derive(Clone, Debug)]
pub struct RegexBuilder {
    // The shared internal builder that this type wraps; the methods on
    // `RegexBuilder` forward to it.
    builder: Builder,
}
1375
1376	impl RegexBuilder {
1377	/// Create a new builder with a default configuration for the given
1378	/// pattern.
1379	///
1380	/// If the pattern is invalid or exceeds the configured size limits,
1381	/// then an error will be returned when [`RegexBuilder::build`] is
1382	/// called.
1383	pub fn new(pattern: &str) -> RegexBuilder {
1384	RegexBuilder { builder: Builder::new([pattern]) }
1385	}
1386
1387	/// Compiles the pattern given to `RegexBuilder::new` with the
1388	/// configuration set on this builder.
1389	///
1390	/// If the pattern isn't a valid regex or if a configured size limit
1391	/// was exceeded, then an error is returned.
1392	pub fn build(&self) -> Result<Regex, Error> {
1393	self.builder.build_one_bytes()
1394	}
1395
1396	/// This configures Unicode mode for the entire pattern.
1397	///
1398	/// Enabling Unicode mode does a number of things:
1399	///
1400	/// Most fundamentally, it causes the fundamental atom of matching*
1401	/// to be a single codepoint. When Unicode mode is disabled, it's a
1402	/// single byte. For example, when Unicode mode is enabled, `.` will
1403	/// match `💩` once, where as it will match 4 times when Unicode mode
1404	/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
1405	/// Case insensitive matching uses Unicode simple case folding rules.*
1406	/// Unicode character classes like `\p{Letter}` and `\p{Greek}` are*
1407	/// available.
1408	/// Perl character classes are Unicode aware. That is, `\w`, `\s` and*
1409	/// `\d`.
1410	/// The word boundary assertions, `\b` and `\B`, use the Unicode*
1411	/// definition of a word character.
1412	///
1413	/// Note that unlike the top-level `Regex` for searching `&str`, it
1414	/// is permitted to disable Unicode mode even if the resulting pattern
1415	/// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
1416	/// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
1417	///
1418	/// For more details on the Unicode support in this crate, see the
1419	/// [Unicode section](crate#unicode) in this crate's top-level
1420	/// documentation.
1421	///
1422	/// The default for this is `true`.
1423	///
1424	/// # Example
1425	///
1426	/// ```
1427	/// use regex::bytes::RegexBuilder;
1428	///
1429	/// let re = RegexBuilder::new(r"\w")
1430	/// .unicode(`false`)
1431	/// .build()
1432	/// .unwrap();
1433	/// // Normally greek letters would be included in \w, but since
1434	/// // Unicode mode is disabled, it only matches ASCII letters.
1435	/// assert!(!re.is_match("δ".as_bytes()));
1436	///
1437	/// let re = RegexBuilder::new(r"s")
1438	/// .case_insensitive(`true`)
1439	/// .unicode(`false`)
1440	/// .build()
1441	/// .unwrap();
1442	/// // Normally 'ſ' is included when searching for 's' case
1443	/// // insensitively due to Unicode's simple case folding rules. But
1444	/// // when Unicode mode is disabled, only ASCII case insensitive rules
1445	/// // are used.
1446	/// assert!(!re.is_match("ſ".as_bytes()));
1447	/// ```
1448	///
1449	/// Since this builder is for constructing a [`bytes::Regex`](Regex),
1450	/// one can disable Unicode mode even if it would match invalid UTF-8:
1451	///
1452	/// ```
1453	/// use regex::bytes::RegexBuilder;
1454	///
1455	/// let re = RegexBuilder::new(r".")
1456	/// .unicode(`false`)
1457	/// .build()
1458	/// .unwrap();
1459	/// // Normally greek letters would be included in \w, but since
1460	/// // Unicode mode is disabled, it only matches ASCII letters.
1461	/// assert!(re.is_match(b"`\xFF`"));
1462	/// ```
1463	pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
1464	self.builder.unicode(yes);
1465	self
1466	}
1467
1468	/// This configures whether to enable case insensitive matching for the
1469	/// entire pattern.
1470	///
1471	/// This setting can also be configured using the inline flag `i`
1472	/// in the pattern. For example, `(?i:foo)` matches `foo` case
1473	/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
1474	///
1475	/// The default for this is `false`.
1476	///
1477	/// # Example
1478	///
1479	/// ```
1480	/// use regex::bytes::RegexBuilder;
1481	///
1482	/// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
1483	/// .case_insensitive(`true`)
1484	/// .build()
1485	/// .unwrap();
1486	/// assert!(re.is_match(b"FoObarQuUx"));
1487	/// // Even though case insensitive matching is enabled in the builder,
1488	/// // it can be locally disabled within the pattern. In this case,
1489	/// // `bar` is matched case sensitively.
1490	/// assert!(!re.is_match(b"fooBARquux"));
1491	/// ```
1492	pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
1493	self.builder.case_insensitive(yes);
1494	self
1495	}
1496
1497	/// This configures multi-line mode for the entire pattern.
1498	///
1499	/// Enabling multi-line mode changes the behavior of the `^` and `$`
1500	/// anchor assertions. Instead of only matching at the beginning and
1501	/// end of a haystack, respectively, multi-line mode causes them to
1502	/// match at the beginning and end of a line in addition* to the*
1503	/// beginning and end of a haystack. More precisely, `^` will match at
1504	/// the position immediately following a `\n` and `$` will match at the
1505	/// position immediately preceding a `\n`.
1506	///
1507	/// The behavior of this option can be impacted by other settings too:
1508	///
1509	/// The* [`RegexBuilder::line_terminator`] option changes `\n` above
1510	/// to any ASCII byte.
1511	/// The* [`RegexBuilder::crlf`] option changes the line terminator to
1512	/// be either `\r` or `\n`, but never at the position between a `\r`
1513	/// and `\n`.
1514	///
1515	/// This setting can also be configured using the inline flag `m` in
1516	/// the pattern.
1517	///
1518	/// The default for this is `false`.
1519	///
1520	/// # Example
1521	///
1522	/// ```
1523	/// use regex::bytes::RegexBuilder;
1524	///
1525	/// let re = RegexBuilder::new(r"^foo$")
1526	/// .multi_line(`true`)
1527	/// .build()
1528	/// .unwrap();
1529	/// assert_eq!(Some(`1`..`4`), re.find(b"`\n`foo`\n`").map(\|m\| m.range()));
1530	/// ```
1531	pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
1532	self.builder.multi_line(yes);
1533	self
1534	}
1535
1536	/// This configures dot-matches-new-line mode for the entire pattern.
1537	///
1538	/// Perhaps surprisingly, the default behavior for `.` is not to match
1539	/// any character, but rather, to match any character except for the
1540	/// line terminator (which is `\n` by default). When this mode is
1541	/// enabled, the behavior changes such that `.` truly matches any
1542	/// character.
1543	///
1544	/// This setting can also be configured using the inline flag `s` in
1545	/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
1546	/// regexes.
1547	///
1548	/// The default for this is `false`.
1549	///
1550	/// # Example
1551	///
1552	/// ```
1553	/// use regex::bytes::RegexBuilder;
1554	///
1555	/// let re = RegexBuilder::new(r"foo.bar")
1556	/// .dot_matches_new_line(`true`)
1557	/// .build()
1558	/// .unwrap();
1559	/// let hay = b"foo`\n`bar";
1560	/// assert_eq!(Some(&b"foo`\n`bar"[..]), re.find(hay).map(\|m\| m.as_bytes()));
1561	/// ```
1562	pub fn dot_matches_new_line(
1563	&mut self,
1564	yes: bool,
1565	) -> &mut RegexBuilder {
1566	self.builder.dot_matches_new_line(yes);
1567	self
1568	}
1569
1570	/// This configures CRLF mode for the entire pattern.
1571	///
1572	/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
1573	/// short) and `\n` ("line feed" or LF for short) are treated as line
1574	/// terminators. This results in the following:
1575	///
1576	/// Unless dot-matches-new-line mode is enabled, `.` will now match*
1577	/// any character except for `\n` and `\r`.
1578	/// When multi-line mode is enabled, `^` will match immediately*
1579	/// following a `\n` or a `\r`. Similarly, `$` will match immediately
1580	/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
1581	/// between `\r` and `\n`.
1582	///
1583	/// This setting can also be configured using the inline flag `R` in
1584	/// the pattern.
1585	///
1586	/// The default for this is `false`.
1587	///
1588	/// # Example
1589	///
1590	/// ```
1591	/// use regex::bytes::RegexBuilder;
1592	///
1593	/// let re = RegexBuilder::new(r"^foo$")
1594	/// .multi_line(`true`)
1595	/// .crlf(`true`)
1596	/// .build()
1597	/// .unwrap();
1598	/// let hay = b"`\r\n`foo`\r\n`";
1599	/// // If CRLF mode weren't enabled here, then '$' wouldn't match
1600	/// // immediately after 'foo', and thus no match would be found.
1601	/// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(\|m\| m.as_bytes()));
1602	/// ```
1603	///
1604	/// This example demonstrates that `^` will never match at a position
1605	/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
1606	/// and a `\n`.)
1607	///
1608	/// ```
1609	/// use regex::bytes::RegexBuilder;
1610	///
1611	/// let re = RegexBuilder::new(r"^")
1612	/// .multi_line(`true`)
1613	/// .crlf(`true`)
1614	/// .build()
1615	/// .unwrap();
1616	/// let hay = b"`\r\n\r\n`";
1617	/// let ranges: Vec<_> = re.find_iter(hay).map(\|m\| m.range()).collect();
1618	/// assert_eq!(ranges, vec![`0`..`0`, `2`..`2`, `4`..`4`]);
1619	/// ```
1620	pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder {
1621	self.builder.crlf(yes);
1622	self
1623	}
1624
1625	/// Configures the line terminator to be used by the regex.
1626	///
1627	/// The line terminator is relevant in two ways for a particular regex:
1628	///
1629	/// * When dot-matches-new-line mode is not enabled (the default),
1630	/// then `.` will match any character except for the configured line
1631	/// terminator.
1632	/// * When multi-line mode is enabled (not the default), then `^` and
1633	/// `$` will match immediately after and before, respectively, a line
1634	/// terminator.
1635	///
1636	/// In both cases, if CRLF mode is enabled in a particular context,
1637	/// then it takes precedence over any configured line terminator.
1638	///
1639	/// This option cannot be configured from within the pattern.
1640	///
1641	/// The default line terminator is `\n`.
1642	///
1643	/// # Example
1644	///
1645	/// This shows how to treat the NUL byte as a line terminator. This can
1646	/// be a useful heuristic when searching binary data.
1647	///
1648	/// ```
1649	/// use regex::bytes::RegexBuilder;
1650	///
1651	/// let re = RegexBuilder::new(r"^foo$")
1652	///     .multi_line(true)
1653	///     .line_terminator(b'\x00')
1654	///     .build()
1655	///     .unwrap();
1656	/// let hay = b"\x00foo\x00";
1657	/// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range()));
1658	/// ```
1659	///
1660	/// This example shows that the behavior of `.` is impacted by this
1661	/// setting as well:
1662	///
1663	/// ```
1664	/// use regex::bytes::RegexBuilder;
1665	///
1666	/// let re = RegexBuilder::new(r".")
1667	///     .line_terminator(b'\x00')
1668	///     .build()
1669	///     .unwrap();
1670	/// assert!(re.is_match(b"\n"));
1671	/// assert!(!re.is_match(b"\x00"));
1672	/// ```
1673	///
1674	/// This shows that building a regex will work even when the byte
1675	/// given is not ASCII. This is unlike the top-level `Regex` API where
1676	/// matching invalid UTF-8 is not allowed.
1677	///
1678	/// Note though that you must disable Unicode mode. This is required
1679	/// because Unicode mode requires matching one codepoint at a time,
1680	/// and there is no way to match a non-ASCII byte as if it were a
1681	/// codepoint.
1682	///
1683	/// ```
1684	/// use regex::bytes::RegexBuilder;
1685	///
1686	/// assert!(
1687	///     RegexBuilder::new(r".")
1688	///         .unicode(false)
1689	///         .line_terminator(0x80)
1690	///         .build()
1691	///         .is_ok(),
1692	/// );
1693	/// ```
1694	pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder {
1695	self.builder.line_terminator(byte);
1696	self
1697	}
1698
1699	/// This configures swap-greed mode for the entire pattern.
1700	///
1701	/// When swap-greed mode is enabled, patterns like `a+` will become
1702	/// non-greedy and patterns like `a+?` will become greedy. In other
1703	/// words, the meanings of `a+` and `a+?` are switched.
1704	///
1705	/// This setting can also be configured using the inline flag `U` in
1706	/// the pattern.
1707	///
1708	/// The default for this is `false`.
1709	///
1710	/// # Example
1711	///
1712	/// ```
1713	/// use regex::bytes::RegexBuilder;
1714	///
1715	/// let re = RegexBuilder::new(r"a+")
1716	///     .swap_greed(true)
1717	///     .build()
1718	///     .unwrap();
1719	/// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes()));
1720	/// ```
1721	pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
1722	self.builder.swap_greed(yes);
1723	self
1724	}
1725
1726	/// This configures verbose mode for the entire pattern.
1727	///
1728	/// When enabled, whitespace will be treated as insignificant in the
1729	/// pattern and `#` can be used to start a comment until the next new
1730	/// line.
1731	///
1732	/// Normally, in most places in a pattern, whitespace is treated
1733	/// literally. For example ` +` will match one or more ASCII whitespace
1734	/// characters.
1735	///
1736	/// When verbose mode is enabled, `\#` can be used to match a literal
1737	/// `#` and `\ ` can be used to match a literal ASCII whitespace
1738	/// character.
1739	///
1740	/// Verbose mode is useful for permitting regexes to be formatted and
1741	/// broken up more nicely. This may make them more easily readable.
1742	///
1743	/// This setting can also be configured using the inline flag `x` in
1744	/// the pattern.
1745	///
1746	/// The default for this is `false`.
1747	///
1748	/// # Example
1749	///
1750	/// ```
1751	/// use regex::bytes::RegexBuilder;
1752	///
1753	/// let pat = r"
1754	///     \b
1755	///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
1756	///     [\s--\n]+                   # whitespace should separate names
1757	///     (?:                         # middle name can be an initial!
1758	///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
1759	///         [\s--\n]+
1760	///     )?
1761	///     (?<last>\p{Uppercase}\w*)
1762	///     \b
1763	/// ";
1764	/// let re = RegexBuilder::new(pat)
1765	///     .ignore_whitespace(true)
1766	///     .build()
1767	///     .unwrap();
1768	///
1769	/// let caps = re.captures(b"Harry Potter").unwrap();
1770	/// assert_eq!(&b"Harry"[..], &caps["first"]);
1771	/// assert_eq!(&b"Potter"[..], &caps["last"]);
1772	///
1773	/// let caps = re.captures(b"Harry J. Potter").unwrap();
1774	/// assert_eq!(&b"Harry"[..], &caps["first"]);
1775	/// // Since a middle name/initial isn't required for an overall match,
1776	/// // we can't assume that 'initial' or 'middle' will be populated!
1777	/// assert_eq!(
1778	///     Some(&b"J"[..]),
1779	///     caps.name("initial").map(|m| m.as_bytes()),
1780	/// );
1781	/// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes()));
1782	/// assert_eq!(&b"Potter"[..], &caps["last"]);
1783	///
1784	/// let caps = re.captures(b"Harry James Potter").unwrap();
1785	/// assert_eq!(&b"Harry"[..], &caps["first"]);
1786	/// // Since a middle name/initial isn't required for an overall match,
1787	/// // we can't assume that 'initial' or 'middle' will be populated!
1788	/// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes()));
1789	/// assert_eq!(
1790	///     Some(&b"James"[..]),
1791	///     caps.name("middle").map(|m| m.as_bytes()),
1792	/// );
1793	/// assert_eq!(&b"Potter"[..], &caps["last"]);
1794	/// ```
1795	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
1796	self.builder.ignore_whitespace(yes);
1797	self
1798	}
1799
1800	/// This configures octal mode for the entire pattern.
1801	///
1802	/// Octal syntax is a little-known way of uttering Unicode codepoints
1803	/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
1804	/// equivalent patterns, where the last example shows octal syntax.
1805	///
1806	/// While supporting octal syntax isn't in and of itself a problem,
1807	/// it does make good error messages harder. That is, in PCRE based
1808	/// regex engines, syntax like `\1` invokes a backreference, which is
1809	/// explicitly unsupported by this library. However, many users expect
1810	/// backreferences to be supported. Therefore, when octal support
1811	/// is disabled, the error message will explicitly mention that
1812	/// backreferences aren't supported.
1813	///
1814	/// The default for this is `false`.
1815	///
1816	/// # Example
1817	///
1818	/// ```
1819	/// use regex::bytes::RegexBuilder;
1820	///
1821	/// // Normally this pattern would not compile, with an error message
1822	/// // about backreferences not being supported. But with octal mode
1823	/// // enabled, octal escape sequences work.
1824	/// let re = RegexBuilder::new(r"\141")
1825	///     .octal(true)
1826	///     .build()
1827	///     .unwrap();
1828	/// assert!(re.is_match(b"a"));
1829	/// ```
1830	pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
1831	self.builder.octal(yes);
1832	self
1833	}
1834
1835	/// Sets the approximate size limit, in bytes, of the compiled regex.
1836	///
1837	/// This roughly corresponds to the amount of heap memory, in
1838	/// bytes, occupied by a single regex. If the regex would otherwise
1839	/// approximately exceed this limit, then compiling that regex will
1840	/// fail.
1841	///
1842	/// The main utility of a method like this is to avoid compiling
1843	/// regexes that use an unexpected amount of resources, such as
1844	/// time and memory. Even if the memory usage of a large regex is
1845	/// acceptable, its search time may not be. Namely, worst case time
1846	/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
1847	/// `n ~ len(haystack)`. That is, search time depends, in part, on the
1848	/// size of the compiled regex. This means that putting a limit on the
1849	/// size of the regex limits how much a regex can impact search time.
1850	///
1851	/// For more information about regex size limits, see the section on
1852	/// [untrusted inputs](crate#untrusted-input) in the top-level crate
1853	/// documentation.
1854	///
1855	/// The default for this is some reasonable number that permits most
1856	/// patterns to compile successfully.
1857	///
1858	/// # Example
1859	///
1860	/// ```
1861	/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
1862	/// use regex::bytes::RegexBuilder;
1863	///
1864	/// // It may surprise you how big some seemingly small patterns can
1865	/// // be! Since \w is Unicode aware, this generates a regex that can
1866	/// // match approximately 140,000 distinct codepoints.
1867	/// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err());
1868	/// ```
1869	pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1870	self.builder.size_limit(bytes);
1871	self
1872	}
1873
1874	/// Set the approximate capacity, in bytes, of the cache of transitions
1875	/// used by the lazy DFA.
1876	///
1877	/// While the lazy DFA isn't always used, it tends to be the most
1878	/// commonly used regex engine in default configurations. It tends to
1879	/// adopt the performance profile of a fully built DFA, but without the
1880	/// downside of taking worst case exponential time to build.
1881	///
1882	/// The downside is that it needs to keep a cache of transitions and
1883	/// states that are built while running a search, and this cache
1884	/// can fill up. When it fills up, the cache will reset itself. Any
1885	/// previously generated states and transitions will then need to be
1886	/// re-generated. If this happens too many times, then this library
1887	/// will bail out of using the lazy DFA and switch to a different regex
1888	/// engine.
1889	///
1890	/// If your regex provokes this particular downside of the lazy DFA,
1891	/// then it may be beneficial to increase its cache capacity. This will
1892	/// potentially reduce the frequency of cache resetting (ideally to
1893	/// `0`). While it won't fix all potential performance problems with
1894	/// the lazy DFA, increasing the cache capacity does fix some.
1895	///
1896	/// There is no easy way to determine, a priori, whether increasing
1897	/// this cache capacity will help. In general, the larger your regex,
1898	/// the more cache it's likely to use. But that isn't an ironclad rule.
1899	/// For example, a regex like `[01]*1[01]{N}` would normally produce a
1900	/// fully built DFA that is exponential in size with respect to `N`.
1901	/// The lazy DFA will prevent exponential space blow-up, but its cache
1902	/// is likely to fill up, even when it's large and even for smallish
1903	/// values of `N`.
1904	///
1905	/// If you aren't sure whether this helps or not, it is sensible to
1906	/// set this to some arbitrarily large number in testing, such as
1907	/// `usize::MAX`. Namely, this represents the amount of capacity that
1908	/// *may* be used. It's probably not a good idea to use `usize::MAX` in
1909	/// production though, since it implies there are no controls on heap
1910	/// memory used by this library during a search. In effect, set it to
1911	/// whatever you're willing to allocate for a single regex search.
1912	pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder {
1913	self.builder.dfa_size_limit(bytes);
1914	self
1915	}
1916
1917	/// Set the nesting limit for this parser.
1918	///
1919	/// The nesting limit controls how deep the abstract syntax tree is
1920	/// allowed to be. If the AST exceeds the given limit (e.g., with too
1921	/// many nested groups), then an error is returned by the parser.
1922	///
1923	/// The purpose of this limit is to act as a heuristic to prevent stack
1924	/// overflow for consumers that do structural induction on an AST using
1925	/// explicit recursion. While this crate never does this (instead using
1926	/// constant stack space and moving the call stack to the heap), other
1927	/// crates may.
1928	///
1929	/// This limit is not checked until the entire AST is parsed.
1930	/// Therefore, if callers want to put a limit on the amount of heap
1931	/// space used, then they should impose a limit on the length, in
1932	/// bytes, of the concrete pattern string. In particular, this is
1933	/// viable since this parser implementation will limit itself to heap
1934	/// space proportional to the length of the pattern string. See also
1935	/// the [untrusted inputs](crate#untrusted-input) section in the
1936	/// top-level crate documentation for more information about this.
1937	///
1938	/// Note that a nest limit of `0` will return a nest limit error for
1939	/// most patterns but not all. For example, a nest limit of `0` permits
1940	/// `a` but not `ab`, since `ab` requires an explicit concatenation,
1941	/// which results in a nest depth of `1`. In general, a nest limit is
1942	/// not something that manifests in an obvious way in the concrete
1943	/// syntax, therefore, it should not be used in a granular way.
1944	///
1945	/// # Example
1946	///
1947	/// ```
1948	/// use regex::bytes::RegexBuilder;
1949	///
1950	/// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
1951	/// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
1952	/// ```
1953	pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
1954	self.builder.nest_limit(limit);
1955	self
1956	}
1957	}
1958
1959	/// A configurable builder for a [`RegexSet`].
1960	///
1961	/// This builder can be used to programmatically set flags such as `i`
1962	/// (for case insensitive matching) and `x` (for verbose mode). It can
1963	/// also be used to configure things like the line terminator and a size
1964	/// limit on the compiled regular expression.
1965	#[derive(Clone, Debug)]
1966	pub struct RegexSetBuilder {
1967	builder: Builder,
1968	}
1969
1970	impl RegexSetBuilder {
1971	/// Creates a new builder with a default configuration for the given
1972	/// patterns, which may be any iterable of `AsRef<str>` items.
1973	///
1974	/// If the patterns are invalid or exceed the configured size limits,
1975	/// then an error will be returned when [`RegexSetBuilder::build`] is
1976	/// called.
1977	pub fn new<I, S>(patterns: I) -> RegexSetBuilder
1978	where
1979	I: IntoIterator<Item = S>,
1980	S: AsRef<str>,
1981	{
1982	RegexSetBuilder { builder: Builder::new(patterns) }
1983	}
1984
1985	/// Compiles the patterns given to [`RegexSetBuilder::new`] with the
1986	/// configuration set on this builder.
1987	///
1988	/// If the patterns aren't valid regexes or if a configured size limit
1989	/// was exceeded, then an error is returned.
1990	pub fn build(&self) -> Result<RegexSet, Error> {
1991	self.builder.build_many_bytes()
1992	}
1993
1994	/// This configures Unicode mode for all of the patterns.
1995	///
1996	/// Enabling Unicode mode does a number of things:
1997	///
1998	/// * Most fundamentally, it causes the fundamental atom of matching
1999	/// to be a single codepoint. When Unicode mode is disabled, it's a
2000	/// single byte. For example, when Unicode mode is enabled, `.` will
2001	/// match `💩` once, whereas it will match 4 times when Unicode mode
2002	/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
2003	/// * Case insensitive matching uses Unicode simple case folding rules.
2004	/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
2005	/// available.
2006	/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
2007	/// `\d`.
2008	/// * The word boundary assertions, `\b` and `\B`, use the Unicode
2009	/// definition of a word character.
2010	///
2011	/// Note that unlike the top-level `RegexSet` for searching `&str`,
2012	/// it is permitted to disable Unicode mode even if the resulting
2013	/// pattern could match invalid UTF-8. For example, `(?-u:.)` is not
2014	/// a valid pattern for a top-level `RegexSet`, but is valid for a
2015	/// `bytes::RegexSet`.
2016	///
2017	/// For more details on the Unicode support in this crate, see the
2018	/// [Unicode section](crate#unicode) in this crate's top-level
2019	/// documentation.
2020	///
2021	/// The default for this is `true`.
2022	///
2023	/// # Example
2024	///
2025	/// ```
2026	/// use regex::bytes::RegexSetBuilder;
2027	///
2028	/// let re = RegexSetBuilder::new([r"\w"])
2029	///     .unicode(false)
2030	///     .build()
2031	///     .unwrap();
2032	/// // Normally greek letters would be included in \w, but since
2033	/// // Unicode mode is disabled, it only matches ASCII letters.
2034	/// assert!(!re.is_match("δ".as_bytes()));
2035	///
2036	/// let re = RegexSetBuilder::new([r"s"])
2037	///     .case_insensitive(true)
2038	///     .unicode(false)
2039	///     .build()
2040	///     .unwrap();
2041	/// // Normally 'ſ' is included when searching for 's' case
2042	/// // insensitively due to Unicode's simple case folding rules. But
2043	/// // when Unicode mode is disabled, only ASCII case insensitive rules
2044	/// // are used.
2045	/// assert!(!re.is_match("ſ".as_bytes()));
2046	/// ```
2047	///
2048	/// Since this builder is for constructing a
2049	/// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if
2050	/// it would match invalid UTF-8:
2051	///
2052	/// ```
2053	/// use regex::bytes::RegexSetBuilder;
2054	///
2055	/// let re = RegexSetBuilder::new([r"."])
2056	///     .unicode(false)
2057	///     .build()
2058	///     .unwrap();
2059	/// // Since Unicode mode is disabled, `.` matches one byte at a time,
2060	/// // so it can match a byte that is not valid UTF-8 on its own.
2061	/// assert!(re.is_match(b"\xFF"));
2062	/// ```
2063	pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
2064	self.builder.unicode(yes);
2065	self
2066	}
2067
2068	/// This configures whether to enable case insensitive matching for all
2069	/// of the patterns.
2070	///
2071	/// This setting can also be configured using the inline flag `i`
2072	/// in the pattern. For example, `(?i:foo)` matches `foo` case
2073	/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
2074	///
2075	/// The default for this is `false`.
2076	///
2077	/// # Example
2078	///
2079	/// ```
2080	/// use regex::bytes::RegexSetBuilder;
2081	///
2082	/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
2083	///     .case_insensitive(true)
2084	///     .build()
2085	///     .unwrap();
2086	/// assert!(re.is_match(b"FoObarQuUx"));
2087	/// // Even though case insensitive matching is enabled in the builder,
2088	/// // it can be locally disabled within the pattern. In this case,
2089	/// // `bar` is matched case sensitively.
2090	/// assert!(!re.is_match(b"fooBARquux"));
2091	/// ```
2092	pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
2093	self.builder.case_insensitive(yes);
2094	self
2095	}
2096
2097	/// This configures multi-line mode for all of the patterns.
2098	///
2099	/// Enabling multi-line mode changes the behavior of the `^` and `$`
2100	/// anchor assertions. Instead of only matching at the beginning and
2101	/// end of a haystack, respectively, multi-line mode causes them to
2102	/// match at the beginning and end of a line *in addition* to the
2103	/// beginning and end of a haystack. More precisely, `^` will match at
2104	/// the position immediately following a `\n` and `$` will match at the
2105	/// position immediately preceding a `\n`.
2106	///
2107	/// The behavior of this option can be impacted by other settings too:
2108	///
2109	/// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
2110	/// above to any ASCII byte.
2111	/// * The [`RegexSetBuilder::crlf`] option changes the line terminator
2112	/// to be either `\r` or `\n`, but never at the position between a `\r`
2113	/// and `\n`.
2114	///
2115	/// This setting can also be configured using the inline flag `m` in
2116	/// the pattern.
2117	///
2118	/// The default for this is `false`.
2119	///
2120	/// # Example
2121	///
2122	/// ```
2123	/// use regex::bytes::RegexSetBuilder;
2124	///
2125	/// let re = RegexSetBuilder::new([r"^foo$"])
2126	///     .multi_line(true)
2127	///     .build()
2128	///     .unwrap();
2129	/// assert!(re.is_match(b"\nfoo\n"));
2130	/// ```
2131	pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
2132	self.builder.multi_line(yes);
2133	self
2134	}
2135
2136	/// This configures dot-matches-new-line mode for all of the patterns.
2137	///
2138	/// Perhaps surprisingly, the default behavior for `.` is not to match
2139	/// any character, but rather, to match any character except for the
2140	/// line terminator (which is `\n` by default). When this mode is
2141	/// enabled, the behavior changes such that `.` truly matches any
2142	/// character.
2143	///
2144	/// This setting can also be configured using the inline flag `s` in
2145	/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
2146	/// regexes.
2147	///
2148	/// The default for this is `false`.
2149	///
2150	/// # Example
2151	///
2152	/// ```
2153	/// use regex::bytes::RegexSetBuilder;
2154	///
2155	/// let re = RegexSetBuilder::new([r"foo.bar"])
2156	///     .dot_matches_new_line(true)
2157	///     .build()
2158	///     .unwrap();
2159	/// let hay = b"foo\nbar";
2160	/// assert!(re.is_match(hay));
2161	/// ```
2162	pub fn dot_matches_new_line(
2163	&mut self,
2164	yes: bool,
2165	) -> &mut RegexSetBuilder {
2166	self.builder.dot_matches_new_line(yes);
2167	self
2168	}
2169
2170	/// This configures CRLF mode for all of the patterns.
2171	///
2172	/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
2173	/// short) and `\n` ("line feed" or LF for short) are treated as line
2174	/// terminators. This results in the following:
2175	///
2176	/// * Unless dot-matches-new-line mode is enabled, `.` will now match
2177	/// any character except for `\n` and `\r`.
2178	/// * When multi-line mode is enabled, `^` will match immediately
2179	/// following a `\n` or a `\r`. Similarly, `$` will match immediately
2180	/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
2181	/// between `\r` and `\n`.
2182	///
2183	/// This setting can also be configured using the inline flag `R` in
2184	/// the pattern.
2185	///
2186	/// The default for this is `false`.
2187	///
2188	/// # Example
2189	///
2190	/// ```
2191	/// use regex::bytes::RegexSetBuilder;
2192	///
2193	/// let re = RegexSetBuilder::new([r"^foo$"])
2194	///     .multi_line(true)
2195	///     .crlf(true)
2196	///     .build()
2197	///     .unwrap();
2198	/// let hay = b"\r\nfoo\r\n";
2199	/// // If CRLF mode weren't enabled here, then '$' wouldn't match
2200	/// // immediately after 'foo', and thus no match would be found.
2201	/// assert!(re.is_match(hay));
2202	/// ```
2203	///
2204	/// This example demonstrates that `^` will never match at a position
2205	/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
2206	/// and a `\n`.)
2207	///
2208	/// ```
2209	/// use regex::bytes::RegexSetBuilder;
2210	///
2211	/// let re = RegexSetBuilder::new([r"^\n"])
2212	///     .multi_line(true)
2213	///     .crlf(true)
2214	///     .build()
2215	///     .unwrap();
2216	/// assert!(!re.is_match(b"\r\n"));
2217	/// ```
2218	pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
2219	self.builder.crlf(yes);
2220	self
2221	}
2222
2223	/// Configures the line terminator to be used by the regex.
2224	///
2225	/// The line terminator is relevant in two ways for a particular regex:
2226	///
2227	/// * When dot-matches-new-line mode is not enabled (the default),
2228	/// then `.` will match any character except for the configured line
2229	/// terminator.
2230	/// * When multi-line mode is enabled (not the default), then `^` and
2231	/// `$` will match immediately after and before, respectively, a line
2232	/// terminator.
2233	///
2234	/// In both cases, if CRLF mode is enabled in a particular context,
2235	/// then it takes precedence over any configured line terminator.
2236	///
2237	/// This option cannot be configured from within the pattern.
2238	///
2239	/// The default line terminator is `\n`.
2240	///
2241	/// # Example
2242	///
2243	/// This shows how to treat the NUL byte as a line terminator. This can
2244	/// be a useful heuristic when searching binary data.
2245	///
2246	/// ```
2247	/// use regex::bytes::RegexSetBuilder;
2248	///
2249	/// let re = RegexSetBuilder::new([r"^foo$"])
2250	///     .multi_line(true)
2251	///     .line_terminator(b'\x00')
2252	///     .build()
2253	///     .unwrap();
2254	/// let hay = b"\x00foo\x00";
2255	/// assert!(re.is_match(hay));
2256	/// ```
2257	///
2258	/// This example shows that the behavior of `.` is impacted by this
2259	/// setting as well:
2260	///
2261	/// ```
2262	/// use regex::bytes::RegexSetBuilder;
2263	///
2264	/// let re = RegexSetBuilder::new([r"."])
2265	///     .line_terminator(b'\x00')
2266	///     .build()
2267	///     .unwrap();
2268	/// assert!(re.is_match(b"\n"));
2269	/// assert!(!re.is_match(b"\x00"));
2270	/// ```
2271	///
2272	/// This shows that building a regex will work even when the byte given
2273	/// is not ASCII. This is unlike the top-level `RegexSet` API where
2274	/// matching invalid UTF-8 is not allowed.
2275	///
2276	/// Note though that you must disable Unicode mode. This is required
2277	/// because Unicode mode requires matching one codepoint at a time,
2278	/// and there is no way to match a non-ASCII byte as if it were a
2279	/// codepoint.
2280	///
2281	/// ```
2282	/// use regex::bytes::RegexSetBuilder;
2283	///
2284	/// assert!(
2285	///     RegexSetBuilder::new([r"."])
2286	///         .unicode(false)
2287	///         .line_terminator(0x80)
2288	///         .build()
2289	///         .is_ok(),
2290	/// );
2291	/// ```
2292	pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
2293	self.builder.line_terminator(byte);
2294	self
2295	}
2296
2297	/// This configures swap-greed mode for all of the patterns.
2298	///
2299	/// When swap-greed mode is enabled, patterns like `a+` will become
2300	/// non-greedy and patterns like `a+?` will become greedy. In other
2301	/// words, the meanings of `a+` and `a+?` are switched.
2302	///
2303	/// This setting can also be configured using the inline flag `U` in
2304	/// the pattern.
2305	///
2306	/// Note that this is generally not useful for a `RegexSet`, since a
2307	/// `RegexSet` can only report whether a pattern matches or not. Since
2308	/// greediness never impacts whether a match is found (only the
2309	/// offsets of the match), it follows that whether parts of a pattern
2310	/// are greedy or not doesn't matter for a `RegexSet`.
2311	///
2312	/// The default for this is `false`.
2313	pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
2314	self.builder.swap_greed(yes);
2315	self
2316	}
2317
2318	/// This configures verbose mode for all of the patterns.
2319	///
2320	/// When enabled, whitespace will be treated as insignificant in the
2321	/// pattern and `#` can be used to start a comment until the next new
2322	/// line.
2323	///
2324	/// Normally, in most places in a pattern, whitespace is treated
2325	/// literally. For example ` +` will match one or more ASCII whitespace
2326	/// characters.
2327	///
2328	/// When verbose mode is enabled, `\#` can be used to match a literal
2329	/// `#` and `\ ` can be used to match a literal ASCII whitespace
2330	/// character.
2331	///
2332	/// Verbose mode is useful for permitting regexes to be formatted and
2333	/// broken up more nicely. This may make them more easily readable.
2334	///
2335	/// This setting can also be configured using the inline flag `x` in
2336	/// the pattern.
2337	///
2338	/// The default for this is `false`.
2339	///
2340	/// # Example
2341	///
2342	/// ```
2343	/// use regex::bytes::RegexSetBuilder;
2344	///
2345	/// let pat = r"
2346	///     \b
2347	///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
2348	///     [\s--\n]+                   # whitespace should separate names
2349	///     (?:                         # middle name can be an initial!
2350	///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
2351	///         [\s--\n]+
2352	///     )?
2353	///     (?<last>\p{Uppercase}\w*)
2354	///     \b
2355	/// ";
2356	/// let re = RegexSetBuilder::new([pat])
2357	///     .ignore_whitespace(true)
2358	///     .build()
2359	///     .unwrap();
2360	/// assert!(re.is_match(b"Harry Potter"));
2361	/// assert!(re.is_match(b"Harry J. Potter"));
2362	/// assert!(re.is_match(b"Harry James Potter"));
2363	/// assert!(!re.is_match(b"harry J. Potter"));
2364	/// ```
2365	pub fn ignore_whitespace(
2366	&mut self,
2367	yes: bool,
2368	) -> &mut RegexSetBuilder {
2369	self.builder.ignore_whitespace(yes);
2370	self
2371	}
2372
2373	/// This configures octal mode for all of the patterns.
2374	///
2375	/// Octal syntax is a little-known way of uttering Unicode codepoints
2376	/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
2377	/// equivalent patterns, where the last example shows octal syntax.
2378	///
2379	/// While supporting octal syntax isn't in and of itself a problem,
2380	/// it does make good error messages harder. That is, in PCRE based
2381	/// regex engines, syntax like `\1` invokes a backreference, which is
2382	/// explicitly unsupported by this library. However, many users expect
2383	/// backreferences to be supported. Therefore, when octal support
2384	/// is disabled, the error message will explicitly mention that
2385	/// backreferences aren't supported.
2386	///
2387	/// The default for this is `false`.
2388	///
2389	/// # Example
2390	///
2391	/// ```
2392	/// use regex::bytes::RegexSetBuilder;
2393	///
2394	/// // Normally this pattern would not compile, with an error message
2395	/// // about backreferences not being supported. But with octal mode
2396	/// // enabled, octal escape sequences work.
2397	/// let re = RegexSetBuilder::new([r"\141"])
2398	///     .octal(true)
2399	///     .build()
2400	///     .unwrap();
2401	/// assert!(re.is_match(b"a"));
2402	/// ```
2403	pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
2404	self.builder.octal(yes);
2405	self
2406	}
2407
2408	/// Sets the approximate size limit, in bytes, of the compiled regex.
2409	///
2410	/// This roughly corresponds to the amount of heap memory, in
2411	/// bytes, occupied by a single regex. If the regex would otherwise
2412	/// approximately exceed this limit, then compiling that regex will
2413	/// fail.
2414	///
2415	/// The main utility of a method like this is to avoid compiling
2416	/// regexes that use an unexpected amount of resources, such as
2417	/// time and memory. Even if the memory usage of a large regex is
2418	/// acceptable, its search time may not be. Namely, worst case time
2419	/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
2420	/// `n ~ len(haystack)`. That is, search time depends, in part, on the
2421	/// size of the compiled regex. This means that putting a limit on the
2422	/// size of the regex limits how much a regex can impact search time.
2423	///
2424	/// For more information about regex size limits, see the section on
2425	/// [untrusted inputs](crate#untrusted-input) in the top-level crate
2426	/// documentation.
2427	///
2428	/// The default for this is some reasonable number that permits most
2429	/// patterns to compile successfully.
2430	///
2431	/// # Example
2432	///
2433	/// ```
2434	/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
2435	/// use regex::bytes::RegexSetBuilder;
2436	///
2437	/// // It may surprise you how big some seemingly small patterns can
2438	/// // be! Since \w is Unicode aware, this generates a regex that can
2439	/// // match approximately 140,000 distinct codepoints.
2440	/// assert!(
2441	///     RegexSetBuilder::new([r"\w"])
2442	///         .size_limit(45_000)
2443	///         .build()
2444	///         .is_err()
2445	/// );
2446	/// ```
2447	pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
2448	self.builder.size_limit(bytes);
2449	self
2450	}
2451
2452	/// Set the approximate capacity, in bytes, of the cache of transitions
2453	/// used by the lazy DFA.
2454	///
2455	/// While the lazy DFA isn't always used, it tends to be the most
2456	/// commonly used regex engine in default configurations. It tends to
2457	/// adopt the performance profile of a fully built DFA, but without the
2458	/// downside of taking worst case exponential time to build.
2459	///
2460	/// The downside is that it needs to keep a cache of transitions and
2461	/// states that are built while running a search, and this cache
2462	/// can fill up. When it fills up, the cache will reset itself. Any
2463	/// previously generated states and transitions will then need to be
2464	/// re-generated. If this happens too many times, then this library
2465	/// will bail out of using the lazy DFA and switch to a different regex
2466	/// engine.
2467	///
2468	/// If your regex provokes this particular downside of the lazy DFA,
2469	/// then it may be beneficial to increase its cache capacity. This will
2470	/// potentially reduce the frequency of cache resetting (ideally to
2471	/// `0`). While it won't fix all potential performance problems with
2472	/// the lazy DFA, increasing the cache capacity does fix some.
2473	///
2474	/// There is no easy way to determine, a priori, whether increasing
2475	/// this cache capacity will help. In general, the larger your regex,
2476	/// the more cache it's likely to use. But that isn't an ironclad rule.
2477	/// For example, a regex like `[01]*1[01]{N}` would normally produce a
2478	/// fully built DFA that is exponential in size with respect to `N`.
2479	/// The lazy DFA will prevent exponential space blow-up, but its cache
2480	/// is likely to fill up, even when it's large and even for smallish
2481	/// values of `N`.
2482	///
2483	/// If you aren't sure whether this helps or not, it is sensible to
2484	/// set this to some arbitrarily large number in testing, such as
2485	/// `usize::MAX`. Namely, this represents the amount of capacity that
2486	/// *may* be used. It's probably not a good idea to use `usize::MAX` in
2487	/// production though, since it implies there are no controls on heap
2488	/// memory used by this library during a search. In effect, set it to
2489	/// whatever you're willing to allocate for a single regex search.
2490	pub fn dfa_size_limit(
2491	&mut self,
2492	bytes: usize,
2493	) -> &mut RegexSetBuilder {
2494	self.builder.dfa_size_limit(bytes);
2495	self
2496	}
2497
2498	/// Set the nesting limit for this parser.
2499	///
2500	/// The nesting limit controls how deep the abstract syntax tree is
2501	/// allowed to be. If the AST exceeds the given limit (e.g., with too
2502	/// many nested groups), then an error is returned by the parser.
2503	///
2504	/// The purpose of this limit is to act as a heuristic to prevent stack
2505	/// overflow for consumers that do structural induction on an AST using
2506	/// explicit recursion. While this crate never does this (instead using
2507	/// constant stack space and moving the call stack to the heap), other
2508	/// crates may.
2509	///
2510	/// This limit is not checked until the entire AST is parsed.
2511	/// Therefore, if callers want to put a limit on the amount of heap
2512	/// space used, then they should impose a limit on the length, in
2513	/// bytes, of the concrete pattern string. In particular, this is
2514	/// viable since this parser implementation will limit itself to heap
2515	/// space proportional to the length of the pattern string. See also
2516	/// the [untrusted inputs](crate#untrusted-input) section in the
2517	/// top-level crate documentation for more information about this.
2518	///
2519	/// Note that a nest limit of `0` will return a nest limit error for
2520	/// most patterns but not all. For example, a nest limit of `0` permits
2521	/// `a` but not `ab`, since `ab` requires an explicit concatenation,
2522	/// which results in a nest depth of `1`. In general, a nest limit is
2523	/// not something that manifests in an obvious way in the concrete
2524	/// syntax, therefore, it should not be used in a granular way.
2525	///
2526	/// # Example
2527	///
2528	/// ```
2529	/// use regex::bytes::RegexSetBuilder;
2530	///
2531	/// assert!(RegexSetBuilder::new([r"a"]).nest_limit(`0`).build().is_ok());
2532	/// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(`0`).build().is_err());
2533	/// ```
2534	pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
2535	self.builder.nest_limit(limit);
2536	self
2537	}
2538	}
2539	}
2540

Provided by KDAB

Definitions