regex.rs - Codebrowser

1	#[cfg(feature = "std")]
2	use dense::{self, DenseDFA};
3	use dfa::DFA;
4	#[cfg(feature = "std")]
5	use error::Result;
6	#[cfg(feature = "std")]
7	use sparse::SparseDFA;
8	#[cfg(feature = "std")]
9	use state_id::StateID;
10
11	/// A regular expression that uses deterministic finite automata for fast
12	/// searching.
13	///
14	/// A regular expression is comprised of two DFAs, a "forward" DFA and a
15	/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
16	/// match while the reverse DFA is responsible for detecting the start of a
17	/// match. Thus, in order to find the bounds of any given match, a forward
18	/// search must first be run followed by a reverse search. A match found by
19	/// the forward DFA guarantees that the reverse DFA will also find a match.
20	///
21	/// The type of the DFA used by a `Regex` corresponds to the `D` type
22	/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
23	/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
24	/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
25	/// search faster, while sparse DFAs use less memory but search more slowly.
26	///
27	/// By default, a regex's DFA type parameter is set to
28	/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
29	/// most convenient type that gives the best search performance.
30	///
31	/// # Sparse DFAs
32	///
33	/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
34	/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
35	/// enough to build corresponding sparse DFAs, and then build a regex from
36	/// them:
37	///
38	/// ```
39	/// use regex_automata::Regex;
40	///
41	/// # fn example() -> Result<(), regex_automata::Error> {
42	/// // First, build a regex that uses dense DFAs.
43	/// let dense_re = Regex::new("foo[0-9]+")?;
44	///
45	/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
46	/// let fwd = dense_re.forward().to_sparse()?;
47	/// let rev = dense_re.reverse().to_sparse()?;
48	///
49	/// // Third, build a new regex from the constituent sparse DFAs.
50	/// let sparse_re = Regex::from_dfas(fwd, rev);
51	///
52	/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
53	/// assert_eq!(`true`, sparse_re.is_match(b"foo123"));
54	/// # Ok(()) }; example().unwrap()
55	/// ```
56	#[cfg(feature = "std")]
57	#[derive(Clone, Debug)]
58	pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
59	forward: D,
60	reverse: D,
61	}
62
63	/// A regular expression that uses deterministic finite automata for fast
64	/// searching.
65	///
66	/// A regular expression is comprised of two DFAs, a "forward" DFA and a
67	/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
68	/// match while the reverse DFA is responsible for detecting the start of a
69	/// match. Thus, in order to find the bounds of any given match, a forward
70	/// search must first be run followed by a reverse search. A match found by
71	/// the forward DFA guarantees that the reverse DFA will also find a match.
72	///
73	/// The type of the DFA used by a `Regex` corresponds to the `D` type
74	/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
75	/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
76	/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
77	/// search faster, while sparse DFAs use less memory but search more slowly.
78	///
79	/// When using this crate without the standard library, the `Regex` type has
80	/// no default type parameter.
81	///
82	/// # Sparse DFAs
83	///
84	/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
85	/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
86	/// enough to build corresponding sparse DFAs, and then build a regex from
87	/// them:
88	///
89	/// ```
90	/// use regex_automata::Regex;
91	///
92	/// # fn example() -> Result<(), regex_automata::Error> {
93	/// // First, build a regex that uses dense DFAs.
94	/// let dense_re = Regex::new("foo[0-9]+")?;
95	///
96	/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
97	/// let fwd = dense_re.forward().to_sparse()?;
98	/// let rev = dense_re.reverse().to_sparse()?;
99	///
100	/// // Third, build a new regex from the constituent sparse DFAs.
101	/// let sparse_re = Regex::from_dfas(fwd, rev);
102	///
103	/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
104	/// assert_eq!(true, sparse_re.is_match(b"foo123"));
105	/// # Ok(()) }; example().unwrap()
106	/// ```
107	#[cfg(not(feature = "std"))]
108	#[derive(Clone, Debug)]
109	pub struct Regex<D> {
110	forward: D,
111	reverse: D,
112	}
113
114	#[cfg(feature = "std")]
115	impl Regex {
116	/// Parse the given regular expression using a default configuration and
117	/// return the corresponding regex.
118	///
119	/// The default configuration uses `usize` for state IDs, premultiplies
120	/// them and reduces the alphabet size by splitting bytes into equivalence
121	/// classes. The underlying DFAs are not* minimized.*
122	///
123	/// If you want a non-default configuration, then use the
124	/// [`RegexBuilder`](struct.RegexBuilder.html)
125	/// to set your own configuration.
126	///
127	/// # Example
128	///
129	/// ```
130	/// use regex_automata::Regex;
131	///
132	/// # fn example() -> Result<(), regex_automata::Error> {
133	/// let re = Regex::new("foo[0-9]+bar")?;
134	/// assert_eq!(Some((`3`, `14`)), re.find(b"zzzfoo12345barzzz"));
135	/// # Ok(()) }; example().unwrap()
136	/// ```
137	pub fn new(pattern: &str) -> Result<Regex> {
138	RegexBuilder::new().build(pattern)
139	}
140	}
141
142	#[cfg(feature = "std")]
143	impl Regex<SparseDFA<Vec<u8>, usize>> {
144	/// Parse the given regular expression using a default configuration and
145	/// return the corresponding regex using sparse DFAs.
146	///
147	/// The default configuration uses `usize` for state IDs, reduces the
148	/// alphabet size by splitting bytes into equivalence classes. The
149	/// underlying DFAs are not* minimized.*
150	///
151	/// If you want a non-default configuration, then use the
152	/// [`RegexBuilder`](struct.RegexBuilder.html)
153	/// to set your own configuration.
154	///
155	/// # Example
156	///
157	/// ```
158	/// use regex_automata::Regex;
159	///
160	/// # fn example() -> Result<(), regex_automata::Error> {
161	/// let re = Regex::new_sparse("foo[0-9]+bar")?;
162	/// assert_eq!(Some((`3`, `14`)), re.find(b"zzzfoo12345barzzz"));
163	/// # Ok(()) }; example().unwrap()
164	/// ```
165	pub fn new_sparse(
166	pattern: &str,
167	) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
168	RegexBuilder::new().build_sparse(pattern)
169	}
170	}
171
172	impl<D: DFA> Regex<D> {
173	/// Returns true if and only if the given bytes match.
174	///
175	/// This routine may short circuit if it knows that scanning future input
176	/// will never lead to a different result. In particular, if the underlying
177	/// DFA enters a match state or a dead state, then this routine will return
178	/// `true` or `false`, respectively, without inspecting any future input.
179	///
180	/// # Example
181	///
182	/// ```
183	/// use regex_automata::Regex;
184	///
185	/// # fn example() -> Result<(), regex_automata::Error> {
186	/// let re = Regex::new("foo[0-9]+bar")?;
187	/// assert_eq!(`true`, re.is_match(b"foo12345bar"));
188	/// assert_eq!(`false`, re.is_match(b"foobar"));
189	/// # Ok(()) }; example().unwrap()
190	/// ```
191	pub fn is_match(&self, input: &[u8]) -> bool {
192	self.is_match_at(input, `0`)
193	}
194
195	/// Returns the first position at which a match is found.
196	///
197	/// This routine stops scanning input in precisely the same circumstances
198	/// as `is_match`. The key difference is that this routine returns the
199	/// position at which it stopped scanning input if and only if a match
200	/// was found. If no match is found, then `None` is returned.
201	///
202	/// # Example
203	///
204	/// ```
205	/// use regex_automata::Regex;
206	///
207	/// # fn example() -> Result<(), regex_automata::Error> {
208	/// let re = Regex::new("foo[0-9]+")?;
209	/// assert_eq!(Some(`4`), re.shortest_match(b"foo12345"));
210	///
211	/// // Normally, the end of the leftmost first match here would be 3,
212	/// // but the shortest match semantics detect a match earlier.
213	/// let re = Regex::new("abc\|a")?;
214	/// assert_eq!(Some(`1`), re.shortest_match(b"abc"));
215	/// # Ok(()) }; example().unwrap()
216	/// ```
217	pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
218	self.shortest_match_at(input, `0`)
219	}
220
221	/// Returns the start and end offset of the leftmost first match. If no
222	/// match exists, then `None` is returned.
223	///
224	/// The "leftmost first" match corresponds to the match with the smallest
225	/// starting offset, but where the end offset is determined by preferring
226	/// earlier branches in the original regular expression. For example,
227	/// `Sam\|Samwise` will match `Sam` in `Samwise`, but `Samwise\|Sam` will
228	/// match `Samwise` in `Samwise`.
229	///
230	/// Generally speaking, the "leftmost first" match is how most backtracking
231	/// regular expressions tend to work. This is in contrast to POSIX-style
232	/// regular expressions that yield "leftmost longest" matches. Namely,
233	/// both `Sam\|Samwise` and `Samwise\|Sam` match `Samwise` when using
234	/// leftmost longest semantics.
235	///
236	/// # Example
237	///
238	/// ```
239	/// use regex_automata::Regex;
240	///
241	/// # fn example() -> Result<(), regex_automata::Error> {
242	/// let re = Regex::new("foo[0-9]+")?;
243	/// assert_eq!(Some((`3`, `11`)), re.find(b"zzzfoo12345zzz"));
244	///
245	/// // Even though a match is found after reading the first byte (`a`),
246	/// // the leftmost first match semantics demand that we find the earliest
247	/// // match that prefers earlier parts of the pattern over latter parts.
248	/// let re = Regex::new("abc\|a")?;
249	/// assert_eq!(Some((`0`, `3`)), re.find(b"abc"));
250	/// # Ok(()) }; example().unwrap()
251	/// ```
252	pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
253	self.find_at(input, `0`)
254	}
255
256	/// Returns the same as `is_match`, but starts the search at the given
257	/// offset.
258	///
259	/// The significance of the starting point is that it takes the surrounding
260	/// context into consideration. For example, if the DFA is anchored, then
261	/// a match can only occur when `start == 0`.
262	pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
263	self.forward().is_match_at(input, start)
264	}
265
266	/// Returns the same as `shortest_match`, but starts the search at the
267	/// given offset.
268	///
269	/// The significance of the starting point is that it takes the surrounding
270	/// context into consideration. For example, if the DFA is anchored, then
271	/// a match can only occur when `start == 0`.
272	pub fn shortest_match_at(
273	&self,
274	input: &[u8],
275	start: usize,
276	) -> Option<usize> {
277	self.forward().shortest_match_at(input, start)
278	}
279
280	/// Returns the same as `find`, but starts the search at the given
281	/// offset.
282	///
283	/// The significance of the starting point is that it takes the surrounding
284	/// context into consideration. For example, if the DFA is anchored, then
285	/// a match can only occur when `start == 0`.
286	pub fn find_at(
287	&self,
288	input: &[u8],
289	start: usize,
290	) -> Option<(usize, usize)> {
291	let end = match self.forward().find_at(input, start) {
292	None => return None,
293	Some(end) => end,
294	};
295	let start = self
296	.reverse()
297	.rfind(&input[start..end])
298	.map(\|i\| start + i)
299	.expect("reverse search must match if forward search does");
300	Some((start, end))
301	}
302
303	/// Returns an iterator over all non-overlapping leftmost first matches
304	/// in the given bytes. If no match exists, then the iterator yields no
305	/// elements.
306	///
307	/// Note that if the regex can match the empty string, then it is
308	/// possible for the iterator to yield a zero-width match at a location
309	/// that is not a valid UTF-8 boundary (for example, between the code units
310	/// of a UTF-8 encoded codepoint). This can happen regardless of whether
311	/// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
312	/// was enabled or not.
313	///
314	/// # Example
315	///
316	/// ```
317	/// use regex_automata::Regex;
318	///
319	/// # fn example() -> Result<(), regex_automata::Error> {
320	/// let re = Regex::new("foo[0-9]+")?;
321	/// let text = b"foo1 foo12 foo123";
322	/// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
323	/// assert_eq!(matches, vec![(`0`, `4`), (`5`, `10`), (`11`, `17`)]);
324	/// # Ok(()) }; example().unwrap()
325	/// ```
326	pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
327	Matches::new(self, input)
328	}
329
330	/// Build a new regex from its constituent forward and reverse DFAs.
331	///
332	/// This is useful when deserializing a regex from some arbitrary
333	/// memory region. This is also useful for building regexes from other
334	/// types of DFAs.
335	///
336	/// # Example
337	///
338	/// This example is a bit a contrived. The usual use of these methods
339	/// would involve serializing `initial_re` somewhere and then deserializing
340	/// it later to build a regex.
341	///
342	/// ```
343	/// use regex_automata::Regex;
344	///
345	/// # fn example() -> Result<(), regex_automata::Error> {
346	/// let initial_re = Regex::new("foo[0-9]+")?;
347	/// assert_eq!(`true`, initial_re.is_match(b"foo123"));
348	///
349	/// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
350	/// let re = Regex::from_dfas(fwd, rev);
351	/// assert_eq!(`true`, re.is_match(b"foo123"));
352	/// # Ok(()) }; example().unwrap()
353	/// ```
354	///
355	/// This example shows how you might build smaller DFAs, and then use those
356	/// smaller DFAs to build a new regex.
357	///
358	/// ```
359	/// use regex_automata::Regex;
360	///
361	/// # fn example() -> Result<(), regex_automata::Error> {
362	/// let initial_re = Regex::new("foo[0-9]+")?;
363	/// assert_eq!(`true`, initial_re.is_match(b"foo123"));
364	///
365	/// let fwd = initial_re.forward().to_u16()?;
366	/// let rev = initial_re.reverse().to_u16()?;
367	/// let re = Regex::from_dfas(fwd, rev);
368	/// assert_eq!(`true`, re.is_match(b"foo123"));
369	/// # Ok(()) }; example().unwrap()
370	/// ```
371	///
372	/// This example shows how to build a `Regex` that uses sparse DFAs instead
373	/// of dense DFAs:
374	///
375	/// ```
376	/// use regex_automata::Regex;
377	///
378	/// # fn example() -> Result<(), regex_automata::Error> {
379	/// let initial_re = Regex::new("foo[0-9]+")?;
380	/// assert_eq!(`true`, initial_re.is_match(b"foo123"));
381	///
382	/// let fwd = initial_re.forward().to_sparse()?;
383	/// let rev = initial_re.reverse().to_sparse()?;
384	/// let re = Regex::from_dfas(fwd, rev);
385	/// assert_eq!(`true`, re.is_match(b"foo123"));
386	/// # Ok(()) }; example().unwrap()
387	/// ```
388	pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
389	Regex { forward, reverse }
390	}
391
392	/// Return the underlying DFA responsible for forward matching.
393	pub fn forward(&self) -> &D {
394	&self.forward
395	}
396
397	/// Return the underlying DFA responsible for reverse matching.
398	pub fn reverse(&self) -> &D {
399	&self.reverse
400	}
401	}
402
403	/// An iterator over all non-overlapping matches for a particular search.
404	///
405	/// The iterator yields a `(usize, usize)` value until no more matches could be
406	/// found. The first `usize` is the start of the match (inclusive) while the
407	/// second `usize` is the end of the match (exclusive).
408	///
409	/// `S` is the type used to represent state identifiers in the underlying
410	/// regex. The lifetime variables are as follows:
411	///
412	/// `'r` is the lifetime of the regular expression value itself.*
413	/// `'t` is the lifetime of the text being searched.*
414	#[derive(Clone, Debug)]
415	pub struct Matches<'r, 't, D: DFA + 'r> {
416	re: &'r Regex<D>,
417	text: &'t [u8],
418	last_end: usize,
419	last_match: Option<usize>,
420	}
421
422	impl<'r, 't, D: DFA> Matches<'r, 't, D> {
423	fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
424	Matches { re, text, last_end: `0`, last_match: None }
425	}
426	}
427
428	impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
429	type Item = (usize, usize);
430
431	fn next(&mut self) -> Option<(usize, usize)> {
432	if self.last_end > self.text.len() {
433	return None;
434	}
435	let (s, e) = match self.re.find_at(self.text, self.last_end) {
436	None => return None,
437	Some((s, e)) => (s, e),
438	};
439	if s == e {
440	// This is an empty match. To ensure we make progress, start
441	// the next search at the smallest possible starting position
442	// of the next match following this one.
443	self.last_end = e + `1`;
444	// Don't accept empty matches immediately following a match.
445	// Just move on to the next match.
446	if Some(e) == self.last_match {
447	return self.next();
448	}
449	} else {
450	self.last_end = e;
451	}
452	self.last_match = Some(e);
453	Some((s, e))
454	}
455	}
456
457	/// A builder for a regex based on deterministic finite automatons.
458	///
459	/// This builder permits configuring several aspects of the construction
460	/// process such as case insensitivity, Unicode support and various options
461	/// that impact the size of the underlying DFAs. In some cases, options (like
462	/// performing DFA minimization) can come with a substantial additional cost.
463	///
464	/// This builder generally constructs two DFAs, where one is responsible for
465	/// finding the end of a match and the other is responsible for finding the
466	/// start of a match. If you only need to detect whether something matched,
467	/// or only the end of a match, then you should use a
468	/// [`dense::Builder`](dense/struct.Builder.html)
469	/// to construct a single DFA, which is cheaper than building two DFAs.
470	#[cfg(feature = "std")]
471	#[derive(Clone, Debug)]
472	pub struct RegexBuilder {
473	dfa: dense::Builder,
474	}
475
476	#[cfg(feature = "std")]
477	impl RegexBuilder {
478	/// Create a new regex builder with the default configuration.
479	pub fn new() -> RegexBuilder {
480	RegexBuilder { dfa: dense::Builder::new() }
481	}
482
483	/// Build a regex from the given pattern.
484	///
485	/// If there was a problem parsing or compiling the pattern, then an error
486	/// is returned.
487	pub fn build(&self, pattern: &str) -> Result<Regex> {
488	self.build_with_size::<usize>(pattern)
489	}
490
491	/// Build a regex from the given pattern using sparse DFAs.
492	///
493	/// If there was a problem parsing or compiling the pattern, then an error
494	/// is returned.
495	pub fn build_sparse(
496	&self,
497	pattern: &str,
498	) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
499	self.build_with_size_sparse::<usize>(pattern)
500	}
501
502	/// Build a regex from the given pattern using a specific representation
503	/// for the underlying DFA state IDs.
504	///
505	/// If there was a problem parsing or compiling the pattern, then an error
506	/// is returned.
507	///
508	/// The representation of state IDs is determined by the `S` type
509	/// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
510	/// or `usize`, where `usize` is the default used for `build`. The purpose
511	/// of specifying a representation for state IDs is to reduce the memory
512	/// footprint of the underlying DFAs.
513	///
514	/// When using this routine, the chosen state ID representation will be
515	/// used throughout determinization and minimization, if minimization was
516	/// requested. Even if the minimized DFAs can fit into the chosen state ID
517	/// representation but the initial determinized DFA cannot, then this will
518	/// still return an error. To get a minimized DFA with a smaller state ID
519	/// representation, first build it with a bigger state ID representation,
520	/// and then shrink the sizes of the DFAs using one of its conversion
521	/// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
522	/// Finally, reconstitute the regex via
523	/// [`Regex::from_dfa`](struct.Regex.html#method.from_dfa).
524	pub fn build_with_size<S: StateID>(
525	&self,
526	pattern: &str,
527	) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
528	let forward = self.dfa.build_with_size(pattern)?;
529	let reverse = self
530	.dfa
531	.clone()
532	.anchored(`true`)
533	.reverse(`true`)
534	.longest_match(`true`)
535	.build_with_size(pattern)?;
536	Ok(Regex::from_dfas(forward, reverse))
537	}
538
539	/// Build a regex from the given pattern using a specific representation
540	/// for the underlying DFA state IDs using sparse DFAs.
541	pub fn build_with_size_sparse<S: StateID>(
542	&self,
543	pattern: &str,
544	) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
545	let re = self.build_with_size(pattern)?;
546	let fwd = re.forward().to_sparse()?;
547	let rev = re.reverse().to_sparse()?;
548	Ok(Regex::from_dfas(fwd, rev))
549	}
550
551	/// Set whether matching must be anchored at the beginning of the input.
552	///
553	/// When enabled, a match must begin at the start of the input. When
554	/// disabled, the regex will act as if the pattern started with a `.?`,*
555	/// which enables a match to appear anywhere.
556	///
557	/// By default this is disabled.
558	pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
559	self.dfa.anchored(yes);
560	self
561	}
562
563	/// Enable or disable the case insensitive flag by default.
564	///
565	/// By default this is disabled. It may alternatively be selectively
566	/// enabled in the regular expression itself via the `i` flag.
567	pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
568	self.dfa.case_insensitive(yes);
569	self
570	}
571
572	/// Enable verbose mode in the regular expression.
573	///
574	/// When enabled, verbose mode permits insigificant whitespace in many
575	/// places in the regular expression, as well as comments. Comments are
576	/// started using `#` and continue until the end of the line.
577	///
578	/// By default, this is disabled. It may be selectively enabled in the
579	/// regular expression by using the `x` flag regardless of this setting.
580	pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
581	self.dfa.ignore_whitespace(yes);
582	self
583	}
584
585	/// Enable or disable the "dot matches any character" flag by default.
586	///
587	/// By default this is disabled. It may alternatively be selectively
588	/// enabled in the regular expression itself via the `s` flag.
589	pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
590	self.dfa.dot_matches_new_line(yes);
591	self
592	}
593
594	/// Enable or disable the "swap greed" flag by default.
595	///
596	/// By default this is disabled. It may alternatively be selectively
597	/// enabled in the regular expression itself via the `U` flag.
598	pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
599	self.dfa.swap_greed(yes);
600	self
601	}
602
603	/// Enable or disable the Unicode flag (`u`) by default.
604	///
605	/// By default this is enabled. It may alternatively be selectively
606	/// disabled in the regular expression itself via the `u` flag.
607	///
608	/// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
609	/// default), a regular expression will fail to parse if Unicode mode is
610	/// disabled and a sub-expression could possibly match invalid UTF-8.
611	pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
612	self.dfa.unicode(yes);
613	self
614	}
615
616	/// When enabled, the builder will permit the construction of a regular
617	/// expression that may match invalid UTF-8.
618	///
619	/// When disabled (the default), the builder is guaranteed to produce a
620	/// regex that will only ever match valid UTF-8 (otherwise, the builder
621	/// will return an error).
622	pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
623	self.dfa.allow_invalid_utf8(yes);
624	self
625	}
626
627	/// Set the nesting limit used for the regular expression parser.
628	///
629	/// The nesting limit controls how deep the abstract syntax tree is allowed
630	/// to be. If the AST exceeds the given limit (e.g., with too many nested
631	/// groups), then an error is returned by the parser.
632	///
633	/// The purpose of this limit is to act as a heuristic to prevent stack
634	/// overflow when building a finite automaton from a regular expression's
635	/// abstract syntax tree. In particular, construction currently uses
636	/// recursion. In the future, the implementation may stop using recursion
637	/// and this option will no longer be necessary.
638	///
639	/// This limit is not checked until the entire AST is parsed. Therefore,
640	/// if callers want to put a limit on the amount of heap space used, then
641	/// they should impose a limit on the length, in bytes, of the concrete
642	/// pattern string. In particular, this is viable since the parser will
643	/// limit itself to heap space proportional to the lenth of the pattern
644	/// string.
645	///
646	/// Note that a nest limit of `0` will return a nest limit error for most
647	/// patterns but not all. For example, a nest limit of `0` permits `a` but
648	/// not `ab`, since `ab` requires a concatenation AST item, which results
649	/// in a nest depth of `1`. In general, a nest limit is not something that
650	/// manifests in an obvious way in the concrete syntax, therefore, it
651	/// should not be used in a granular way.
652	pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
653	self.dfa.nest_limit(limit);
654	self
655	}
656
657	/// Minimize the underlying DFAs.
658	///
659	/// When enabled, the DFAs powering the resulting regex will be minimized
660	/// such that it is as small as possible.
661	///
662	/// Whether one enables minimization or not depends on the types of costs
663	/// you're willing to pay and how much you care about its benefits. In
664	/// particular, minimization has worst case `O(nklogn)` time and `O(kn)`*
665	/// space, where `n` is the number of DFA states and `k` is the alphabet
666	/// size. In practice, minimization can be quite costly in terms of both
667	/// space and time, so it should only be done if you're willing to wait
668	/// longer to produce a DFA. In general, you might want a minimal DFA in
669	/// the following circumstances:
670	///
671	/// 1. You would like to optimize for the size of the automaton. This can
672	/// manifest in one of two ways. Firstly, if you're converting the
673	/// DFA into Rust code (or a table embedded in the code), then a minimal
674	/// DFA will translate into a corresponding reduction in code size, and
675	/// thus, also the final compiled binary size. Secondly, if you are
676	/// building many DFAs and putting them on the heap, you'll be able to
677	/// fit more if they are smaller. Note though that building a minimal
678	/// DFA itself requires additional space; you only realize the space
679	/// savings once the minimal DFA is constructed (at which point, the
680	/// space used for minimization is freed).
681	/// 2. You've observed that a smaller DFA results in faster match
682	/// performance. Naively, this isn't guaranteed since there is no
683	/// inherent difference between matching with a bigger-than-minimal
684	/// DFA and a minimal DFA. However, a smaller DFA may make use of your
685	/// CPU's cache more efficiently.
686	/// 3. You are trying to establish an equivalence between regular
687	/// languages. The standard method for this is to build a minimal DFA
688	/// for each language and then compare them. If the DFAs are equivalent
689	/// (up to state renaming), then the languages are equivalent.
690	///
691	/// This option is disabled by default.
692	pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
693	self.dfa.minimize(yes);
694	self
695	}
696
697	/// Premultiply state identifiers in the underlying DFA transition tables.
698	///
699	/// When enabled, state identifiers are premultiplied to point to their
700	/// corresponding row in the DFA's transition table. That is, given the
701	/// `i`th state, its corresponding premultiplied identifier is `i k`*
702	/// where `k` is the alphabet size of the DFA. (The alphabet size is at
703	/// most 256, but is in practice smaller if byte classes is enabled.)
704	///
705	/// When state identifiers are not premultiplied, then the identifier of
706	/// the `i`th state is `i`.
707	///
708	/// The advantage of premultiplying state identifiers is that is saves
709	/// a multiplication instruction per byte when searching with the DFA.
710	/// This has been observed to lead to a 20% performance benefit in
711	/// micro-benchmarks.
712	///
713	/// The primary disadvantage of premultiplying state identifiers is
714	/// that they require a larger integer size to represent. For example,
715	/// if your DFA has 200 states, then its premultiplied form requires
716	/// 16 bits to represent every possible state identifier, where as its
717	/// non-premultiplied form only requires 8 bits.
718	///
719	/// This option is enabled by default.
720	pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
721	self.dfa.premultiply(yes);
722	self
723	}
724
725	/// Shrink the size of the underlying DFA alphabet by mapping bytes to
726	/// their equivalence classes.
727	///
728	/// When enabled, each DFA will use a map from all possible bytes to their
729	/// corresponding equivalence class. Each equivalence class represents a
730	/// set of bytes that does not discriminate between a match and a non-match
731	/// in the DFA. For example, the pattern `[ab]+` has at least two
732	/// equivalence classes: a set containing `a` and `b` and a set containing
733	/// every byte except for `a` and `b`. `a` and `b` are in the same
734	/// equivalence classes because they never discriminate between a match
735	/// and a non-match.
736	///
737	/// The advantage of this map is that the size of the transition table can
738	/// be reduced drastically from `#states 256 * sizeof(id)` to*
739	/// `#states k * sizeof(id)` where `k` is the number of equivalence*
740	/// classes. As a result, total space usage can decrease substantially.
741	/// Moreover, since a smaller alphabet is used, compilation becomes faster
742	/// as well.
743	///
744	/// The disadvantage of this map is that every byte searched must be
745	/// passed through this map before it can be used to determine the next
746	/// transition. This has a small match time performance cost.
747	///
748	/// This option is enabled by default.
749	pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
750	self.dfa.byte_classes(yes);
751	self
752	}
753
754	/// Apply best effort heuristics to shrink the NFA at the expense of more
755	/// time/memory.
756	///
757	/// This may be exposed in the future, but for now is exported for use in
758	/// the `regex-automata-debug` tool.
759	#[doc(hidden)]
760	pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
761	self.dfa.shrink(yes);
762	self
763	}
764	}
765
766	#[cfg(feature = "std")]
767	impl Default for RegexBuilder {
768	fn default() -> RegexBuilder {
769	RegexBuilder::new()
770	}
771	}
772

Provided by KDAB

Definitions