// re_builder.rs source code [crates/regex-1.8.4/src/re_builder.rs]

/// The set of user configurable options for compiling zero or more regexes.
///
/// This is a plain collection of settings. The builders defined in this file
/// each wrap one `RegexOptions` value, update its fields through their setter
/// methods, and pass a clone of it to `ExecBuilder::new_options` when `build`
/// is called.
#[derive(Clone, Debug)]
#[allow(missing_docs)]
pub struct RegexOptions {
    /// The pattern strings to compile. A single-regex builder stores exactly
    /// one pattern here; a set builder stores one entry per pattern given.
    pub pats: Vec<String>,
    /// Approximate size limit, in bytes, of a single compiled program.
    pub size_limit: usize,
    /// Approximate size, in bytes, of the cache used by the DFA.
    pub dfa_size_limit: usize,
    /// Maximum nesting depth permitted for the parsed abstract syntax tree.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal syntax is supported.
    pub octal: bool,
}
17
18	impl Default for RegexOptions {
19	fn default() -> Self {
20	RegexOptions {
21	pats: vec![],
22	size_limit: `10` * (`1` << `20`),
23	dfa_size_limit: `2` * (`1` << `20`),
24	nest_limit: `250`,
25	case_insensitive: `false`,
26	multi_line: `false`,
27	dot_matches_new_line: `false`,
28	swap_greed: `false`,
29	ignore_whitespace: `false`,
30	unicode: `true`,
31	octal: `false`,
32	}
33	}
34	}
35
// Generates a module named `$name` containing a `RegexBuilder` whose `build`
// method produces `crate::$regex_mod::Regex`. `$only_utf8` is forwarded
// unchanged to `ExecBuilder::only_utf8` when building.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::$regex_mod::Regex;

            /// A configurable builder for a regular expression.
            ///
            /// A builder can be used to configure how the regex is built, for
            /// example, by setting the default flags (which can be overridden
            /// in the expression itself) or setting various limits.
            #[derive(Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Create a new regular expression builder with the given
                /// pattern.
                ///
                /// The pattern is only stored here; if it is invalid, then an
                /// error will be returned when `build` is called.
                pub fn new(pattern: &str) -> RegexBuilder {
                    let mut builder = RegexBuilder(RegexOptions::default());
                    builder.0.pats.push(pattern.to_owned());
                    builder
                }

                /// Compile the regular expression using the current
                /// configuration.
                ///
                /// The builder is only borrowed: its options are cloned for
                /// the compilation, so the builder remains usable afterwards.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will
                /// produce the pattern given to `new` verbatim. Notably, it
                /// will not incorporate any of the flags set on this builder.
                ///
                /// # Errors
                ///
                /// Returns the error produced by the underlying `ExecBuilder`
                /// if compilation fails.
                pub fn build(&self) -> Result<Regex, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(Regex::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern will match both upper
                /// case and lower case variants.
                pub fn case_insensitive(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, wherein `.`
                /// matches anything when `s` is set and matches anything
                /// except for new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is
                /// disabled and means "any valid UTF-8 encoding of any Unicode
                /// scalar value" when Unicode is enabled.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// shortest match) and `a*?` is greedy (tries to find longest
                /// match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will
                /// be ignored between expressions of the pattern, and `#` can
                /// be used to start a comment until the next new line.
                pub fn ignore_whitespace(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by
                /// a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser implementation
                /// will limit itself to heap space proportional to the length
                /// of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in an
                /// obvious way in the concrete syntax, therefore, it should
                /// not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
230
231	define_builder!(bytes, re_bytes, `false`);
232	define_builder!(unicode, re_unicode, `true`);
233
// Generates a module named `$name` containing a `RegexSetBuilder` whose
// `build` method produces `crate::re_set::$regex_mod::RegexSet`. `$only_utf8`
// is forwarded unchanged to `ExecBuilder::only_utf8` when building.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built,
            /// for example, by setting the default flags (which can be
            /// overridden in the expression itself) or setting various
            /// limits.
            #[derive(Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given
                /// patterns.
                ///
                /// The patterns are only stored here, in iteration order; if
                /// any of them is invalid, then an error will be returned when
                /// `build` is called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                where
                    S: AsRef<str>,
                    I: IntoIterator<Item = S>,
                {
                    let mut builder = RegexSetBuilder(RegexOptions::default());
                    builder.0.pats.extend(
                        patterns.into_iter().map(|pat| pat.as_ref().to_owned()),
                    );
                    builder
                }

                /// Compile the regular expressions into a set using the
                /// current configuration.
                ///
                /// The builder is only borrowed: its options are cloned for
                /// the compilation, so the builder remains usable afterwards.
                ///
                /// # Errors
                ///
                /// Returns the error produced by the underlying `ExecBuilder`
                /// if compilation fails.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the patterns will match both upper
                /// case and lower case variants.
                pub fn case_insensitive(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                pub fn multi_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, wherein `.`
                /// matches anything when `s` is set and matches anything
                /// except for new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" for
                /// `regex::bytes::RegexSet` expressions and means "any Unicode
                /// scalar value" for `regex::RegexSet` expressions.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// shortest match) and `a*?` is greedy (tries to find longest
                /// match).
                pub fn swap_greed(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will
                /// be ignored between expressions of the pattern, and `#` can
                /// be used to start a comment until the next new line.
                pub fn ignore_whitespace(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// When disabled, character classes such as `\w` only match
                /// ASCII word characters instead of all Unicode word
                /// characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by
                /// a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser implementation
                /// will limit itself to heap space proportional to the length
                /// of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in an
                /// obvious way in the concrete syntax, therefore, it should
                /// not be used in a granular way.
                pub fn nest_limit(
                    &mut self,
                    limit: u32,
                ) -> &mut RegexSetBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
419
420	define_set_builder!(set_bytes, bytes, `false`);
421	define_set_builder!(set_unicode, unicode, `true`);
422