/// User-configurable settings for compiling zero or more regexes.
///
/// A single value carries the pattern strings, the resource limits and the
/// default syntax flags. The builders in this file wrap one of these and
/// expose a setter per field.
#[derive(Clone, Debug)]
#[allow(missing_docs)]
pub struct RegexOptions {
    /// Pattern strings to compile, in the order they were supplied.
    pub pats: Vec<String>,
    /// Approximate limit, in bytes, on the size of a compiled program.
    pub size_limit: usize,
    /// Approximate limit, in bytes, on the cache used by the DFA.
    pub dfa_size_limit: usize,
    /// Maximum nesting depth the parser accepts for the syntax tree.
    pub nest_limit: u32,
    /// Default for the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Default for the multi-line (`m`) flag.
    pub multi_line: bool,
    /// Default for the "dot matches new line" (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Default for the swap greed (`U`) flag.
    pub swap_greed: bool,
    /// Default for the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Default for the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal escape syntax is accepted.
    pub octal: bool,
}
17
18impl Default for RegexOptions {
19 fn default() -> Self {
20 RegexOptions {
21 pats: vec![],
22 size_limit: 10 * (1 << 20),
23 dfa_size_limit: 2 * (1 << 20),
24 nest_limit: 250,
25 case_insensitive: false,
26 multi_line: false,
27 dot_matches_new_line: false,
28 swap_greed: false,
29 ignore_whitespace: false,
30 unicode: true,
31 octal: false,
32 }
33 }
34}
35
// Generates a public module `$name` containing a `RegexBuilder` that
// produces `crate::$regex_mod::Regex` values. `$only_utf8` is forwarded
// unchanged to `ExecBuilder::only_utf8` when the regex is built.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::$regex_mod::Regex;

            /// Builds a single regular expression with customised settings.
            ///
            /// The builder lets callers change the default flags (a pattern
            /// may still override them inline) and adjust the various
            /// limits before the regex is compiled.
            #[derive(Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Start a builder for `pattern` with default options.
                ///
                /// The pattern is not validated here; an invalid pattern is
                /// reported as an error when `build` is called.
                pub fn new(pattern: &str) -> Self {
                    let mut options = RegexOptions::default();
                    options.pats.push(String::from(pattern));
                    RegexBuilder(options)
                }

                /// Compile the regular expression from the current options.
                ///
                /// The builder is only borrowed: its options are cloned for
                /// the compilation.
                ///
                /// Calling `as_str` on the resulting `Regex` yields the
                /// pattern passed to `new` verbatim. In particular, it does
                /// not reflect any flag set through this builder.
                pub fn build(&self) -> Result<Regex, Error> {
                    let exec = ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()?;
                    Ok(Regex::from(exec))
                }

                /// Set the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern match both their
                /// upper case and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches at the beginning of lines and
                /// `$` matches at the end of lines.
                ///
                /// By default, they match only at the beginning/end of the
                /// input.
                pub fn multi_line(&mut self, yes: bool) -> &mut Self {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the any character (`s`) flag.
                ///
                /// When `s` is set, `.` matches anything. When it is not set
                /// (the default), `.` matches anything except for new line.
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is
                /// disabled and means "any valid UTF-8 encoding of any
                /// Unicode scalar value" when Unicode is enabled.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut Self {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// the shortest match) and `a*?` is greedy (tries to find
                /// the longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut Self {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces is
                /// ignored between expressions of the pattern, and `#`
                /// starts a comment that runs until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut Self {
                    self.0.unicode = yes;
                    self
                }

                /// Enable or disable octal syntax.
                ///
                /// Octal syntax is a little-known way of writing Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last one uses octal syntax.
                ///
                /// Supporting octal syntax is not a problem in itself, but
                /// it makes good error messages harder. In PCRE based regex
                /// engines, syntax like `\0` invokes a backreference, which
                /// is explicitly unsupported in Rust's regex engine, yet
                /// many users expect it to work. Therefore, when octal
                /// support is disabled, the error message explicitly
                /// mentions that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut Self {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regex.
                ///
                /// This roughly corresponds to the number of bytes occupied
                /// by a single compiled program. If the program exceeds
                /// this number, a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit; there is no way
                /// to set a global limit. In particular, if a regex is used
                /// from multiple threads simultaneously, each thread may
                /// use up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for the parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), the parser returns
                /// an error.
                ///
                /// The limit is a heuristic to prevent stack overflow for
                /// consumers that do structural induction on an `Ast` using
                /// explicit recursion. This crate never does so (it uses
                /// constant stack space and moves the call stack to the
                /// heap), but other crates may.
                ///
                /// The limit is not checked until the entire Ast is parsed.
                /// Callers who want to bound the heap space used should
                /// therefore limit the length, in bytes, of the concrete
                /// pattern string. That is viable because the parser limits
                /// itself to heap space proportional to the length of the
                /// pattern string.
                ///
                /// A nest limit of `0` returns a nest limit error for most
                /// patterns but not all. For example, it permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which
                /// results in a nest depth of `1`. In general, a nest limit
                /// does not manifest in an obvious way in the concrete
                /// syntax, so it should not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut Self {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
230
// Generate `bytes::RegexBuilder`, which builds `crate::re_bytes::Regex` and
// passes `false` to `ExecBuilder::only_utf8`.
define_builder!(bytes, re_bytes, false);
// Generate `unicode::RegexBuilder`, which builds `crate::re_unicode::Regex`
// and passes `true` to `ExecBuilder::only_utf8`.
define_builder!(unicode, re_unicode, true);
233
// Generates a public module `$name` containing a `RegexSetBuilder` that
// produces `crate::re_set::$regex_mod::RegexSet` values. `$only_utf8` is
// forwarded unchanged to `ExecBuilder::only_utf8` when the set is built.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::re_set::$regex_mod::RegexSet;

            /// Builds a set of regular expressions with customised settings.
            ///
            /// The builder lets callers change the default flags (a pattern
            /// may still override them inline) and adjust the various
            /// limits before the set is compiled.
            #[derive(Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Start a builder for `patterns` with default options.
                ///
                /// The patterns are stored in iteration order and are not
                /// validated here; an invalid pattern is reported as an
                /// error when `build` is called.
                pub fn new<I, S>(patterns: I) -> Self
                where
                    S: AsRef<str>,
                    I: IntoIterator<Item = S>,
                {
                    let mut options = RegexOptions::default();
                    options.pats.extend(
                        patterns.into_iter().map(|pat| pat.as_ref().to_owned()),
                    );
                    RegexSetBuilder(options)
                }

                /// Compile the regular expressions into a set.
                ///
                /// The builder is only borrowed: its options are cloned for
                /// the compilation.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    let exec = ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()?;
                    Ok(RegexSet::from(exec))
                }

                /// Set the case insensitive (`i`) flag.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the multi-line matching (`m`) flag.
                pub fn multi_line(&mut self, yes: bool) -> &mut Self {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the any character (`s`) flag.
                ///
                /// When `s` is set, `.` matches anything. When it is not set
                /// (the default), `.` matches anything except for new line.
                ///
                /// N.B. "matches anything" means "any byte" for
                /// `regex::bytes::RegexSet` expressions and means "any
                /// Unicode scalar value" for `regex::RegexSet` expressions.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut Self {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the greedy swap (`U`) flag.
                pub fn swap_greed(&mut self, yes: bool) -> &mut Self {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the ignore whitespace (`x`) flag.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the Unicode (`u`) flag.
                pub fn unicode(&mut self, yes: bool) -> &mut Self {
                    self.0.unicode = yes;
                    self
                }

                /// Enable or disable octal syntax.
                ///
                /// Octal syntax is a little-known way of writing Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last one uses octal syntax.
                ///
                /// Supporting octal syntax is not a problem in itself, but
                /// it makes good error messages harder. In PCRE based regex
                /// engines, syntax like `\0` invokes a backreference, which
                /// is explicitly unsupported in Rust's regex engine, yet
                /// many users expect it to work. Therefore, when octal
                /// support is disabled, the error message explicitly
                /// mentions that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut Self {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regex.
                ///
                /// This roughly corresponds to the number of bytes occupied
                /// by a single compiled program. If the program exceeds
                /// this number, a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit; there is no way
                /// to set a global limit. In particular, if a regex is used
                /// from multiple threads simultaneously, each thread may
                /// use up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut Self {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for the parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), the parser returns
                /// an error.
                ///
                /// The limit is a heuristic to prevent stack overflow for
                /// consumers that do structural induction on an `Ast` using
                /// explicit recursion. This crate never does so (it uses
                /// constant stack space and moves the call stack to the
                /// heap), but other crates may.
                ///
                /// The limit is not checked until the entire Ast is parsed.
                /// Callers who want to bound the heap space used should
                /// therefore limit the length, in bytes, of the concrete
                /// pattern string. That is viable because the parser limits
                /// itself to heap space proportional to the length of the
                /// pattern string.
                ///
                /// A nest limit of `0` returns a nest limit error for most
                /// patterns but not all. For example, it permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which
                /// results in a nest depth of `1`. In general, a nest limit
                /// does not manifest in an obvious way in the concrete
                /// syntax, so it should not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut Self {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
419
// Generate `set_bytes::RegexSetBuilder`, which builds
// `crate::re_set::bytes::RegexSet` and passes `false` to
// `ExecBuilder::only_utf8`.
define_set_builder!(set_bytes, bytes, false);
// Generate `set_unicode::RegexSetBuilder`, which builds
// `crate::re_set::unicode::RegexSet` and passes `true` to
// `ExecBuilder::only_utf8`.
define_set_builder!(set_unicode, unicode, true);
422