/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders in this module wrap a value of this type and write to its
/// fields directly; the `Default` implementation supplies the starting values.
#[derive(Clone, Debug)]
pub struct RegexOptions {
    /// The pattern strings to compile, in the order a builder received them.
    pub pats: Vec<String>,
    /// Approximate size limit of the compiled regular expression. This
    /// roughly corresponds to the number of bytes occupied by a single
    /// compiled program.
    pub size_limit: usize,
    /// Approximate size of the cache used by the DFA. This roughly
    /// corresponds to the number of bytes the DFA will use while searching.
    pub dfa_size_limit: usize,
    /// How deeply nested the abstract syntax tree of a pattern may be before
    /// the parser returns an error.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag, which lets `.` match new lines.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal syntax (for example `\141`) is supported.
    pub octal: bool,
}
17 | |
18 | impl Default for RegexOptions { |
19 | fn default() -> Self { |
20 | RegexOptions { |
21 | pats: vec![], |
22 | size_limit: 10 * (1 << 20), |
23 | dfa_size_limit: 2 * (1 << 20), |
24 | nest_limit: 250, |
25 | case_insensitive: false, |
26 | multi_line: false, |
27 | dot_matches_new_line: false, |
28 | swap_greed: false, |
29 | ignore_whitespace: false, |
30 | unicode: true, |
31 | octal: false, |
32 | } |
33 | } |
34 | } |
35 | |
/// Generates a public module called `$name` that holds a `RegexBuilder` for
/// the `Regex` type found in `crate::$regex_mod`.
///
/// The `$only_utf8` expression is passed to `ExecBuilder::only_utf8` each time
/// the generated builder's `build` method is called.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::$regex_mod::Regex;

            /// A configurable builder for a regular expression.
            ///
            /// A builder can be used to configure how the regex is built, for
            /// example, by setting the default flags (which can be overridden
            /// in the expression itself) or setting various limits.
            ///
            /// The builder is `Clone`, so a configured builder can be copied
            /// and each copy adjusted independently.
            #[derive(Clone, Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Create a new regular expression builder with the given
                /// pattern.
                ///
                /// All other options start at their `RegexOptions::default()`
                /// values. If the pattern is invalid, then an error will be
                /// returned when `build` is called.
                pub fn new(pattern: &str) -> RegexBuilder {
                    RegexBuilder(RegexOptions {
                        pats: vec![pattern.to_owned()],
                        ..RegexOptions::default()
                    })
                }

                /// Compile the regular expression using the options set so
                /// far.
                ///
                /// The builder is only borrowed: its options are cloned, so
                /// the same builder can be adjusted and built again.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will
                /// produce the pattern given to `new` verbatim. Notably, it
                /// will not incorporate any of the flags set on this builder.
                ///
                /// # Errors
                ///
                /// Returns the error reported by the underlying
                /// `ExecBuilder::build` call, for example when the pattern is
                /// invalid.
                pub fn build(&self) -> Result<Regex, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(Regex::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern will match both upper
                /// case and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, wherein `.`
                /// matches anything when `s` is set and matches anything
                /// except for new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is
                /// disabled and means "any valid UTF-8 encoding of any Unicode
                /// scalar value" when Unicode is enabled.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// shortest match) and `a*?` is greedy (tries to find longest
                /// match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will
                /// be ignored between expressions of the pattern, and `#` can
                /// be used to start a comment until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by
                /// a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser implementation
                /// will limit itself to heap space proportional to the length
                /// of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in an
                /// obvious way in the concrete syntax, therefore, it should
                /// not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
230 | |
231 | define_builder!(bytes, re_bytes, false); |
232 | define_builder!(unicode, re_unicode, true); |
233 | |
/// Generates a public module called `$name` that holds a `RegexSetBuilder` for
/// the `RegexSet` type found in `crate::re_set::$regex_mod`.
///
/// The `$only_utf8` expression is passed to `ExecBuilder::only_utf8` each time
/// the generated builder's `build` method is called.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use crate::error::Error;
            use crate::exec::ExecBuilder;

            use crate::re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built,
            /// for example, by setting the default flags (which can be
            /// overridden in the expression itself) or setting various limits.
            ///
            /// The builder is `Clone`, so a configured builder can be copied
            /// and each copy adjusted independently.
            #[derive(Clone, Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given
                /// patterns.
                ///
                /// The patterns are stored in iteration order and all other
                /// options start at their `RegexOptions::default()` values.
                /// If any pattern is invalid, then an error will be returned
                /// when `build` is called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                where
                    S: AsRef<str>,
                    I: IntoIterator<Item = S>,
                {
                    let pats = patterns
                        .into_iter()
                        .map(|pat| pat.as_ref().to_owned())
                        .collect();
                    RegexSetBuilder(RegexOptions {
                        pats: pats,
                        ..RegexOptions::default()
                    })
                }

                /// Compile the regular expressions into a set using the
                /// options set so far.
                ///
                /// The builder is only borrowed: its options are cloned, so
                /// the same builder can be adjusted and built again.
                ///
                /// # Errors
                ///
                /// Returns the error reported by the underlying
                /// `ExecBuilder::build` call, for example when a pattern is
                /// invalid.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the patterns will match both upper
                /// case and lower case variants.
                pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$`
                /// matches the end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, wherein `.`
                /// matches anything when `s` is set and matches anything
                /// except for new line when it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" for
                /// `regex::bytes::RegexSet` expressions and means "any Unicode
                /// scalar value" for `regex::RegexSet` expressions.
                pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find
                /// shortest match) and `a*?` is greedy (tries to find longest
                /// match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will
                /// be ignored between expressions of the pattern, and `#` can
                /// be used to start a comment until the next new line.
                pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such
                /// as `\w` only match ASCII word characters instead of all
                /// Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode
                /// codepoints in a regular expression. For example, `a`,
                /// `\x61`, `\u0061` and `\141` are all equivalent regular
                /// expressions, where the last example shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a
                /// problem, it does make good error messages harder. That is,
                /// in PCRE based regex engines, syntax like `\0` invokes a
                /// backreference, which is explicitly unsupported in Rust's
                /// regex engine. However, many users expect it to be
                /// supported. Therefore, when octal support is disabled, the
                /// error message will explicitly mention that backreferences
                /// aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular
                /// expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by
                /// a single compiled program. If the program exceeds this
                /// number, then a compilation error is returned.
                pub fn size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the
                /// DFA will use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to
                /// set a global limit. In particular, if a regex is used from
                /// multiple threads simultaneously, then each thread may use
                /// up to the number of bytes specified here.
                pub fn dfa_size_limit(&mut self, limit: usize) -> &mut RegexSetBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax
                /// tree is allowed to be. If the AST exceeds the given limit
                /// (e.g., with too many nested groups), then an error is
                /// returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to
                /// prevent stack overflow for consumers that do structural
                /// induction on an `Ast` using explicit recursion. While this
                /// crate never does this (instead using constant stack space
                /// and moving the call stack to the heap), other crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed.
                /// Therefore, if callers want to put a limit on the amount of
                /// heap space used, then they should impose a limit on the
                /// length, in bytes, of the concrete pattern string. In
                /// particular, this is viable since this parser implementation
                /// will limit itself to heap space proportional to the length
                /// of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit
                /// error for most patterns but not all. For example, a nest
                /// limit of `0` permits `a` but not `ab`, since `ab` requires
                /// a concatenation, which results in a nest depth of `1`. In
                /// general, a nest limit is not something that manifests in an
                /// obvious way in the concrete syntax, therefore, it should
                /// not be used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
419 | |
420 | define_set_builder!(set_bytes, bytes, false); |
421 | define_set_builder!(set_unicode, unicode, true); |
422 | |