1 | #![allow (warnings)] |
2 | |
3 | // This module defines an internal builder that encapsulates all interaction |
4 | // with meta::Regex construction, and then 4 public API builders that wrap |
5 | // around it. The docs are essentially repeated on each of the 4 public |
6 | // builders, with tweaks to the examples as needed. |
7 | // |
8 | // The reason why there are so many builders is partially because of a misstep |
9 | // in the initial API design: the builder constructor takes in the pattern |
10 | // strings instead of using the `build` method to accept the pattern strings. |
11 | // This means `new` has a different signature for each builder. It probably |
12 | // would have been nicer to use one builder with `fn new()`, and then add |
13 | // `build(pat)` and `build_many(pats)` constructors. |
14 | // |
15 | // The other reason is because I think the `bytes` module should probably |
16 | // have its own builder type. That way, it is completely isolated from the |
17 | // top-level API. |
18 | // |
19 | // If I could do it again, I'd probably have a `regex::Builder` and a |
20 | // `regex::bytes::Builder`. Each would have `build` and `build_set` (or |
21 | // `build_many`) methods for constructing a single pattern `Regex` and a |
22 | // multi-pattern `RegexSet`, respectively. |
23 | |
24 | use alloc::{ |
25 | string::{String, ToString}, |
26 | sync::Arc, |
27 | vec, |
28 | vec::Vec, |
29 | }; |
30 | |
31 | use regex_automata::{ |
32 | meta, nfa::thompson::WhichCaptures, util::syntax, MatchKind, |
33 | }; |
34 | |
35 | use crate::error::Error; |
36 | |
37 | /// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a |
38 | /// `bytes::RegexSet`. |
39 | /// |
40 | /// This is essentially the implementation of the four different builder types |
41 | /// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder` |
42 | /// and `bytes::RegexSetBuilder`. |
43 | #[derive(Clone, Debug)] |
44 | struct Builder { |
45 | pats: Vec<String>, |
46 | metac: meta::Config, |
47 | syntaxc: syntax::Config, |
48 | } |
49 | |
50 | impl Default for Builder { |
51 | fn default() -> Builder { |
52 | let metac = meta::Config::new() |
53 | .nfa_size_limit(Some(10 * (1 << 20))) |
54 | .hybrid_cache_capacity(2 * (1 << 20)); |
55 | Builder { pats: vec![], metac, syntaxc: syntax::Config::default() } |
56 | } |
57 | } |
58 | |
59 | impl Builder { |
60 | fn new<I, S>(patterns: I) -> Builder |
61 | where |
62 | S: AsRef<str>, |
63 | I: IntoIterator<Item = S>, |
64 | { |
65 | let mut b = Builder::default(); |
66 | b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string())); |
67 | b |
68 | } |
69 | |
70 | fn build_one_string(&self) -> Result<crate::Regex, Error> { |
71 | assert_eq!(1, self.pats.len()); |
72 | let metac = self |
73 | .metac |
74 | .clone() |
75 | .match_kind(MatchKind::LeftmostFirst) |
76 | .utf8_empty(true); |
77 | let syntaxc = self.syntaxc.clone().utf8(true); |
78 | let pattern = Arc::from(self.pats[0].as_str()); |
79 | meta::Builder::new() |
80 | .configure(metac) |
81 | .syntax(syntaxc) |
82 | .build(&pattern) |
83 | .map(|meta| crate::Regex { meta, pattern }) |
84 | .map_err(Error::from_meta_build_error) |
85 | } |
86 | |
87 | fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> { |
88 | assert_eq!(1, self.pats.len()); |
89 | let metac = self |
90 | .metac |
91 | .clone() |
92 | .match_kind(MatchKind::LeftmostFirst) |
93 | .utf8_empty(false); |
94 | let syntaxc = self.syntaxc.clone().utf8(false); |
95 | let pattern = Arc::from(self.pats[0].as_str()); |
96 | meta::Builder::new() |
97 | .configure(metac) |
98 | .syntax(syntaxc) |
99 | .build(&pattern) |
100 | .map(|meta| crate::bytes::Regex { meta, pattern }) |
101 | .map_err(Error::from_meta_build_error) |
102 | } |
103 | |
104 | fn build_many_string(&self) -> Result<crate::RegexSet, Error> { |
105 | let metac = self |
106 | .metac |
107 | .clone() |
108 | .match_kind(MatchKind::All) |
109 | .utf8_empty(true) |
110 | .which_captures(WhichCaptures::None); |
111 | let syntaxc = self.syntaxc.clone().utf8(true); |
112 | let patterns = Arc::from(self.pats.as_slice()); |
113 | meta::Builder::new() |
114 | .configure(metac) |
115 | .syntax(syntaxc) |
116 | .build_many(&patterns) |
117 | .map(|meta| crate::RegexSet { meta, patterns }) |
118 | .map_err(Error::from_meta_build_error) |
119 | } |
120 | |
121 | fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> { |
122 | let metac = self |
123 | .metac |
124 | .clone() |
125 | .match_kind(MatchKind::All) |
126 | .utf8_empty(false) |
127 | .which_captures(WhichCaptures::None); |
128 | let syntaxc = self.syntaxc.clone().utf8(false); |
129 | let patterns = Arc::from(self.pats.as_slice()); |
130 | meta::Builder::new() |
131 | .configure(metac) |
132 | .syntax(syntaxc) |
133 | .build_many(&patterns) |
134 | .map(|meta| crate::bytes::RegexSet { meta, patterns }) |
135 | .map_err(Error::from_meta_build_error) |
136 | } |
137 | |
138 | fn case_insensitive(&mut self, yes: bool) -> &mut Builder { |
139 | self.syntaxc = self.syntaxc.case_insensitive(yes); |
140 | self |
141 | } |
142 | |
143 | fn multi_line(&mut self, yes: bool) -> &mut Builder { |
144 | self.syntaxc = self.syntaxc.multi_line(yes); |
145 | self |
146 | } |
147 | |
148 | fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder { |
149 | self.syntaxc = self.syntaxc.dot_matches_new_line(yes); |
150 | self |
151 | } |
152 | |
153 | fn crlf(&mut self, yes: bool) -> &mut Builder { |
154 | self.syntaxc = self.syntaxc.crlf(yes); |
155 | self |
156 | } |
157 | |
158 | fn line_terminator(&mut self, byte: u8) -> &mut Builder { |
159 | self.metac = self.metac.clone().line_terminator(byte); |
160 | self.syntaxc = self.syntaxc.line_terminator(byte); |
161 | self |
162 | } |
163 | |
164 | fn swap_greed(&mut self, yes: bool) -> &mut Builder { |
165 | self.syntaxc = self.syntaxc.swap_greed(yes); |
166 | self |
167 | } |
168 | |
169 | fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder { |
170 | self.syntaxc = self.syntaxc.ignore_whitespace(yes); |
171 | self |
172 | } |
173 | |
174 | fn unicode(&mut self, yes: bool) -> &mut Builder { |
175 | self.syntaxc = self.syntaxc.unicode(yes); |
176 | self |
177 | } |
178 | |
179 | fn octal(&mut self, yes: bool) -> &mut Builder { |
180 | self.syntaxc = self.syntaxc.octal(yes); |
181 | self |
182 | } |
183 | |
184 | fn size_limit(&mut self, limit: usize) -> &mut Builder { |
185 | self.metac = self.metac.clone().nfa_size_limit(Some(limit)); |
186 | self |
187 | } |
188 | |
189 | fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder { |
190 | self.metac = self.metac.clone().hybrid_cache_capacity(limit); |
191 | self |
192 | } |
193 | |
194 | fn nest_limit(&mut self, limit: u32) -> &mut Builder { |
195 | self.syntaxc = self.syntaxc.nest_limit(limit); |
196 | self |
197 | } |
198 | } |
199 | |
200 | pub(crate) mod string { |
201 | use crate::{error::Error, Regex, RegexSet}; |
202 | |
203 | use super::Builder; |
204 | |
205 | /// A configurable builder for a [`Regex`]. |
206 | /// |
207 | /// This builder can be used to programmatically set flags such as `i` |
208 | /// (case insensitive) and `x` (for verbose mode). This builder can also be |
209 | /// used to configure things like the line terminator and a size limit on |
210 | /// the compiled regular expression. |
211 | #[derive(Clone, Debug)] |
212 | pub struct RegexBuilder { |
213 | builder: Builder, |
214 | } |
215 | |
216 | impl RegexBuilder { |
217 | /// Create a new builder with a default configuration for the given |
218 | /// pattern. |
219 | /// |
220 | /// If the pattern is invalid or exceeds the configured size limits, |
221 | /// then an error will be returned when [`RegexBuilder::build`] is |
222 | /// called. |
223 | pub fn new(pattern: &str) -> RegexBuilder { |
224 | RegexBuilder { builder: Builder::new([pattern]) } |
225 | } |
226 | |
227 | /// Compiles the pattern given to `RegexBuilder::new` with the |
228 | /// configuration set on this builder. |
229 | /// |
230 | /// If the pattern isn't a valid regex or if a configured size limit |
231 | /// was exceeded, then an error is returned. |
232 | pub fn build(&self) -> Result<Regex, Error> { |
233 | self.builder.build_one_string() |
234 | } |
235 | |
236 | /// This configures Unicode mode for the entire pattern. |
237 | /// |
238 | /// Enabling Unicode mode does a number of things: |
239 | /// |
240 | /// * Most fundamentally, it causes the fundamental atom of matching |
241 | /// to be a single codepoint. When Unicode mode is disabled, it's a |
242 | /// single byte. For example, when Unicode mode is enabled, `.` will |
243 | /// match `💩` once, whereas it will match 4 times when Unicode mode |
244 | /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) |
245 | /// * Case insensitive matching uses Unicode simple case folding rules. |
246 | /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are |
247 | /// available. |
248 | /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and |
249 | /// `\d`. |
250 | /// * The word boundary assertions, `\b` and `\B`, use the Unicode |
251 | /// definition of a word character. |
252 | /// |
253 | /// Note that if Unicode mode is disabled, then the regex will fail to |
254 | /// compile if it could match invalid UTF-8. For example, when Unicode |
255 | /// mode is disabled, then since `.` matches any byte (except for |
256 | /// `\n`), then it can match invalid UTF-8 and thus building a regex |
257 | /// from it will fail. Another example is `\w` and `\W`. Since `\w` can |
258 | /// only match ASCII bytes when Unicode mode is disabled, it's allowed. |
259 | /// But `\W` can match more than ASCII bytes, including invalid UTF-8, |
260 | /// and so it is not allowed. This restriction can be lifted only by |
261 | /// using a [`bytes::Regex`](crate::bytes::Regex). |
262 | /// |
263 | /// For more details on the Unicode support in this crate, see the |
264 | /// [Unicode section](crate#unicode) in this crate's top-level |
265 | /// documentation. |
266 | /// |
267 | /// The default for this is `true`. |
268 | /// |
269 | /// # Example |
270 | /// |
271 | /// ``` |
272 | /// use regex::RegexBuilder; |
273 | /// |
274 | /// let re = RegexBuilder::new(r"\w" ) |
275 | /// .unicode(false) |
276 | /// .build() |
277 | /// .unwrap(); |
278 | /// // Normally greek letters would be included in \w, but since |
279 | /// // Unicode mode is disabled, it only matches ASCII letters. |
280 | /// assert!(!re.is_match("δ" )); |
281 | /// |
282 | /// let re = RegexBuilder::new(r"s" ) |
283 | /// .case_insensitive(true) |
284 | /// .unicode(false) |
285 | /// .build() |
286 | /// .unwrap(); |
287 | /// // Normally 'ſ' is included when searching for 's' case |
288 | /// // insensitively due to Unicode's simple case folding rules. But |
289 | /// // when Unicode mode is disabled, only ASCII case insensitive rules |
290 | /// // are used. |
291 | /// assert!(!re.is_match("ſ")); |
292 | /// ``` |
293 | pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { |
294 | self.builder.unicode(yes); |
295 | self |
296 | } |
297 | |
298 | /// This configures whether to enable case insensitive matching for the |
299 | /// entire pattern. |
300 | /// |
301 | /// This setting can also be configured using the inline flag `i` |
302 | /// in the pattern. For example, `(?i:foo)` matches `foo` case |
303 | /// insensitively while `(?-i:foo)` matches `foo` case sensitively. |
304 | /// |
305 | /// The default for this is `false`. |
306 | /// |
307 | /// # Example |
308 | /// |
309 | /// ``` |
310 | /// use regex::RegexBuilder; |
311 | /// |
312 | /// let re = RegexBuilder::new(r"foo(?-i:bar)quux" ) |
313 | /// .case_insensitive(true) |
314 | /// .build() |
315 | /// .unwrap(); |
316 | /// assert!(re.is_match("FoObarQuUx" )); |
317 | /// // Even though case insensitive matching is enabled in the builder, |
318 | /// // it can be locally disabled within the pattern. In this case, |
319 | /// // `bar` is matched case sensitively. |
320 | /// assert!(!re.is_match("fooBARquux" )); |
321 | /// ``` |
322 | pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { |
323 | self.builder.case_insensitive(yes); |
324 | self |
325 | } |
326 | |
327 | /// This configures multi-line mode for the entire pattern. |
328 | /// |
329 | /// Enabling multi-line mode changes the behavior of the `^` and `$` |
330 | /// anchor assertions. Instead of only matching at the beginning and |
331 | /// end of a haystack, respectively, multi-line mode causes them to |
332 | /// match at the beginning and end of a line *in addition* to the |
333 | /// beginning and end of a haystack. More precisely, `^` will match at |
334 | /// the position immediately following a `\n` and `$` will match at the |
335 | /// position immediately preceding a `\n`. |
336 | /// |
337 | /// The behavior of this option can be impacted by other settings too: |
338 | /// |
339 | /// * The [`RegexBuilder::line_terminator`] option changes `\n` above |
340 | /// to any ASCII byte. |
341 | /// * The [`RegexBuilder::crlf`] option changes the line terminator to |
342 | /// be either `\r` or `\n`, but never at the position between a `\r` |
343 | /// and `\n`. |
344 | /// |
345 | /// This setting can also be configured using the inline flag `m` in |
346 | /// the pattern. |
347 | /// |
348 | /// The default for this is `false`. |
349 | /// |
350 | /// # Example |
351 | /// |
352 | /// ``` |
353 | /// use regex::RegexBuilder; |
354 | /// |
355 | /// let re = RegexBuilder::new(r"^foo$" ) |
356 | /// .multi_line(true) |
357 | /// .build() |
358 | /// .unwrap(); |
359 | /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range())); |
360 | /// ``` |
361 | pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { |
362 | self.builder.multi_line(yes); |
363 | self |
364 | } |
365 | |
366 | /// This configures dot-matches-new-line mode for the entire pattern. |
367 | /// |
368 | /// Perhaps surprisingly, the default behavior for `.` is not to match |
369 | /// any character, but rather, to match any character except for the |
370 | /// line terminator (which is `\n` by default). When this mode is |
371 | /// enabled, the behavior changes such that `.` truly matches any |
372 | /// character. |
373 | /// |
374 | /// This setting can also be configured using the inline flag `s` in |
375 | /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent |
376 | /// regexes. |
377 | /// |
378 | /// The default for this is `false`. |
379 | /// |
380 | /// # Example |
381 | /// |
382 | /// ``` |
383 | /// use regex::RegexBuilder; |
384 | /// |
385 | /// let re = RegexBuilder::new(r"foo.bar" ) |
386 | /// .dot_matches_new_line(true) |
387 | /// .build() |
388 | /// .unwrap(); |
389 | /// let hay = "foo\nbar"; |
390 | /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str())); |
391 | /// ``` |
392 | pub fn dot_matches_new_line( |
393 | &mut self, |
394 | yes: bool, |
395 | ) -> &mut RegexBuilder { |
396 | self.builder.dot_matches_new_line(yes); |
397 | self |
398 | } |
399 | |
400 | /// This configures CRLF mode for the entire pattern. |
401 | /// |
402 | /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for |
403 | /// short) and `\n` ("line feed" or LF for short) are treated as line |
404 | /// terminators. This results in the following: |
405 | /// |
406 | /// * Unless dot-matches-new-line mode is enabled, `.` will now match |
407 | /// any character except for `\n` and `\r`. |
408 | /// * When multi-line mode is enabled, `^` will match immediately |
409 | /// following a `\n` or a `\r`. Similarly, `$` will match immediately |
410 | /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match |
411 | /// between `\r` and `\n`. |
412 | /// |
413 | /// This setting can also be configured using the inline flag `R` in |
414 | /// the pattern. |
415 | /// |
416 | /// The default for this is `false`. |
417 | /// |
418 | /// # Example |
419 | /// |
420 | /// ``` |
421 | /// use regex::RegexBuilder; |
422 | /// |
423 | /// let re = RegexBuilder::new(r"^foo$" ) |
424 | /// .multi_line(true) |
425 | /// .crlf(true) |
426 | /// .build() |
427 | /// .unwrap(); |
428 | /// let hay = "\r\nfoo\r\n"; |
429 | /// // If CRLF mode weren't enabled here, then '$' wouldn't match |
430 | /// // immediately after 'foo', and thus no match would be found. |
431 | /// assert_eq!(Some("foo" ), re.find(hay).map(|m| m.as_str())); |
432 | /// ``` |
433 | /// |
434 | /// This example demonstrates that `^` will never match at a position |
435 | /// between `\r` and `\n`. (`$` will similarly not match between a `\r` |
436 | /// and a `\n`.) |
437 | /// |
438 | /// ``` |
439 | /// use regex::RegexBuilder; |
440 | /// |
441 | /// let re = RegexBuilder::new(r"^" ) |
442 | /// .multi_line(true) |
443 | /// .crlf(true) |
444 | /// .build() |
445 | /// .unwrap(); |
446 | /// let hay = "\r\n\r\n"; |
447 | /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); |
448 | /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); |
449 | /// ``` |
450 | pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { |
451 | self.builder.crlf(yes); |
452 | self |
453 | } |
454 | |
455 | /// Configures the line terminator to be used by the regex. |
456 | /// |
457 | /// The line terminator is relevant in two ways for a particular regex: |
458 | /// |
459 | /// * When dot-matches-new-line mode is *not* enabled (the default), |
460 | /// then `.` will match any character except for the configured line |
461 | /// terminator. |
462 | /// * When multi-line mode is enabled (not the default), then `^` and |
463 | /// `$` will match immediately after and before, respectively, a line |
464 | /// terminator. |
465 | /// |
466 | /// In both cases, if CRLF mode is enabled in a particular context, |
467 | /// then it takes precedence over any configured line terminator. |
468 | /// |
469 | /// This option cannot be configured from within the pattern. |
470 | /// |
471 | /// The default line terminator is `\n`. |
472 | /// |
473 | /// # Example |
474 | /// |
475 | /// This shows how to treat the NUL byte as a line terminator. This can |
476 | /// be a useful heuristic when searching binary data. |
477 | /// |
478 | /// ``` |
479 | /// use regex::RegexBuilder; |
480 | /// |
481 | /// let re = RegexBuilder::new(r"^foo$" ) |
482 | /// .multi_line(true) |
483 | /// .line_terminator(b'\x00') |
484 | /// .build() |
485 | /// .unwrap(); |
486 | /// let hay = "\x00foo\x00"; |
487 | /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range())); |
488 | /// ``` |
489 | /// |
490 | /// This example shows that the behavior of `.` is impacted by this |
491 | /// setting as well: |
492 | /// |
493 | /// ``` |
494 | /// use regex::RegexBuilder; |
495 | /// |
496 | /// let re = RegexBuilder::new(r"." ) |
497 | /// .line_terminator(b'\x00') |
498 | /// .build() |
499 | /// .unwrap(); |
500 | /// assert!(re.is_match("\n")); |
501 | /// assert!(!re.is_match("\x00")); |
502 | /// ``` |
503 | /// |
504 | /// This shows that building a regex will fail if the byte given |
505 | /// is not ASCII and the pattern could result in matching invalid |
506 | /// UTF-8. This is because any singular non-ASCII byte is not valid |
507 | /// UTF-8, and it is not permitted for a [`Regex`] to match invalid |
508 | /// UTF-8. (It is permissible to use a non-ASCII byte when building a |
509 | /// [`bytes::Regex`](crate::bytes::Regex).) |
510 | /// |
511 | /// ``` |
512 | /// use regex::RegexBuilder; |
513 | /// |
514 | /// assert!(RegexBuilder::new(r"." ).line_terminator(0x80).build().is_err()); |
515 | /// // Note that using a non-ASCII byte isn't enough on its own to |
516 | /// // cause regex compilation to fail. You actually have to make use |
517 | /// // of it in the regex in a way that leads to matching invalid |
518 | /// // UTF-8. If you don't, then regex compilation will succeed! |
519 | /// assert!(RegexBuilder::new(r"a" ).line_terminator(0x80).build().is_ok()); |
520 | /// ``` |
521 | pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder { |
522 | self.builder.line_terminator(byte); |
523 | self |
524 | } |
525 | |
526 | /// This configures swap-greed mode for the entire pattern. |
527 | /// |
528 | /// When swap-greed mode is enabled, patterns like `a+` will become |
529 | /// non-greedy and patterns like `a+?` will become greedy. In other |
530 | /// words, the meanings of `a+` and `a+?` are switched. |
531 | /// |
532 | /// This setting can also be configured using the inline flag `U` in |
533 | /// the pattern. |
534 | /// |
535 | /// The default for this is `false`. |
536 | /// |
537 | /// # Example |
538 | /// |
539 | /// ``` |
540 | /// use regex::RegexBuilder; |
541 | /// |
542 | /// let re = RegexBuilder::new(r"a+" ) |
543 | /// .swap_greed(true) |
544 | /// .build() |
545 | /// .unwrap(); |
546 | /// assert_eq!(Some("a" ), re.find("aaa" ).map(|m| m.as_str())); |
547 | /// ``` |
548 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { |
549 | self.builder.swap_greed(yes); |
550 | self |
551 | } |
552 | |
553 | /// This configures verbose mode for the entire pattern. |
554 | /// |
555 | /// When enabled, whitespace will be treated as insignificant in the |
556 | /// pattern and `#` can be used to start a comment until the next new |
557 | /// line. |
558 | /// |
559 | /// Normally, in most places in a pattern, whitespace is treated |
560 | /// literally. For example ` +` will match one or more ASCII whitespace |
561 | /// characters. |
562 | /// |
563 | /// When verbose mode is enabled, `\#` can be used to match a literal |
564 | /// `#` and `\ ` can be used to match a literal ASCII whitespace |
565 | /// character. |
566 | /// |
567 | /// Verbose mode is useful for permitting regexes to be formatted and |
568 | /// broken up more nicely. This may make them more easily readable. |
569 | /// |
570 | /// This setting can also be configured using the inline flag `x` in |
571 | /// the pattern. |
572 | /// |
573 | /// The default for this is `false`. |
574 | /// |
575 | /// # Example |
576 | /// |
577 | /// ``` |
578 | /// use regex::RegexBuilder; |
579 | /// |
580 | /// let pat = r" |
581 | /// \b |
582 | /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter |
583 | /// [\s--\n]+ # whitespace should separate names |
584 | /// (?: # middle name can be an initial! |
585 | /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) |
586 | /// [\s--\n]+ |
587 | /// )? |
588 | /// (?<last>\p{Uppercase}\w*) |
589 | /// \b |
590 | /// " ; |
591 | /// let re = RegexBuilder::new(pat) |
592 | /// .ignore_whitespace(true) |
593 | /// .build() |
594 | /// .unwrap(); |
595 | /// |
596 | /// let caps = re.captures("Harry Potter" ).unwrap(); |
597 | /// assert_eq!("Harry" , &caps["first" ]); |
598 | /// assert_eq!("Potter" , &caps["last" ]); |
599 | /// |
600 | /// let caps = re.captures("Harry J. Potter" ).unwrap(); |
601 | /// assert_eq!("Harry" , &caps["first" ]); |
602 | /// // Since a middle name/initial isn't required for an overall match, |
603 | /// // we can't assume that 'initial' or 'middle' will be populated! |
604 | /// assert_eq!(Some("J" ), caps.name("initial" ).map(|m| m.as_str())); |
605 | /// assert_eq!(None, caps.name("middle" ).map(|m| m.as_str())); |
606 | /// assert_eq!("Potter" , &caps["last" ]); |
607 | /// |
608 | /// let caps = re.captures("Harry James Potter" ).unwrap(); |
609 | /// assert_eq!("Harry" , &caps["first" ]); |
610 | /// // Since a middle name/initial isn't required for an overall match, |
611 | /// // we can't assume that 'initial' or 'middle' will be populated! |
612 | /// assert_eq!(None, caps.name("initial" ).map(|m| m.as_str())); |
613 | /// assert_eq!(Some("James" ), caps.name("middle" ).map(|m| m.as_str())); |
614 | /// assert_eq!("Potter" , &caps["last" ]); |
615 | /// ``` |
616 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { |
617 | self.builder.ignore_whitespace(yes); |
618 | self |
619 | } |
620 | |
621 | /// This configures octal mode for the entire pattern. |
622 | /// |
623 | /// Octal syntax is a little-known way of uttering Unicode codepoints |
624 | /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all |
625 | /// equivalent patterns, where the last example shows octal syntax. |
626 | /// |
627 | /// While supporting octal syntax isn't in and of itself a problem, |
628 | /// it does make good error messages harder. That is, in PCRE based |
629 | /// regex engines, syntax like `\1` invokes a backreference, which is |
630 | /// explicitly unsupported in this library. However, many users expect |
631 | /// backreferences to be supported. Therefore, when octal support |
632 | /// is disabled, the error message will explicitly mention that |
633 | /// backreferences aren't supported. |
634 | /// |
635 | /// The default for this is `false`. |
636 | /// |
637 | /// # Example |
638 | /// |
639 | /// ``` |
640 | /// use regex::RegexBuilder; |
641 | /// |
642 | /// // Normally this pattern would not compile, with an error message |
643 | /// // about backreferences not being supported. But with octal mode |
644 | /// // enabled, octal escape sequences work. |
645 | /// let re = RegexBuilder::new(r"\141" ) |
646 | /// .octal(true) |
647 | /// .build() |
648 | /// .unwrap(); |
649 | /// assert!(re.is_match("a" )); |
650 | /// ``` |
651 | pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { |
652 | self.builder.octal(yes); |
653 | self |
654 | } |
655 | |
656 | /// Sets the approximate size limit, in bytes, of the compiled regex. |
657 | /// |
658 | /// This roughly corresponds to the amount of heap memory, in |
659 | /// bytes, occupied by a single regex. If the regex would otherwise |
660 | /// approximately exceed this limit, then compiling that regex will |
661 | /// fail. |
662 | /// |
663 | /// The main utility of a method like this is to avoid compiling |
664 | /// regexes that use an unexpected amount of resources, such as |
665 | /// time and memory. Even if the memory usage of a large regex is |
666 | /// acceptable, its search time may not be. Namely, worst case time |
667 | /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and |
668 | /// `n ~ len(haystack)`. That is, search time depends, in part, on the |
669 | /// size of the compiled regex. This means that putting a limit on the |
670 | /// size of the regex limits how much a regex can impact search time. |
671 | /// |
672 | /// For more information about regex size limits, see the section on |
673 | /// [untrusted inputs](crate#untrusted-input) in the top-level crate |
674 | /// documentation. |
675 | /// |
676 | /// The default for this is some reasonable number that permits most |
677 | /// patterns to compile successfully. |
678 | /// |
679 | /// # Example |
680 | /// |
681 | /// ``` |
682 | /// # if !cfg!(target_pointer_width = "64" ) { return; } // see #1041 |
683 | /// use regex::RegexBuilder; |
684 | /// |
685 | /// // It may surprise you how big some seemingly small patterns can |
686 | /// // be! Since \w is Unicode aware, this generates a regex that can |
687 | /// // match approximately 140,000 distinct codepoints. |
688 | /// assert!(RegexBuilder::new(r"\w" ).size_limit(45_000).build().is_err()); |
689 | /// ``` |
690 | pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
691 | self.builder.size_limit(bytes); |
692 | self |
693 | } |
694 | |
695 | /// Set the approximate capacity, in bytes, of the cache of transitions |
696 | /// used by the lazy DFA. |
697 | /// |
698 | /// While the lazy DFA isn't always used, it tends to be the most |
699 | /// commonly used regex engine in default configurations. It tends to |
700 | /// adopt the performance profile of a fully built DFA, but without the |
701 | /// downside of taking worst case exponential time to build. |
702 | /// |
703 | /// The downside is that it needs to keep a cache of transitions and |
704 | /// states that are built while running a search, and this cache |
705 | /// can fill up. When it fills up, the cache will reset itself. Any |
706 | /// previously generated states and transitions will then need to be |
707 | /// re-generated. If this happens too many times, then this library |
708 | /// will bail out of using the lazy DFA and switch to a different regex |
709 | /// engine. |
710 | /// |
711 | /// If your regex provokes this particular downside of the lazy DFA, |
712 | /// then it may be beneficial to increase its cache capacity. This will |
713 | /// potentially reduce the frequency of cache resetting (ideally to |
714 | /// `0`). While it won't fix all potential performance problems with |
715 | /// the lazy DFA, increasing the cache capacity does fix some. |
716 | /// |
717 | /// There is no easy way to determine, a priori, whether increasing |
718 | /// this cache capacity will help. In general, the larger your regex, |
719 | /// the more cache it's likely to use. But that isn't an ironclad rule. |
720 | /// For example, a regex like `[01]*1[01]{N}` would normally produce a |
721 | /// fully built DFA that is exponential in size with respect to `N`. |
722 | /// The lazy DFA will prevent exponential space blow-up, but its cache |
723 | /// is likely to fill up, even when it's large and even for smallish |
724 | /// values of `N`. |
725 | /// |
726 | /// If you aren't sure whether this helps or not, it is sensible to |
727 | /// set this to some arbitrarily large number in testing, such as |
728 | /// `usize::MAX`. Namely, this represents the amount of capacity that |
729 | /// *may* be used. It's probably not a good idea to use `usize::MAX` in |
730 | /// production though, since it implies there are no controls on heap |
731 | /// memory used by this library during a search. In effect, set it to |
732 | /// whatever you're willing to allocate for a single regex search. |
733 | pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
734 | self.builder.dfa_size_limit(bytes); |
735 | self |
736 | } |
737 | |
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is
/// allowed to be. If the AST exceeds the given limit (e.g., with too
/// many nested groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an AST using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire AST is parsed.
/// Therefore, if callers want to put a limit on the amount of heap
/// space used, then they should impose a limit on the length, in
/// bytes, of the concrete pattern string. In particular, this is
/// viable since this parser implementation will limit itself to heap
/// space proportional to the length of the pattern string. See also
/// the [untrusted inputs](crate#untrusted-input) section in the
/// top-level crate documentation for more information about this.
///
/// Note that a nest limit of `0` will return a nest limit error for
/// most patterns but not all. For example, a nest limit of `0` permits
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
/// which results in a nest depth of `1`. In general, a nest limit is
/// not something that manifests in an obvious way in the concrete
/// syntax; therefore, it should not be used in a granular way.
///
/// # Example
///
/// ```
/// use regex::RegexBuilder;
///
/// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok());
/// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err());
/// ```
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
    self.builder.nest_limit(limit);
    self
}
778 | } |
779 | |
/// A configurable builder for a [`RegexSet`].
///
/// This builder can be used to programmatically set flags such as
/// `i` (case insensitive) and `x` (for verbose mode). This builder
/// can also be used to configure things like the line terminator
/// and a size limit on the compiled regular expression.
#[derive(Clone, Debug)]
pub struct RegexSetBuilder {
    /// The internal builder that every method on this type delegates to.
    builder: Builder,
}
790 | |
791 | impl RegexSetBuilder { |
792 | /// Create a new builder with a default configuration for the given |
793 | /// patterns. |
794 | /// |
795 | /// If the patterns are invalid or exceed the configured size limits, |
796 | /// then an error will be returned when [`RegexSetBuilder::build`] is |
797 | /// called. |
798 | pub fn new<I, S>(patterns: I) -> RegexSetBuilder |
799 | where |
800 | I: IntoIterator<Item = S>, |
801 | S: AsRef<str>, |
802 | { |
803 | RegexSetBuilder { builder: Builder::new(patterns) } |
804 | } |
805 | |
/// Compiles the patterns given to [`RegexSetBuilder::new`] with the
/// configuration set on this builder.
///
/// If the patterns aren't valid regexes or if a configured size limit
/// was exceeded, then an error is returned.
///
/// This takes `&self`, so the same builder can be used to call `build`
/// more than once.
pub fn build(&self) -> Result<RegexSet, Error> {
    self.builder.build_many_string()
}
814 | |
/// This configures Unicode mode for all of the patterns.
///
/// Enabling Unicode mode does a number of things:
///
/// * Most fundamentally, it causes the fundamental atom of matching
/// to be a single codepoint. When Unicode mode is disabled, it's a
/// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
/// * Case insensitive matching uses Unicode simple case folding rules.
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
/// available.
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
/// `\d`.
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
/// definition of a word character.
///
/// Note that if Unicode mode is disabled, then the regex will fail to
/// compile if it could match invalid UTF-8. For example, when Unicode
/// mode is disabled, then since `.` matches any byte (except for
/// `\n`), then it can match invalid UTF-8 and thus building a regex
/// from it will fail. Another example is `\w` and `\W`. Since `\w` can
/// only match ASCII bytes when Unicode mode is disabled, it's allowed.
/// But `\W` can match more than ASCII bytes, including invalid UTF-8,
/// and so it is not allowed. This restriction can be lifted only by
/// using a [`bytes::RegexSet`](crate::bytes::RegexSet).
///
/// For more details on the Unicode support in this crate, see the
/// [Unicode section](crate#unicode) in this crate's top-level
/// documentation.
///
/// The default for this is `true`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"\w"])
///     .unicode(false)
///     .build()
///     .unwrap();
/// // Normally greek letters would be included in \w, but since
/// // Unicode mode is disabled, it only matches ASCII letters.
/// assert!(!re.is_match("δ"));
///
/// let re = RegexSetBuilder::new([r"s"])
///     .case_insensitive(true)
///     .unicode(false)
///     .build()
///     .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ"));
/// ```
pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.unicode(yes);
    self
}
876 | |
/// This configures whether to enable case insensitive matching for all
/// of the patterns.
///
/// This setting can also be configured using the inline flag `i`
/// in the pattern. For example, `(?i:foo)` matches `foo` case
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"])
///     .case_insensitive(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match("FoObarQuUx"));
/// // Even though case insensitive matching is enabled in the builder,
/// // it can be locally disabled within the pattern. In this case,
/// // `bar` is matched case sensitively.
/// assert!(!re.is_match("fooBARquux"));
/// ```
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.case_insensitive(yes);
    self
}
905 | |
/// This configures multi-line mode for all of the patterns.
///
/// Enabling multi-line mode changes the behavior of the `^` and `$`
/// anchor assertions. Instead of only matching at the beginning and
/// end of a haystack, respectively, multi-line mode causes them to
/// match at the beginning and end of a line *in addition* to the
/// beginning and end of a haystack. More precisely, `^` will match at
/// the position immediately following a `\n` and `$` will match at the
/// position immediately preceding a `\n`.
///
/// The behavior of this option can be impacted by other settings too:
///
/// * The [`RegexSetBuilder::line_terminator`] option changes `\n`
/// above to any ASCII byte.
/// * The [`RegexSetBuilder::crlf`] option changes the line terminator
/// to be either `\r` or `\n`, but never at the position between a `\r`
/// and `\n`.
///
/// This setting can also be configured using the inline flag `m` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
///     .multi_line(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match("\nfoo\n"));
/// ```
pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.multi_line(yes);
    self
}
944 | |
/// This configures dot-matches-new-line mode for all of the patterns.
///
/// Perhaps surprisingly, the default behavior for `.` is not to match
/// any character, but rather, to match any character except for the
/// line terminator (which is `\n` by default). When this mode is
/// enabled, the behavior changes such that `.` truly matches any
/// character.
///
/// This setting can also be configured using the inline flag `s` in
/// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
/// regexes.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"foo.bar"])
///     .dot_matches_new_line(true)
///     .build()
///     .unwrap();
/// let hay = "foo\nbar";
/// assert!(re.is_match(hay));
/// ```
pub fn dot_matches_new_line(
    &mut self,
    yes: bool,
) -> &mut RegexSetBuilder {
    self.builder.dot_matches_new_line(yes);
    self
}
978 | |
/// This configures CRLF mode for all of the patterns.
///
/// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
/// short) and `\n` ("line feed" or LF for short) are treated as line
/// terminators. This results in the following:
///
/// * Unless dot-matches-new-line mode is enabled, `.` will now match
/// any character except for `\n` and `\r`.
/// * When multi-line mode is enabled, `^` will match immediately
/// following a `\n` or a `\r`. Similarly, `$` will match immediately
/// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
/// between `\r` and `\n`.
///
/// This setting can also be configured using the inline flag `R` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
///     .multi_line(true)
///     .crlf(true)
///     .build()
///     .unwrap();
/// let hay = "\r\nfoo\r\n";
/// // If CRLF mode weren't enabled here, then '$' wouldn't match
/// // immediately after 'foo', and thus no match would be found.
/// assert!(re.is_match(hay));
/// ```
///
/// This example demonstrates that `^` will never match at a position
/// between `\r` and `\n`. (`$` will similarly not match between a `\r`
/// and a `\n`.)
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^\n"])
///     .multi_line(true)
///     .crlf(true)
///     .build()
///     .unwrap();
/// assert!(!re.is_match("\r\n"));
/// ```
pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.crlf(yes);
    self
}
1031 | |
/// Configures the line terminator to be used by the regex.
///
/// The line terminator is relevant in two ways for a particular regex:
///
/// * When dot-matches-new-line mode is *not* enabled (the default),
/// then `.` will match any character except for the configured line
/// terminator.
/// * When multi-line mode is enabled (not the default), then `^` and
/// `$` will match immediately after and before, respectively, a line
/// terminator.
///
/// In both cases, if CRLF mode is enabled in a particular context,
/// then it takes precedence over any configured line terminator.
///
/// This option cannot be configured from within the pattern.
///
/// The default line terminator is `\n`.
///
/// # Example
///
/// This shows how to treat the NUL byte as a line terminator. This can
/// be a useful heuristic when searching binary data.
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"^foo$"])
///     .multi_line(true)
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// let hay = "\x00foo\x00";
/// assert!(re.is_match(hay));
/// ```
///
/// This example shows that the behavior of `.` is impacted by this
/// setting as well:
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let re = RegexSetBuilder::new([r"."])
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// assert!(re.is_match("\n"));
/// assert!(!re.is_match("\x00"));
/// ```
///
/// This shows that building a regex will fail if the byte given
/// is not ASCII and the pattern could result in matching invalid
/// UTF-8. This is because any singular non-ASCII byte is not valid
/// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid
/// UTF-8. (It is permissible to use a non-ASCII byte when building a
/// [`bytes::RegexSet`](crate::bytes::RegexSet).)
///
/// ```
/// use regex::RegexSetBuilder;
///
/// assert!(
///     RegexSetBuilder::new([r"."])
///         .line_terminator(0x80)
///         .build()
///         .is_err()
/// );
/// // Note that using a non-ASCII byte isn't enough on its own to
/// // cause regex compilation to fail. You actually have to make use
/// // of it in the regex in a way that leads to matching invalid
/// // UTF-8. If you don't, then regex compilation will succeed!
/// assert!(
///     RegexSetBuilder::new([r"a"])
///         .line_terminator(0x80)
///         .build()
///         .is_ok()
/// );
/// ```
pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder {
    self.builder.line_terminator(byte);
    self
}
1112 | |
/// This configures swap-greed mode for all of the patterns.
///
/// When swap-greed mode is enabled, patterns like `a+` will become
/// non-greedy and patterns like `a+?` will become greedy. In other
/// words, the meanings of `a+` and `a+?` are switched.
///
/// This setting can also be configured using the inline flag `U` in
/// the pattern.
///
/// Note that this is generally not useful for a [`RegexSet`] since a
/// [`RegexSet`] can only report whether a pattern matches or not. Since
/// greediness never impacts whether a match is found or not (only the
/// offsets of the match), it follows that whether parts of a pattern
/// are greedy or not doesn't matter for a [`RegexSet`].
///
/// The default for this is `false`.
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.swap_greed(yes);
    self
}
1133 | |
/// This configures verbose mode for all of the patterns.
///
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next new
/// line.
///
/// Normally, in most places in a pattern, whitespace is treated
/// literally. For example ` +` will match one or more ASCII whitespace
/// characters.
///
/// When verbose mode is enabled, `\#` can be used to match a literal
/// `#` and `\ ` can be used to match a literal ASCII whitespace
/// character.
///
/// Verbose mode is useful for permitting regexes to be formatted and
/// broken up more nicely. This may make them more easily readable.
///
/// This setting can also be configured using the inline flag `x` in
/// the pattern.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// let pat = r"
///     \b
///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter
///     [\s--\n]+                   # whitespace should separate names
///     (?:                         # middle name can be an initial!
///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
///         [\s--\n]+
///     )?
///     (?<last>\p{Uppercase}\w*)
///     \b
/// ";
/// let re = RegexSetBuilder::new([pat])
///     .ignore_whitespace(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match("Harry Potter"));
/// assert!(re.is_match("Harry J. Potter"));
/// assert!(re.is_match("Harry James Potter"));
/// assert!(!re.is_match("harry J. Potter"));
/// ```
pub fn ignore_whitespace(
    &mut self,
    yes: bool,
) -> &mut RegexSetBuilder {
    self.builder.ignore_whitespace(yes);
    self
}
1188 | |
/// This configures octal mode for all of the patterns.
///
/// Octal syntax is a little-known way of uttering Unicode codepoints
/// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
/// equivalent patterns, where the last example shows octal syntax.
///
/// While supporting octal syntax isn't in and of itself a problem,
/// it does make good error messages harder. That is, in PCRE based
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
/// backreferences to be supported. Therefore, when octal support
/// is disabled, the error message will explicitly mention that
/// backreferences aren't supported.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// // Normally this pattern would not compile, with an error message
/// // about backreferences not being supported. But with octal mode
/// // enabled, octal escape sequences work.
/// let re = RegexSetBuilder::new([r"\141"])
///     .octal(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match("a"));
/// ```
pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
    self.builder.octal(yes);
    self
}
1223 | |
/// Sets the approximate size limit, in bytes, of the compiled regex.
///
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
/// approximately exceed this limit, then compiling that regex will
/// fail.
///
/// The main utility of a method like this is to avoid compiling
/// regexes that use an unexpected amount of resources, such as
/// time and memory. Even if the memory usage of a large regex is
/// acceptable, its search time may not be. Namely, worst case time
/// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
/// `n ~ len(haystack)`. That is, search time depends, in part, on the
/// size of the compiled regex. This means that putting a limit on the
/// size of the regex limits how much a regex can impact search time.
///
/// For more information about regex size limits, see the section on
/// [untrusted inputs](crate#untrusted-input) in the top-level crate
/// documentation.
///
/// The default for this is some reasonable number that permits most
/// patterns to compile successfully.
///
/// # Example
///
/// ```
/// # if !cfg!(target_pointer_width = "64") { return; } // see #1041
/// use regex::RegexSetBuilder;
///
/// // It may surprise you how big some seemingly small patterns can
/// // be! Since \w is Unicode aware, this generates a regex that can
/// // match approximately 140,000 distinct codepoints.
/// assert!(
///     RegexSetBuilder::new([r"\w"])
///         .size_limit(45_000)
///         .build()
///         .is_err()
/// );
/// ```
pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
    self.builder.size_limit(bytes);
    self
}
1267 | |
/// Set the approximate capacity, in bytes, of the cache of transitions
/// used by the lazy DFA.
///
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
///
/// The downside is that it needs to keep a cache of transitions and
/// states that are built while running a search, and this cache
/// can fill up. When it fills up, the cache will reset itself. Any
/// previously generated states and transitions will then need to be
/// re-generated. If this happens too many times, then this library
/// will bail out of using the lazy DFA and switch to a different regex
/// engine.
///
/// If your regex provokes this particular downside of the lazy DFA,
/// then it may be beneficial to increase its cache capacity. This will
/// potentially reduce the frequency of cache resetting (ideally to
/// `0`). While it won't fix all potential performance problems with
/// the lazy DFA, increasing the cache capacity does fix some.
///
/// There is no easy way to determine, a priori, whether increasing
/// this cache capacity will help. In general, the larger your regex,
/// the more cache it's likely to use. But that isn't an ironclad rule.
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
/// is likely to fill up, even when it's large and even for smallish
/// values of `N`.
///
/// If you aren't sure whether this helps or not, it is sensible to
/// set this to some arbitrarily large number in testing, such as
/// `usize::MAX`. Namely, this represents the amount of capacity that
/// *may* be used. It's probably not a good idea to use `usize::MAX` in
/// production though, since it implies there are no controls on heap
/// memory used by this library during a search. In effect, set it to
/// whatever you're willing to allocate for a single regex search.
pub fn dfa_size_limit(
    &mut self,
    bytes: usize,
) -> &mut RegexSetBuilder {
    self.builder.dfa_size_limit(bytes);
    self
}
1313 | |
/// Set the nesting limit for this parser.
///
/// The nesting limit controls how deep the abstract syntax tree is
/// allowed to be. If the AST exceeds the given limit (e.g., with too
/// many nested groups), then an error is returned by the parser.
///
/// The purpose of this limit is to act as a heuristic to prevent stack
/// overflow for consumers that do structural induction on an AST using
/// explicit recursion. While this crate never does this (instead using
/// constant stack space and moving the call stack to the heap), other
/// crates may.
///
/// This limit is not checked until the entire AST is parsed.
/// Therefore, if callers want to put a limit on the amount of heap
/// space used, then they should impose a limit on the length, in
/// bytes, of the concrete pattern string. In particular, this is
/// viable since this parser implementation will limit itself to heap
/// space proportional to the length of the pattern string. See also
/// the [untrusted inputs](crate#untrusted-input) section in the
/// top-level crate documentation for more information about this.
///
/// Note that a nest limit of `0` will return a nest limit error for
/// most patterns but not all. For example, a nest limit of `0` permits
/// `a` but not `ab`, since `ab` requires an explicit concatenation,
/// which results in a nest depth of `1`. In general, a nest limit is
/// not something that manifests in an obvious way in the concrete
/// syntax; therefore, it should not be used in a granular way.
///
/// # Example
///
/// ```
/// use regex::RegexSetBuilder;
///
/// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok());
/// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err());
/// ```
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder {
    self.builder.nest_limit(limit);
    self
}
1354 | } |
1355 | } |
1356 | |
1357 | pub(crate) mod bytes { |
1358 | use crate::{ |
1359 | bytes::{Regex, RegexSet}, |
1360 | error::Error, |
1361 | }; |
1362 | |
1363 | use super::Builder; |
1364 | |
/// A configurable builder for a [`Regex`].
///
/// This builder can be used to programmatically set flags such as `i`
/// (case insensitive) and `x` (for verbose mode). This builder can also be
/// used to configure things like the line terminator and a size limit on
/// the compiled regular expression.
#[derive(Clone, Debug)]
pub struct RegexBuilder {
    /// The internal builder that every method on this type delegates to.
    builder: Builder,
}
1375 | |
1376 | impl RegexBuilder { |
1377 | /// Create a new builder with a default configuration for the given |
1378 | /// pattern. |
1379 | /// |
1380 | /// If the pattern is invalid or exceeds the configured size limits, |
1381 | /// then an error will be returned when [`RegexBuilder::build`] is |
1382 | /// called. |
1383 | pub fn new(pattern: &str) -> RegexBuilder { |
1384 | RegexBuilder { builder: Builder::new([pattern]) } |
1385 | } |
1386 | |
/// Compiles the pattern given to [`RegexBuilder::new`] with the
/// configuration set on this builder.
///
/// If the pattern isn't a valid regex or if a configured size limit
/// was exceeded, then an error is returned.
///
/// This takes `&self`, so the same builder can be used to call `build`
/// more than once.
pub fn build(&self) -> Result<Regex, Error> {
    self.builder.build_one_bytes()
}
1395 | |
/// This configures Unicode mode for the entire pattern.
///
/// Enabling Unicode mode does a number of things:
///
/// * Most fundamentally, it causes the fundamental atom of matching
/// to be a single codepoint. When Unicode mode is disabled, it's a
/// single byte. For example, when Unicode mode is enabled, `.` will
/// match `💩` once, whereas it will match 4 times when Unicode mode
/// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
/// * Case insensitive matching uses Unicode simple case folding rules.
/// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
/// available.
/// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
/// `\d`.
/// * The word boundary assertions, `\b` and `\B`, use the Unicode
/// definition of a word character.
///
/// Note that unlike the top-level `Regex` for searching `&str`, it
/// is permitted to disable Unicode mode even if the resulting pattern
/// could match invalid UTF-8. For example, `(?-u:.)` is not a valid
/// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`.
///
/// For more details on the Unicode support in this crate, see the
/// [Unicode section](crate#unicode) in this crate's top-level
/// documentation.
///
/// The default for this is `true`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"\w")
///     .unicode(false)
///     .build()
///     .unwrap();
/// // Normally greek letters would be included in \w, but since
/// // Unicode mode is disabled, it only matches ASCII letters.
/// assert!(!re.is_match("δ".as_bytes()));
///
/// let re = RegexBuilder::new(r"s")
///     .case_insensitive(true)
///     .unicode(false)
///     .build()
///     .unwrap();
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ".as_bytes()));
/// ```
///
/// Since this builder is for constructing a [`bytes::Regex`](Regex),
/// one can disable Unicode mode even if it would match invalid UTF-8:
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r".")
///     .unicode(false)
///     .build()
///     .unwrap();
/// // With Unicode mode disabled, `.` matches any single byte except
/// // for `\n`, and that includes bytes that are not valid UTF-8.
/// assert!(re.is_match(b"\xFF"));
/// ```
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
    self.builder.unicode(yes);
    self
}
1467 | |
/// This configures whether to enable case insensitive matching for the
/// entire pattern.
///
/// This setting can also be configured using the inline flag `i`
/// in the pattern. For example, `(?i:foo)` matches `foo` case
/// insensitively while `(?-i:foo)` matches `foo` case sensitively.
///
/// The default for this is `false`.
///
/// # Example
///
/// ```
/// use regex::bytes::RegexBuilder;
///
/// let re = RegexBuilder::new(r"foo(?-i:bar)quux")
///     .case_insensitive(true)
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"FoObarQuUx"));
/// // Even though case insensitive matching is enabled in the builder,
/// // it can be locally disabled within the pattern. In this case,
/// // `bar` is matched case sensitively.
/// assert!(!re.is_match(b"fooBARquux"));
/// ```
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
    self.builder.case_insensitive(yes);
    self
}
1496 | |
1497 | /// This configures multi-line mode for the entire pattern. |
1498 | /// |
1499 | /// Enabling multi-line mode changes the behavior of the `^` and `$` |
1500 | /// anchor assertions. Instead of only matching at the beginning and |
1501 | /// end of a haystack, respectively, multi-line mode causes them to |
1502 | /// match at the beginning and end of a line *in addition* to the |
1503 | /// beginning and end of a haystack. More precisely, `^` will match at |
1504 | /// the position immediately following a `\n` and `$` will match at the |
1505 | /// position immediately preceding a `\n`. |
1506 | /// |
1507 | /// The behavior of this option can be impacted by other settings too: |
1508 | /// |
1509 | /// * The [`RegexBuilder::line_terminator`] option changes `\n` above |
1510 | /// to any ASCII byte. |
1511 | /// * The [`RegexBuilder::crlf`] option changes the line terminator to |
1512 | /// be either `\r` or `\n`, but never at the position between a `\r` |
1513 | /// and `\n`. |
1514 | /// |
1515 | /// This setting can also be configured using the inline flag `m` in |
1516 | /// the pattern. |
1517 | /// |
1518 | /// The default for this is `false`. |
1519 | /// |
1520 | /// # Example |
1521 | /// |
1522 | /// ``` |
1523 | /// use regex::bytes::RegexBuilder; |
1524 | /// |
1525 | /// let re = RegexBuilder::new(r"^foo$" ) |
1526 | /// .multi_line(true) |
1527 | /// .build() |
1528 | /// .unwrap(); |
/// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
1530 | /// ``` |
1531 | pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { |
1532 | self.builder.multi_line(yes); |
1533 | self |
1534 | } |
1535 | |
1536 | /// This configures dot-matches-new-line mode for the entire pattern. |
1537 | /// |
1538 | /// Perhaps surprisingly, the default behavior for `.` is not to match |
1539 | /// any character, but rather, to match any character except for the |
1540 | /// line terminator (which is `\n` by default). When this mode is |
1541 | /// enabled, the behavior changes such that `.` truly matches any |
1542 | /// character. |
1543 | /// |
1544 | /// This setting can also be configured using the inline flag `s` in |
1545 | /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent |
1546 | /// regexes. |
1547 | /// |
1548 | /// The default for this is `false`. |
1549 | /// |
1550 | /// # Example |
1551 | /// |
1552 | /// ``` |
1553 | /// use regex::bytes::RegexBuilder; |
1554 | /// |
1555 | /// let re = RegexBuilder::new(r"foo.bar" ) |
1556 | /// .dot_matches_new_line(true) |
1557 | /// .build() |
1558 | /// .unwrap(); |
/// let hay = b"foo\nbar";
/// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
1561 | /// ``` |
1562 | pub fn dot_matches_new_line( |
1563 | &mut self, |
1564 | yes: bool, |
1565 | ) -> &mut RegexBuilder { |
1566 | self.builder.dot_matches_new_line(yes); |
1567 | self |
1568 | } |
1569 | |
1570 | /// This configures CRLF mode for the entire pattern. |
1571 | /// |
1572 | /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for |
1573 | /// short) and `\n` ("line feed" or LF for short) are treated as line |
1574 | /// terminators. This results in the following: |
1575 | /// |
1576 | /// * Unless dot-matches-new-line mode is enabled, `.` will now match |
1577 | /// any character except for `\n` and `\r`. |
1578 | /// * When multi-line mode is enabled, `^` will match immediately |
1579 | /// following a `\n` or a `\r`. Similarly, `$` will match immediately |
1580 | /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match |
1581 | /// between `\r` and `\n`. |
1582 | /// |
1583 | /// This setting can also be configured using the inline flag `R` in |
1584 | /// the pattern. |
1585 | /// |
1586 | /// The default for this is `false`. |
1587 | /// |
1588 | /// # Example |
1589 | /// |
1590 | /// ``` |
1591 | /// use regex::bytes::RegexBuilder; |
1592 | /// |
1593 | /// let re = RegexBuilder::new(r"^foo$" ) |
1594 | /// .multi_line(true) |
1595 | /// .crlf(true) |
1596 | /// .build() |
1597 | /// .unwrap(); |
/// let hay = b"\r\nfoo\r\n";
1599 | /// // If CRLF mode weren't enabled here, then '$' wouldn't match |
1600 | /// // immediately after 'foo', and thus no match would be found. |
1601 | /// assert_eq!(Some(&b"foo" [..]), re.find(hay).map(|m| m.as_bytes())); |
1602 | /// ``` |
1603 | /// |
1604 | /// This example demonstrates that `^` will never match at a position |
1605 | /// between `\r` and `\n`. (`$` will similarly not match between a `\r` |
1606 | /// and a `\n`.) |
1607 | /// |
1608 | /// ``` |
1609 | /// use regex::bytes::RegexBuilder; |
1610 | /// |
1611 | /// let re = RegexBuilder::new(r"^" ) |
1612 | /// .multi_line(true) |
1613 | /// .crlf(true) |
1614 | /// .build() |
1615 | /// .unwrap(); |
/// let hay = b"\r\n\r\n";
1617 | /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); |
1618 | /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); |
1619 | /// ``` |
1620 | pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { |
1621 | self.builder.crlf(yes); |
1622 | self |
1623 | } |
1624 | |
1625 | /// Configures the line terminator to be used by the regex. |
1626 | /// |
1627 | /// The line terminator is relevant in two ways for a particular regex: |
1628 | /// |
1629 | /// * When dot-matches-new-line mode is *not* enabled (the default), |
1630 | /// then `.` will match any character except for the configured line |
1631 | /// terminator. |
1632 | /// * When multi-line mode is enabled (not the default), then `^` and |
1633 | /// `$` will match immediately after and before, respectively, a line |
1634 | /// terminator. |
1635 | /// |
1636 | /// In both cases, if CRLF mode is enabled in a particular context, |
1637 | /// then it takes precedence over any configured line terminator. |
1638 | /// |
1639 | /// This option cannot be configured from within the pattern. |
1640 | /// |
1641 | /// The default line terminator is `\n`. |
1642 | /// |
1643 | /// # Example |
1644 | /// |
1645 | /// This shows how to treat the NUL byte as a line terminator. This can |
1646 | /// be a useful heuristic when searching binary data. |
1647 | /// |
1648 | /// ``` |
1649 | /// use regex::bytes::RegexBuilder; |
1650 | /// |
1651 | /// let re = RegexBuilder::new(r"^foo$" ) |
1652 | /// .multi_line(true) |
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// let hay = b"\x00foo\x00";
1657 | /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range())); |
1658 | /// ``` |
1659 | /// |
1660 | /// This example shows that the behavior of `.` is impacted by this |
1661 | /// setting as well: |
1662 | /// |
1663 | /// ``` |
1664 | /// use regex::bytes::RegexBuilder; |
1665 | /// |
1666 | /// let re = RegexBuilder::new(r"." ) |
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// assert!(re.is_match(b"\n"));
/// assert!(!re.is_match(b"\x00"));
1672 | /// ``` |
1673 | /// |
1674 | /// This shows that building a regex will work even when the byte |
1675 | /// given is not ASCII. This is unlike the top-level `Regex` API where |
1676 | /// matching invalid UTF-8 is not allowed. |
1677 | /// |
1678 | /// Note though that you must disable Unicode mode. This is required |
1679 | /// because Unicode mode requires matching one codepoint at a time, |
1680 | /// and there is no way to match a non-ASCII byte as if it were a |
1681 | /// codepoint. |
1682 | /// |
1683 | /// ``` |
1684 | /// use regex::bytes::RegexBuilder; |
1685 | /// |
1686 | /// assert!( |
1687 | /// RegexBuilder::new(r"." ) |
1688 | /// .unicode(false) |
1689 | /// .line_terminator(0x80) |
1690 | /// .build() |
1691 | /// .is_ok(), |
1692 | /// ); |
1693 | /// ``` |
1694 | pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder { |
1695 | self.builder.line_terminator(byte); |
1696 | self |
1697 | } |
1698 | |
1699 | /// This configures swap-greed mode for the entire pattern. |
1700 | /// |
1701 | /// When swap-greed mode is enabled, patterns like `a+` will become |
1702 | /// non-greedy and patterns like `a+?` will become greedy. In other |
1703 | /// words, the meanings of `a+` and `a+?` are switched. |
1704 | /// |
1705 | /// This setting can also be configured using the inline flag `U` in |
1706 | /// the pattern. |
1707 | /// |
1708 | /// The default for this is `false`. |
1709 | /// |
1710 | /// # Example |
1711 | /// |
1712 | /// ``` |
1713 | /// use regex::bytes::RegexBuilder; |
1714 | /// |
1715 | /// let re = RegexBuilder::new(r"a+" ) |
1716 | /// .swap_greed(true) |
1717 | /// .build() |
1718 | /// .unwrap(); |
1719 | /// assert_eq!(Some(&b"a" [..]), re.find(b"aaa" ).map(|m| m.as_bytes())); |
1720 | /// ``` |
1721 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { |
1722 | self.builder.swap_greed(yes); |
1723 | self |
1724 | } |
1725 | |
1726 | /// This configures verbose mode for the entire pattern. |
1727 | /// |
/// When enabled, whitespace will be treated as insignificant in the
/// pattern and `#` can be used to start a comment until the next new
/// line.
1731 | /// |
1732 | /// Normally, in most places in a pattern, whitespace is treated |
1733 | /// literally. For example ` +` will match one or more ASCII whitespace |
1734 | /// characters. |
1735 | /// |
1736 | /// When verbose mode is enabled, `\#` can be used to match a literal |
1737 | /// `#` and `\ ` can be used to match a literal ASCII whitespace |
1738 | /// character. |
1739 | /// |
1740 | /// Verbose mode is useful for permitting regexes to be formatted and |
1741 | /// broken up more nicely. This may make them more easily readable. |
1742 | /// |
1743 | /// This setting can also be configured using the inline flag `x` in |
1744 | /// the pattern. |
1745 | /// |
1746 | /// The default for this is `false`. |
1747 | /// |
1748 | /// # Example |
1749 | /// |
1750 | /// ``` |
1751 | /// use regex::bytes::RegexBuilder; |
1752 | /// |
1753 | /// let pat = r" |
1754 | /// \b |
1755 | /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter |
1756 | /// [\s--\n]+ # whitespace should separate names |
1757 | /// (?: # middle name can be an initial! |
1758 | /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) |
1759 | /// [\s--\n]+ |
1760 | /// )? |
1761 | /// (?<last>\p{Uppercase}\w*) |
1762 | /// \b |
1763 | /// " ; |
1764 | /// let re = RegexBuilder::new(pat) |
1765 | /// .ignore_whitespace(true) |
1766 | /// .build() |
1767 | /// .unwrap(); |
1768 | /// |
1769 | /// let caps = re.captures(b"Harry Potter" ).unwrap(); |
1770 | /// assert_eq!(&b"Harry" [..], &caps["first" ]); |
1771 | /// assert_eq!(&b"Potter" [..], &caps["last" ]); |
1772 | /// |
1773 | /// let caps = re.captures(b"Harry J. Potter" ).unwrap(); |
1774 | /// assert_eq!(&b"Harry" [..], &caps["first" ]); |
1775 | /// // Since a middle name/initial isn't required for an overall match, |
1776 | /// // we can't assume that 'initial' or 'middle' will be populated! |
1777 | /// assert_eq!( |
1778 | /// Some(&b"J" [..]), |
1779 | /// caps.name("initial" ).map(|m| m.as_bytes()), |
1780 | /// ); |
1781 | /// assert_eq!(None, caps.name("middle" ).map(|m| m.as_bytes())); |
1782 | /// assert_eq!(&b"Potter" [..], &caps["last" ]); |
1783 | /// |
1784 | /// let caps = re.captures(b"Harry James Potter" ).unwrap(); |
1785 | /// assert_eq!(&b"Harry" [..], &caps["first" ]); |
1786 | /// // Since a middle name/initial isn't required for an overall match, |
1787 | /// // we can't assume that 'initial' or 'middle' will be populated! |
1788 | /// assert_eq!(None, caps.name("initial" ).map(|m| m.as_bytes())); |
1789 | /// assert_eq!( |
1790 | /// Some(&b"James" [..]), |
1791 | /// caps.name("middle" ).map(|m| m.as_bytes()), |
1792 | /// ); |
1793 | /// assert_eq!(&b"Potter" [..], &caps["last" ]); |
1794 | /// ``` |
1795 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { |
1796 | self.builder.ignore_whitespace(yes); |
1797 | self |
1798 | } |
1799 | |
1800 | /// This configures octal mode for the entire pattern. |
1801 | /// |
1802 | /// Octal syntax is a little-known way of uttering Unicode codepoints |
1803 | /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all |
1804 | /// equivalent patterns, where the last example shows octal syntax. |
1805 | /// |
1806 | /// While supporting octal syntax isn't in and of itself a problem, |
1807 | /// it does make good error messages harder. That is, in PCRE based |
/// regex engines, syntax like `\1` invokes a backreference, which is
/// explicitly unsupported by this library. However, many users expect
1810 | /// backreferences to be supported. Therefore, when octal support |
1811 | /// is disabled, the error message will explicitly mention that |
1812 | /// backreferences aren't supported. |
1813 | /// |
1814 | /// The default for this is `false`. |
1815 | /// |
1816 | /// # Example |
1817 | /// |
1818 | /// ``` |
1819 | /// use regex::bytes::RegexBuilder; |
1820 | /// |
1821 | /// // Normally this pattern would not compile, with an error message |
1822 | /// // about backreferences not being supported. But with octal mode |
1823 | /// // enabled, octal escape sequences work. |
1824 | /// let re = RegexBuilder::new(r"\141" ) |
1825 | /// .octal(true) |
1826 | /// .build() |
1827 | /// .unwrap(); |
1828 | /// assert!(re.is_match(b"a" )); |
1829 | /// ``` |
1830 | pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { |
1831 | self.builder.octal(yes); |
1832 | self |
1833 | } |
1834 | |
1835 | /// Sets the approximate size limit, in bytes, of the compiled regex. |
1836 | /// |
/// This roughly corresponds to the amount of heap memory, in
/// bytes, occupied by a single regex. If the regex would otherwise
1839 | /// approximately exceed this limit, then compiling that regex will |
1840 | /// fail. |
1841 | /// |
1842 | /// The main utility of a method like this is to avoid compiling |
1843 | /// regexes that use an unexpected amount of resources, such as |
1844 | /// time and memory. Even if the memory usage of a large regex is |
1845 | /// acceptable, its search time may not be. Namely, worst case time |
1846 | /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and |
1847 | /// `n ~ len(haystack)`. That is, search time depends, in part, on the |
1848 | /// size of the compiled regex. This means that putting a limit on the |
1849 | /// size of the regex limits how much a regex can impact search time. |
1850 | /// |
1851 | /// For more information about regex size limits, see the section on |
1852 | /// [untrusted inputs](crate#untrusted-input) in the top-level crate |
1853 | /// documentation. |
1854 | /// |
1855 | /// The default for this is some reasonable number that permits most |
1856 | /// patterns to compile successfully. |
1857 | /// |
1858 | /// # Example |
1859 | /// |
1860 | /// ``` |
1861 | /// # if !cfg!(target_pointer_width = "64" ) { return; } // see #1041 |
1862 | /// use regex::bytes::RegexBuilder; |
1863 | /// |
1864 | /// // It may surprise you how big some seemingly small patterns can |
1865 | /// // be! Since \w is Unicode aware, this generates a regex that can |
1866 | /// // match approximately 140,000 distinct codepoints. |
1867 | /// assert!(RegexBuilder::new(r"\w" ).size_limit(45_000).build().is_err()); |
1868 | /// ``` |
1869 | pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
1870 | self.builder.size_limit(bytes); |
1871 | self |
1872 | } |
1873 | |
1874 | /// Set the approximate capacity, in bytes, of the cache of transitions |
1875 | /// used by the lazy DFA. |
1876 | /// |
/// While the lazy DFA isn't always used, it tends to be the most
/// commonly used regex engine in default configurations. It tends to
/// adopt the performance profile of a fully built DFA, but without the
/// downside of taking worst case exponential time to build.
1881 | /// |
1882 | /// The downside is that it needs to keep a cache of transitions and |
1883 | /// states that are built while running a search, and this cache |
1884 | /// can fill up. When it fills up, the cache will reset itself. Any |
1885 | /// previously generated states and transitions will then need to be |
1886 | /// re-generated. If this happens too many times, then this library |
1887 | /// will bail out of using the lazy DFA and switch to a different regex |
1888 | /// engine. |
1889 | /// |
1890 | /// If your regex provokes this particular downside of the lazy DFA, |
1891 | /// then it may be beneficial to increase its cache capacity. This will |
1892 | /// potentially reduce the frequency of cache resetting (ideally to |
1893 | /// `0`). While it won't fix all potential performance problems with |
1894 | /// the lazy DFA, increasing the cache capacity does fix some. |
1895 | /// |
1896 | /// There is no easy way to determine, a priori, whether increasing |
1897 | /// this cache capacity will help. In general, the larger your regex, |
1898 | /// the more cache it's likely to use. But that isn't an ironclad rule. |
/// For example, a regex like `[01]*1[01]{N}` would normally produce a
/// fully built DFA that is exponential in size with respect to `N`.
/// The lazy DFA will prevent exponential space blow-up, but its cache
/// is likely to fill up, even when it's large and even for smallish
/// values of `N`.
1904 | /// |
1905 | /// If you aren't sure whether this helps or not, it is sensible to |
1906 | /// set this to some arbitrarily large number in testing, such as |
1907 | /// `usize::MAX`. Namely, this represents the amount of capacity that |
1908 | /// *may* be used. It's probably not a good idea to use `usize::MAX` in |
1909 | /// production though, since it implies there are no controls on heap |
1910 | /// memory used by this library during a search. In effect, set it to |
1911 | /// whatever you're willing to allocate for a single regex search. |
1912 | pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { |
1913 | self.builder.dfa_size_limit(bytes); |
1914 | self |
1915 | } |
1916 | |
1917 | /// Set the nesting limit for this parser. |
1918 | /// |
1919 | /// The nesting limit controls how deep the abstract syntax tree is |
1920 | /// allowed to be. If the AST exceeds the given limit (e.g., with too |
1921 | /// many nested groups), then an error is returned by the parser. |
1922 | /// |
1923 | /// The purpose of this limit is to act as a heuristic to prevent stack |
1924 | /// overflow for consumers that do structural induction on an AST using |
1925 | /// explicit recursion. While this crate never does this (instead using |
1926 | /// constant stack space and moving the call stack to the heap), other |
1927 | /// crates may. |
1928 | /// |
1929 | /// This limit is not checked until the entire AST is parsed. |
1930 | /// Therefore, if callers want to put a limit on the amount of heap |
1931 | /// space used, then they should impose a limit on the length, in |
1932 | /// bytes, of the concrete pattern string. In particular, this is |
1933 | /// viable since this parser implementation will limit itself to heap |
1934 | /// space proportional to the length of the pattern string. See also |
1935 | /// the [untrusted inputs](crate#untrusted-input) section in the |
1936 | /// top-level crate documentation for more information about this. |
1937 | /// |
1938 | /// Note that a nest limit of `0` will return a nest limit error for |
1939 | /// most patterns but not all. For example, a nest limit of `0` permits |
1940 | /// `a` but not `ab`, since `ab` requires an explicit concatenation, |
1941 | /// which results in a nest depth of `1`. In general, a nest limit is |
1942 | /// not something that manifests in an obvious way in the concrete |
1943 | /// syntax, therefore, it should not be used in a granular way. |
1944 | /// |
1945 | /// # Example |
1946 | /// |
1947 | /// ``` |
1948 | /// use regex::bytes::RegexBuilder; |
1949 | /// |
1950 | /// assert!(RegexBuilder::new(r"a" ).nest_limit(0).build().is_ok()); |
1951 | /// assert!(RegexBuilder::new(r"ab" ).nest_limit(0).build().is_err()); |
1952 | /// ``` |
1953 | pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { |
1954 | self.builder.nest_limit(limit); |
1955 | self |
1956 | } |
1957 | } |
1958 | |
1959 | /// A configurable builder for a [`RegexSet`]. |
1960 | /// |
1961 | /// This builder can be used to programmatically set flags such as `i` |
1962 | /// (case insensitive) and `x` (for verbose mode). This builder can also be |
1963 | /// used to configure things like the line terminator and a size limit on |
1964 | /// the compiled regular expression. |
1965 | #[derive(Clone, Debug)] |
1966 | pub struct RegexSetBuilder { |
1967 | builder: Builder, |
1968 | } |
1969 | |
1970 | impl RegexSetBuilder { |
1971 | /// Create a new builder with a default configuration for the given |
1972 | /// patterns. |
1973 | /// |
1974 | /// If the patterns are invalid or exceed the configured size limits, |
1975 | /// then an error will be returned when [`RegexSetBuilder::build`] is |
1976 | /// called. |
1977 | pub fn new<I, S>(patterns: I) -> RegexSetBuilder |
1978 | where |
1979 | I: IntoIterator<Item = S>, |
1980 | S: AsRef<str>, |
1981 | { |
1982 | RegexSetBuilder { builder: Builder::new(patterns) } |
1983 | } |
1984 | |
1985 | /// Compiles the patterns given to `RegexSetBuilder::new` with the |
1986 | /// configuration set on this builder. |
1987 | /// |
1988 | /// If the patterns aren't valid regexes or if a configured size limit |
1989 | /// was exceeded, then an error is returned. |
1990 | pub fn build(&self) -> Result<RegexSet, Error> { |
1991 | self.builder.build_many_bytes() |
1992 | } |
1993 | |
/// This configures Unicode mode for all of the patterns.
1995 | /// |
1996 | /// Enabling Unicode mode does a number of things: |
1997 | /// |
1998 | /// * Most fundamentally, it causes the fundamental atom of matching |
1999 | /// to be a single codepoint. When Unicode mode is disabled, it's a |
2000 | /// single byte. For example, when Unicode mode is enabled, `.` will |
2001 | /// match `💩` once, where as it will match 4 times when Unicode mode |
2002 | /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) |
2003 | /// * Case insensitive matching uses Unicode simple case folding rules. |
2004 | /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are |
2005 | /// available. |
2006 | /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and |
2007 | /// `\d`. |
2008 | /// * The word boundary assertions, `\b` and `\B`, use the Unicode |
2009 | /// definition of a word character. |
2010 | /// |
2011 | /// Note that unlike the top-level `RegexSet` for searching `&str`, |
2012 | /// it is permitted to disable Unicode mode even if the resulting |
2013 | /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not |
2014 | /// a valid pattern for a top-level `RegexSet`, but is valid for a |
2015 | /// `bytes::RegexSet`. |
2016 | /// |
2017 | /// For more details on the Unicode support in this crate, see the |
2018 | /// [Unicode section](crate#unicode) in this crate's top-level |
2019 | /// documentation. |
2020 | /// |
2021 | /// The default for this is `true`. |
2022 | /// |
2023 | /// # Example |
2024 | /// |
2025 | /// ``` |
2026 | /// use regex::bytes::RegexSetBuilder; |
2027 | /// |
2028 | /// let re = RegexSetBuilder::new([r"\w" ]) |
2029 | /// .unicode(false) |
2030 | /// .build() |
2031 | /// .unwrap(); |
2032 | /// // Normally greek letters would be included in \w, but since |
2033 | /// // Unicode mode is disabled, it only matches ASCII letters. |
2034 | /// assert!(!re.is_match("δ" .as_bytes())); |
2035 | /// |
2036 | /// let re = RegexSetBuilder::new([r"s" ]) |
2037 | /// .case_insensitive(true) |
2038 | /// .unicode(false) |
2039 | /// .build() |
2040 | /// .unwrap(); |
/// // Normally 'ſ' is included when searching for 's' case
/// // insensitively due to Unicode's simple case folding rules. But
/// // when Unicode mode is disabled, only ASCII case insensitive rules
/// // are used.
/// assert!(!re.is_match("ſ".as_bytes()));
2046 | /// ``` |
2047 | /// |
2048 | /// Since this builder is for constructing a |
2049 | /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if |
2050 | /// it would match invalid UTF-8: |
2051 | /// |
2052 | /// ``` |
2053 | /// use regex::bytes::RegexSetBuilder; |
2054 | /// |
2055 | /// let re = RegexSetBuilder::new([r"." ]) |
2056 | /// .unicode(false) |
2057 | /// .build() |
2058 | /// .unwrap(); |
/// // Since Unicode mode is disabled, `.` matches a single byte, even
/// // one like `\xFF` that can never appear in valid UTF-8.
/// assert!(re.is_match(b"\xFF"));
2062 | /// ``` |
2063 | pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2064 | self.builder.unicode(yes); |
2065 | self |
2066 | } |
2067 | |
2068 | /// This configures whether to enable case insensitive matching for all |
2069 | /// of the patterns. |
2070 | /// |
2071 | /// This setting can also be configured using the inline flag `i` |
2072 | /// in the pattern. For example, `(?i:foo)` matches `foo` case |
2073 | /// insensitively while `(?-i:foo)` matches `foo` case sensitively. |
2074 | /// |
2075 | /// The default for this is `false`. |
2076 | /// |
2077 | /// # Example |
2078 | /// |
2079 | /// ``` |
2080 | /// use regex::bytes::RegexSetBuilder; |
2081 | /// |
2082 | /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux" ]) |
2083 | /// .case_insensitive(true) |
2084 | /// .build() |
2085 | /// .unwrap(); |
2086 | /// assert!(re.is_match(b"FoObarQuUx" )); |
2087 | /// // Even though case insensitive matching is enabled in the builder, |
2088 | /// // it can be locally disabled within the pattern. In this case, |
2089 | /// // `bar` is matched case sensitively. |
2090 | /// assert!(!re.is_match(b"fooBARquux" )); |
2091 | /// ``` |
2092 | pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2093 | self.builder.case_insensitive(yes); |
2094 | self |
2095 | } |
2096 | |
2097 | /// This configures multi-line mode for all of the patterns. |
2098 | /// |
2099 | /// Enabling multi-line mode changes the behavior of the `^` and `$` |
2100 | /// anchor assertions. Instead of only matching at the beginning and |
2101 | /// end of a haystack, respectively, multi-line mode causes them to |
2102 | /// match at the beginning and end of a line *in addition* to the |
2103 | /// beginning and end of a haystack. More precisely, `^` will match at |
2104 | /// the position immediately following a `\n` and `$` will match at the |
2105 | /// position immediately preceding a `\n`. |
2106 | /// |
2107 | /// The behavior of this option can be impacted by other settings too: |
2108 | /// |
2109 | /// * The [`RegexSetBuilder::line_terminator`] option changes `\n` |
2110 | /// above to any ASCII byte. |
2111 | /// * The [`RegexSetBuilder::crlf`] option changes the line terminator |
2112 | /// to be either `\r` or `\n`, but never at the position between a `\r` |
2113 | /// and `\n`. |
2114 | /// |
2115 | /// This setting can also be configured using the inline flag `m` in |
2116 | /// the pattern. |
2117 | /// |
2118 | /// The default for this is `false`. |
2119 | /// |
2120 | /// # Example |
2121 | /// |
2122 | /// ``` |
2123 | /// use regex::bytes::RegexSetBuilder; |
2124 | /// |
2125 | /// let re = RegexSetBuilder::new([r"^foo$" ]) |
2126 | /// .multi_line(true) |
2127 | /// .build() |
2128 | /// .unwrap(); |
/// assert!(re.is_match(b"\nfoo\n"));
2130 | /// ``` |
2131 | pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2132 | self.builder.multi_line(yes); |
2133 | self |
2134 | } |
2135 | |
/// This configures dot-matches-new-line mode for all of the patterns.
2137 | /// |
2138 | /// Perhaps surprisingly, the default behavior for `.` is not to match |
2139 | /// any character, but rather, to match any character except for the |
2140 | /// line terminator (which is `\n` by default). When this mode is |
2141 | /// enabled, the behavior changes such that `.` truly matches any |
2142 | /// character. |
2143 | /// |
2144 | /// This setting can also be configured using the inline flag `s` in |
2145 | /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent |
2146 | /// regexes. |
2147 | /// |
2148 | /// The default for this is `false`. |
2149 | /// |
2150 | /// # Example |
2151 | /// |
2152 | /// ``` |
2153 | /// use regex::bytes::RegexSetBuilder; |
2154 | /// |
2155 | /// let re = RegexSetBuilder::new([r"foo.bar" ]) |
2156 | /// .dot_matches_new_line(true) |
2157 | /// .build() |
2158 | /// .unwrap(); |
/// let hay = b"foo\nbar";
2160 | /// assert!(re.is_match(hay)); |
2161 | /// ``` |
2162 | pub fn dot_matches_new_line( |
2163 | &mut self, |
2164 | yes: bool, |
2165 | ) -> &mut RegexSetBuilder { |
2166 | self.builder.dot_matches_new_line(yes); |
2167 | self |
2168 | } |
2169 | |
2170 | /// This configures CRLF mode for all of the patterns. |
2171 | /// |
2172 | /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for |
2173 | /// short) and `\n` ("line feed" or LF for short) are treated as line |
2174 | /// terminators. This results in the following: |
2175 | /// |
2176 | /// * Unless dot-matches-new-line mode is enabled, `.` will now match |
2177 | /// any character except for `\n` and `\r`. |
2178 | /// * When multi-line mode is enabled, `^` will match immediately |
2179 | /// following a `\n` or a `\r`. Similarly, `$` will match immediately |
2180 | /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match |
2181 | /// between `\r` and `\n`. |
2182 | /// |
2183 | /// This setting can also be configured using the inline flag `R` in |
2184 | /// the pattern. |
2185 | /// |
2186 | /// The default for this is `false`. |
2187 | /// |
2188 | /// # Example |
2189 | /// |
2190 | /// ``` |
2191 | /// use regex::bytes::RegexSetBuilder; |
2192 | /// |
2193 | /// let re = RegexSetBuilder::new([r"^foo$" ]) |
2194 | /// .multi_line(true) |
2195 | /// .crlf(true) |
2196 | /// .build() |
2197 | /// .unwrap(); |
/// let hay = b"\r\nfoo\r\n";
2199 | /// // If CRLF mode weren't enabled here, then '$' wouldn't match |
2200 | /// // immediately after 'foo', and thus no match would be found. |
2201 | /// assert!(re.is_match(hay)); |
2202 | /// ``` |
2203 | /// |
2204 | /// This example demonstrates that `^` will never match at a position |
2205 | /// between `\r` and `\n`. (`$` will similarly not match between a `\r` |
2206 | /// and a `\n`.) |
2207 | /// |
2208 | /// ``` |
2209 | /// use regex::bytes::RegexSetBuilder; |
2210 | /// |
2211 | /// let re = RegexSetBuilder::new([r"^\n" ]) |
2212 | /// .multi_line(true) |
2213 | /// .crlf(true) |
2214 | /// .build() |
2215 | /// .unwrap(); |
/// assert!(!re.is_match(b"\r\n"));
2217 | /// ``` |
2218 | pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2219 | self.builder.crlf(yes); |
2220 | self |
2221 | } |
2222 | |
2223 | /// Configures the line terminator to be used by the regex. |
2224 | /// |
2225 | /// The line terminator is relevant in two ways for a particular regex: |
2226 | /// |
2227 | /// * When dot-matches-new-line mode is *not* enabled (the default), |
2228 | /// then `.` will match any character except for the configured line |
2229 | /// terminator. |
2230 | /// * When multi-line mode is enabled (not the default), then `^` and |
2231 | /// `$` will match immediately after and before, respectively, a line |
2232 | /// terminator. |
2233 | /// |
2234 | /// In both cases, if CRLF mode is enabled in a particular context, |
2235 | /// then it takes precedence over any configured line terminator. |
2236 | /// |
2237 | /// This option cannot be configured from within the pattern. |
2238 | /// |
2239 | /// The default line terminator is `\n`. |
2240 | /// |
2241 | /// # Example |
2242 | /// |
2243 | /// This shows how to treat the NUL byte as a line terminator. This can |
2244 | /// be a useful heuristic when searching binary data. |
2245 | /// |
2246 | /// ``` |
2247 | /// use regex::bytes::RegexSetBuilder; |
2248 | /// |
2249 | /// let re = RegexSetBuilder::new([r"^foo$" ]) |
2250 | /// .multi_line(true) |
///     .line_terminator(b'\x00')
///     .build()
///     .unwrap();
/// let hay = b"\x00foo\x00";
2255 | /// assert!(re.is_match(hay)); |
2256 | /// ``` |
2257 | /// |
2258 | /// This example shows that the behavior of `.` is impacted by this |
2259 | /// setting as well: |
2260 | /// |
2261 | /// ``` |
2262 | /// use regex::bytes::RegexSetBuilder; |
2263 | /// |
2264 | /// let re = RegexSetBuilder::new([r"." ]) |
2265 | /// .line_terminator(b' \x00' ) |
2266 | /// .build() |
2267 | /// .unwrap(); |
2268 | /// assert!(re.is_match(b" \n" )); |
2269 | /// assert!(!re.is_match(b" \x00" )); |
2270 | /// ``` |
2271 | /// |
2272 | /// This shows that building a regex will work even when the byte given |
2273 | /// is not ASCII. This is unlike the top-level `RegexSet` API where |
2274 | /// matching invalid UTF-8 is not allowed. |
2275 | /// |
2276 | /// Note though that you must disable Unicode mode. This is required |
2277 | /// because Unicode mode requires matching one codepoint at a time, |
2278 | /// and there is no way to match a non-ASCII byte as if it were a |
2279 | /// codepoint. |
2280 | /// |
2281 | /// ``` |
2282 | /// use regex::bytes::RegexSetBuilder; |
2283 | /// |
2284 | /// assert!( |
2285 | /// RegexSetBuilder::new([r"." ]) |
2286 | /// .unicode(false) |
2287 | /// .line_terminator(0x80) |
2288 | /// .build() |
2289 | /// .is_ok(), |
2290 | /// ); |
2291 | /// ``` |
2292 | pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder { |
2293 | self.builder.line_terminator(byte); |
2294 | self |
2295 | } |
2296 | |
2297 | /// This configures swap-greed mode for all of the patterns. |
2298 | /// |
2299 | /// When swap-greed mode is enabled, patterns like `a+` will become |
2300 | /// non-greedy and patterns like `a+?` will become greedy. In other |
2301 | /// words, the meanings of `a+` and `a+?` are switched. |
2302 | /// |
2303 | /// This setting can also be configured using the inline flag `U` in |
2304 | /// the pattern. |
2305 | /// |
2306 | /// Note that this is generally not useful for a `RegexSet`, since a |
2307 | /// `RegexSet` can only report whether a pattern matches or not. Because |
2308 | /// greediness never impacts whether a match is found or not (only the |
2309 | /// offsets of the match), it follows that whether parts of a pattern |
2310 | /// are greedy or not doesn't matter for a `RegexSet`. |
2311 | /// |
2312 | /// The default for this is `false`. |
2313 | pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2314 | self.builder.swap_greed(yes); |
2315 | self |
2316 | } |
2317 | |
2318 | /// This configures verbose mode for all of the patterns. |
2319 | /// |
2320 | /// When enabled, whitespace will be treated as insignificant in the |
2321 | /// pattern and `#` can be used to start a comment until the next new |
2322 | /// line. |
2323 | /// |
2324 | /// Normally, in most places in a pattern, whitespace is treated |
2325 | /// literally. For example ` +` will match one or more ASCII space |
2326 | /// characters. |
2327 | /// |
2328 | /// When verbose mode is enabled, `\#` can be used to match a literal |
2329 | /// `#` and `\ ` can be used to match a literal ASCII space |
2330 | /// character. |
2331 | /// |
2332 | /// Verbose mode is useful for permitting regexes to be formatted and |
2333 | /// broken up more nicely. This may make them more easily readable. |
2334 | /// |
2335 | /// This setting can also be configured using the inline flag `x` in |
2336 | /// the pattern. |
2337 | /// |
2338 | /// The default for this is `false`. |
2339 | /// |
2340 | /// # Example |
2341 | /// |
2342 | /// ``` |
2343 | /// use regex::bytes::RegexSetBuilder; |
2344 | /// |
2345 | /// let pat = r" |
2346 | ///     \b |
2347 | ///     (?<first>\p{Uppercase}\w*)  # always start with uppercase letter |
2348 | ///     [\s--\n]+                   # whitespace should separate names |
2349 | ///     (?: # middle name can be an initial! |
2350 | ///         (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) |
2351 | ///         [\s--\n]+ |
2352 | ///     )? |
2353 | ///     (?<last>\p{Uppercase}\w*) |
2354 | ///     \b |
2355 | /// "; |
2356 | /// let re = RegexSetBuilder::new([pat]) |
2357 | ///     .ignore_whitespace(true) |
2358 | ///     .build() |
2359 | ///     .unwrap(); |
2360 | /// assert!(re.is_match(b"Harry Potter")); |
2361 | /// assert!(re.is_match(b"Harry J. Potter")); |
2362 | /// assert!(re.is_match(b"Harry James Potter")); |
2363 | /// assert!(!re.is_match(b"harry J. Potter")); |
2364 | /// ``` |
2365 | pub fn ignore_whitespace( |
2366 | &mut self, |
2367 | yes: bool, |
2368 | ) -> &mut RegexSetBuilder { |
2369 | self.builder.ignore_whitespace(yes); |
2370 | self |
2371 | } |
2372 | |
2373 | /// This configures octal mode for all of the patterns. |
2374 | /// |
2375 | /// Octal syntax is a little-known way of uttering Unicode codepoints |
2376 | /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all |
2377 | /// equivalent patterns, where the last example shows octal syntax. |
2378 | /// |
2379 | /// While supporting octal syntax isn't in and of itself a problem, |
2380 | /// it does make good error messages harder. That is, in PCRE based |
2381 | /// regex engines, syntax like `\1` invokes a backreference, which is |
2382 | /// explicitly unsupported by this library. However, many users expect |
2383 | /// backreferences to be supported. Therefore, when octal support |
2384 | /// is disabled, the error message will explicitly mention that |
2385 | /// backreferences aren't supported. |
2386 | /// |
2387 | /// The default for this is `false`. |
2388 | /// |
2389 | /// # Example |
2390 | /// |
2391 | /// ``` |
2392 | /// use regex::bytes::RegexSetBuilder; |
2393 | /// |
2394 | /// // Normally this pattern would not compile, with an error message |
2395 | /// // about backreferences not being supported. But with octal mode |
2396 | /// // enabled, octal escape sequences work. |
2397 | /// let re = RegexSetBuilder::new([r"\141"]) |
2398 | ///     .octal(true) |
2399 | ///     .build() |
2400 | ///     .unwrap(); |
2401 | /// assert!(re.is_match(b"a")); |
2402 | /// ``` |
2403 | pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { |
2404 | self.builder.octal(yes); |
2405 | self |
2406 | } |
2407 | |
2408 | /// Sets the approximate size limit, in bytes, of the compiled regex. |
2409 | /// |
2410 | /// This roughly corresponds to the amount of heap memory, in |
2411 | /// bytes, occupied by a single regex. If the regex would otherwise |
2412 | /// approximately exceed this limit, then compiling that regex will |
2413 | /// fail. |
2414 | /// |
2415 | /// The main utility of a method like this is to avoid compiling |
2416 | /// regexes that use an unexpected amount of resources, such as |
2417 | /// time and memory. Even if the memory usage of a large regex is |
2418 | /// acceptable, its search time may not be. Namely, worst case time |
2419 | /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and |
2420 | /// `n ~ len(haystack)`. That is, search time depends, in part, on the |
2421 | /// size of the compiled regex. This means that putting a limit on the |
2422 | /// size of the regex limits how much a regex can impact search time. |
2423 | /// |
2424 | /// For more information about regex size limits, see the section on |
2425 | /// [untrusted inputs](crate#untrusted-input) in the top-level crate |
2426 | /// documentation. |
2427 | /// |
2428 | /// The default for this is some reasonable number that permits most |
2429 | /// patterns to compile successfully. |
2430 | /// |
2431 | /// # Example |
2432 | /// |
2433 | /// ``` |
2434 | /// # if !cfg!(target_pointer_width = "64") { return; } // see #1041 |
2435 | /// use regex::bytes::RegexSetBuilder; |
2436 | /// |
2437 | /// // It may surprise you how big some seemingly small patterns can |
2438 | /// // be! Since \w is Unicode aware, this generates a regex that can |
2439 | /// // match approximately 140,000 distinct codepoints. |
2440 | /// assert!( |
2441 | ///     RegexSetBuilder::new([r"\w"]) |
2442 | ///         .size_limit(45_000) |
2443 | ///         .build() |
2444 | ///         .is_err() |
2445 | /// ); |
2446 | /// ``` |
2447 | pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder { |
2448 | self.builder.size_limit(bytes); |
2449 | self |
2450 | } |
2451 | |
2452 | /// Set the approximate capacity, in bytes, of the cache of transitions |
2453 | /// used by the lazy DFA. |
2454 | /// |
2455 | /// While the lazy DFA isn't always used, it tends to be the most |
2456 | /// commonly used regex engine in default configurations. It tends to |
2457 | /// adopt the performance profile of a fully built DFA, but without the |
2458 | /// downside of taking worst case exponential time to build. |
2459 | /// |
2460 | /// The downside is that it needs to keep a cache of transitions and |
2461 | /// states that are built while running a search, and this cache |
2462 | /// can fill up. When it fills up, the cache will reset itself. Any |
2463 | /// previously generated states and transitions will then need to be |
2464 | /// re-generated. If this happens too many times, then this library |
2465 | /// will bail out of using the lazy DFA and switch to a different regex |
2466 | /// engine. |
2467 | /// |
2468 | /// If your regex provokes this particular downside of the lazy DFA, |
2469 | /// then it may be beneficial to increase its cache capacity. This will |
2470 | /// potentially reduce the frequency of cache resetting (ideally to |
2471 | /// `0`). While it won't fix all potential performance problems with |
2472 | /// the lazy DFA, increasing the cache capacity does fix some. |
2473 | /// |
2474 | /// There is no easy way to determine, a priori, whether increasing |
2475 | /// this cache capacity will help. In general, the larger your regex, |
2476 | /// the more cache it's likely to use. But that isn't an ironclad rule. |
2477 | /// For example, a regex like `[01]*1[01]{N}` would normally produce a |
2478 | /// fully built DFA that is exponential in size with respect to `N`. |
2479 | /// The lazy DFA will prevent exponential space blow-up, but its cache |
2480 | /// is likely to fill up, even when it's large and even for smallish |
2481 | /// values of `N`. |
2482 | /// |
2483 | /// If you aren't sure whether this helps or not, it is sensible to |
2484 | /// set this to some arbitrarily large number in testing, such as |
2485 | /// `usize::MAX`. Namely, this represents the amount of capacity that |
2486 | /// *may* be used. It's probably not a good idea to use `usize::MAX` in |
2487 | /// production though, since it implies there are no controls on heap |
2488 | /// memory used by this library during a search. In effect, set it to |
2489 | /// whatever you're willing to allocate for a single regex search. |
2490 | pub fn dfa_size_limit( |
2491 | &mut self, |
2492 | bytes: usize, |
2493 | ) -> &mut RegexSetBuilder { |
2494 | self.builder.dfa_size_limit(bytes); |
2495 | self |
2496 | } |
2497 | |
2498 | /// Set the nesting limit for this parser. |
2499 | /// |
2500 | /// The nesting limit controls how deep the abstract syntax tree is |
2501 | /// allowed to be. If the AST exceeds the given limit (e.g., with too |
2502 | /// many nested groups), then an error is returned by the parser. |
2503 | /// |
2504 | /// The purpose of this limit is to act as a heuristic to prevent stack |
2505 | /// overflow for consumers that do structural induction on an AST using |
2506 | /// explicit recursion. While this crate never does this (instead using |
2507 | /// constant stack space and moving the call stack to the heap), other |
2508 | /// crates may. |
2509 | /// |
2510 | /// This limit is not checked until the entire AST is parsed. |
2511 | /// Therefore, if callers want to put a limit on the amount of heap |
2512 | /// space used, then they should impose a limit on the length, in |
2513 | /// bytes, of the concrete pattern string. In particular, this is |
2514 | /// viable since this parser implementation will limit itself to heap |
2515 | /// space proportional to the length of the pattern string. See also |
2516 | /// the [untrusted inputs](crate#untrusted-input) section in the |
2517 | /// top-level crate documentation for more information about this. |
2518 | /// |
2519 | /// Note that a nest limit of `0` will return a nest limit error for |
2520 | /// most patterns but not all. For example, a nest limit of `0` permits |
2521 | /// `a` but not `ab`, since `ab` requires an explicit concatenation, |
2522 | /// which results in a nest depth of `1`. In general, a nest limit is |
2523 | /// not something that manifests in an obvious way in the concrete |
2524 | /// syntax; therefore, it should not be used in a granular way. |
2525 | /// |
2526 | /// # Example |
2527 | /// |
2528 | /// ``` |
2529 | /// use regex::bytes::RegexSetBuilder; |
2530 | /// |
2531 | /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok()); |
2532 | /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err()); |
2533 | /// ``` |
2534 | pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder { |
2535 | self.builder.nest_limit(limit); |
2536 | self |
2537 | } |
2538 | } |
2539 | } |
2540 | |