1 | /*! |
2 | Utilities for dealing with the syntax of a regular expression. |
3 | |
This module exposes a [`Config`] type that itself represents a wrapper around
the configuration for a
[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder), along with the
convenience routines [`parse`], [`parse_many`], [`parse_with`] and
[`parse_many_with`] for parsing patterns into HIR values. The purpose of the
`Config` wrapper is to make configuring syntax options very similar to how
other configuration is done throughout this crate. Namely, instead of
duplicating syntax options across every builder (of which there are many), we
instead create small config objects like this one that can be passed around
and composed.
12 | */ |
13 | |
14 | use alloc::{vec, vec::Vec}; |
15 | |
16 | use regex_syntax::{ |
17 | ast, |
18 | hir::{self, Hir}, |
19 | Error, ParserBuilder, |
20 | }; |
21 | |
22 | /// A convenience routine for parsing a pattern into an HIR value with the |
23 | /// default configuration. |
24 | /// |
25 | /// # Example |
26 | /// |
27 | /// This shows how to parse a pattern into an HIR value: |
28 | /// |
29 | /// ``` |
30 | /// use regex_automata::util::syntax; |
31 | /// |
32 | /// let hir = syntax::parse(r"([a-z]+)|([0-9]+)" )?; |
33 | /// assert_eq!(Some(1), hir.properties().static_explicit_captures_len()); |
34 | /// |
35 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
36 | /// ``` |
37 | pub fn parse(pattern: &str) -> Result<Hir, Error> { |
38 | parse_with(pattern, &Config::default()) |
39 | } |
40 | |
41 | /// A convenience routine for parsing many patterns into HIR value with the |
42 | /// default configuration. |
43 | /// |
44 | /// # Example |
45 | /// |
46 | /// This shows how to parse many patterns into an corresponding HIR values: |
47 | /// |
48 | /// ``` |
49 | /// use { |
50 | /// regex_automata::util::syntax, |
51 | /// regex_syntax::hir::Properties, |
52 | /// }; |
53 | /// |
54 | /// let hirs = syntax::parse_many(&[ |
55 | /// r"([a-z]+)|([0-9]+)" , |
56 | /// r"foo(A-Z]+)bar" , |
57 | /// ])?; |
58 | /// let props = Properties::union(hirs.iter().map(|h| h.properties())); |
59 | /// assert_eq!(Some(1), props.static_explicit_captures_len()); |
60 | /// |
61 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
62 | /// ``` |
63 | pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> { |
64 | parse_many_with(patterns, &Config::default()) |
65 | } |
66 | |
67 | /// A convenience routine for parsing a pattern into an HIR value using a |
68 | /// `Config`. |
69 | /// |
70 | /// # Example |
71 | /// |
72 | /// This shows how to parse a pattern into an HIR value with a non-default |
73 | /// configuration: |
74 | /// |
75 | /// ``` |
76 | /// use regex_automata::util::syntax; |
77 | /// |
78 | /// let hir = syntax::parse_with( |
79 | /// r"^[a-z]+$" , |
80 | /// &syntax::Config::new().multi_line(true).crlf(true), |
81 | /// )?; |
82 | /// assert!(hir.properties().look_set().contains_anchor_crlf()); |
83 | /// |
84 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
85 | /// ``` |
86 | pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> { |
87 | let mut builder = ParserBuilder::new(); |
88 | config.apply(&mut builder); |
89 | builder.build().parse(pattern) |
90 | } |
91 | |
92 | /// A convenience routine for parsing many patterns into HIR values using a |
93 | /// `Config`. |
94 | /// |
95 | /// # Example |
96 | /// |
97 | /// This shows how to parse many patterns into an corresponding HIR values |
98 | /// with a non-default configuration: |
99 | /// |
100 | /// ``` |
101 | /// use { |
102 | /// regex_automata::util::syntax, |
103 | /// regex_syntax::hir::Properties, |
104 | /// }; |
105 | /// |
106 | /// let patterns = &[ |
107 | /// r"([a-z]+)|([0-9]+)" , |
108 | /// r"\W" , |
109 | /// r"foo(A-Z]+)bar" , |
110 | /// ]; |
111 | /// let config = syntax::Config::new().unicode(false).utf8(false); |
112 | /// let hirs = syntax::parse_many_with(patterns, &config)?; |
113 | /// let props = Properties::union(hirs.iter().map(|h| h.properties())); |
114 | /// assert!(!props.is_utf8()); |
115 | /// |
116 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
117 | /// ``` |
118 | pub fn parse_many_with<P: AsRef<str>>( |
119 | patterns: &[P], |
120 | config: &Config, |
121 | ) -> Result<Vec<Hir>, Error> { |
122 | let mut builder = ParserBuilder::new(); |
123 | config.apply(&mut builder); |
124 | let mut hirs = vec![]; |
125 | for p in patterns.iter() { |
126 | hirs.push(builder.build().parse(p.as_ref())?); |
127 | } |
128 | Ok(hirs) |
129 | } |
130 | |
131 | /// A common set of configuration options that apply to the syntax of a regex. |
132 | /// |
133 | /// This represents a group of configuration options that specifically apply |
134 | /// to how the concrete syntax of a regular expression is interpreted. In |
135 | /// particular, they are generally forwarded to the |
136 | /// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html) |
137 | /// in the |
138 | /// [`regex-syntax`](https://docs.rs/regex-syntax) |
139 | /// crate when building a regex from its concrete syntax directly. |
140 | /// |
141 | /// These options are defined as a group since they apply to every regex engine |
142 | /// in this crate. Instead of re-defining them on every engine's builder, they |
143 | /// are instead provided here as one cohesive unit. |
144 | #[derive(Clone, Copy, Debug)] |
145 | pub struct Config { |
146 | case_insensitive: bool, |
147 | multi_line: bool, |
148 | dot_matches_new_line: bool, |
149 | crlf: bool, |
150 | line_terminator: u8, |
151 | swap_greed: bool, |
152 | ignore_whitespace: bool, |
153 | unicode: bool, |
154 | utf8: bool, |
155 | nest_limit: u32, |
156 | octal: bool, |
157 | } |
158 | |
159 | impl Config { |
160 | /// Return a new default syntax configuration. |
161 | pub fn new() -> Config { |
162 | // These defaults match the ones used in regex-syntax. |
163 | Config { |
164 | case_insensitive: false, |
165 | multi_line: false, |
166 | dot_matches_new_line: false, |
167 | crlf: false, |
168 | line_terminator: b' \n' , |
169 | swap_greed: false, |
170 | ignore_whitespace: false, |
171 | unicode: true, |
172 | utf8: true, |
173 | nest_limit: 250, |
174 | octal: false, |
175 | } |
176 | } |
177 | |
178 | /// Enable or disable the case insensitive flag by default. |
179 | /// |
180 | /// When Unicode mode is enabled, case insensitivity is Unicode-aware. |
181 | /// Specifically, it will apply the "simple" case folding rules as |
182 | /// specified by Unicode. |
183 | /// |
184 | /// By default this is disabled. It may alternatively be selectively |
185 | /// enabled in the regular expression itself via the `i` flag. |
186 | pub fn case_insensitive(mut self, yes: bool) -> Config { |
187 | self.case_insensitive = yes; |
188 | self |
189 | } |
190 | |
191 | /// Enable or disable the multi-line matching flag by default. |
192 | /// |
193 | /// When this is enabled, the `^` and `$` look-around assertions will |
194 | /// match immediately after and immediately before a new line character, |
195 | /// respectively. Note that the `\A` and `\z` look-around assertions are |
196 | /// unaffected by this setting and always correspond to matching at the |
197 | /// beginning and end of the input. |
198 | /// |
199 | /// By default this is disabled. It may alternatively be selectively |
200 | /// enabled in the regular expression itself via the `m` flag. |
201 | pub fn multi_line(mut self, yes: bool) -> Config { |
202 | self.multi_line = yes; |
203 | self |
204 | } |
205 | |
206 | /// Enable or disable the "dot matches any character" flag by default. |
207 | /// |
208 | /// When this is enabled, `.` will match any character. When it's disabled, |
209 | /// then `.` will match any character except for a new line character. |
210 | /// |
211 | /// Note that `.` is impacted by whether the "unicode" setting is enabled |
212 | /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 |
213 | /// encoding of any Unicode scalar value (sans a new line, depending on |
214 | /// whether this "dot matches new line" option is enabled). When Unicode |
215 | /// mode is disabled, `.` will match any byte instead. Because of this, |
216 | /// when Unicode mode is disabled, `.` can only be used when the "allow |
217 | /// invalid UTF-8" option is enabled, since `.` could otherwise match |
218 | /// invalid UTF-8. |
219 | /// |
220 | /// By default this is disabled. It may alternatively be selectively |
221 | /// enabled in the regular expression itself via the `s` flag. |
222 | pub fn dot_matches_new_line(mut self, yes: bool) -> Config { |
223 | self.dot_matches_new_line = yes; |
224 | self |
225 | } |
226 | |
227 | /// Enable or disable the "CRLF mode" flag by default. |
228 | /// |
229 | /// By default this is disabled. It may alternatively be selectively |
230 | /// enabled in the regular expression itself via the `R` flag. |
231 | /// |
232 | /// When CRLF mode is enabled, the following happens: |
233 | /// |
234 | /// * Unless `dot_matches_new_line` is enabled, `.` will match any character |
235 | /// except for `\r` and `\n`. |
236 | /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, |
237 | /// `\r` and `\n` as line terminators. And in particular, neither will |
238 | /// match between a `\r` and a `\n`. |
239 | pub fn crlf(mut self, yes: bool) -> Config { |
240 | self.crlf = yes; |
241 | self |
242 | } |
243 | |
244 | /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. |
245 | /// |
246 | /// Namely, instead of `.` (by default) matching everything except for `\n`, |
247 | /// this will cause `.` to match everything except for the byte given. |
248 | /// |
249 | /// If `.` is used in a context where Unicode mode is enabled and this byte |
250 | /// isn't ASCII, then an error will be returned. When Unicode mode is |
251 | /// disabled, then any byte is permitted, but will return an error if UTF-8 |
252 | /// mode is enabled and it is a non-ASCII byte. |
253 | /// |
254 | /// In short, any ASCII value for a line terminator is always okay. But a |
255 | /// non-ASCII byte might result in an error depending on whether Unicode |
256 | /// mode or UTF-8 mode are enabled. |
257 | /// |
258 | /// Note that if `R` mode is enabled then it always takes precedence and |
259 | /// the line terminator will be treated as `\r` and `\n` simultaneously. |
260 | /// |
261 | /// Note also that this *doesn't* impact the look-around assertions |
262 | /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional |
263 | /// configuration in the regex engine itself. |
264 | pub fn line_terminator(mut self, byte: u8) -> Config { |
265 | self.line_terminator = byte; |
266 | self |
267 | } |
268 | |
269 | /// Enable or disable the "swap greed" flag by default. |
270 | /// |
271 | /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` |
272 | /// will become greedy. |
273 | /// |
274 | /// By default this is disabled. It may alternatively be selectively |
275 | /// enabled in the regular expression itself via the `U` flag. |
276 | pub fn swap_greed(mut self, yes: bool) -> Config { |
277 | self.swap_greed = yes; |
278 | self |
279 | } |
280 | |
281 | /// Enable verbose mode in the regular expression. |
282 | /// |
283 | /// When enabled, verbose mode permits insigificant whitespace in many |
284 | /// places in the regular expression, as well as comments. Comments are |
285 | /// started using `#` and continue until the end of the line. |
286 | /// |
287 | /// By default, this is disabled. It may be selectively enabled in the |
288 | /// regular expression by using the `x` flag regardless of this setting. |
289 | pub fn ignore_whitespace(mut self, yes: bool) -> Config { |
290 | self.ignore_whitespace = yes; |
291 | self |
292 | } |
293 | |
294 | /// Enable or disable the Unicode flag (`u`) by default. |
295 | /// |
296 | /// By default this is **enabled**. It may alternatively be selectively |
297 | /// disabled in the regular expression itself via the `u` flag. |
298 | /// |
299 | /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by |
300 | /// default), a regular expression will fail to parse if Unicode mode is |
301 | /// disabled and a sub-expression could possibly match invalid UTF-8. |
302 | /// |
303 | /// **WARNING**: Unicode mode can greatly increase the size of the compiled |
304 | /// DFA, which can noticeably impact both memory usage and compilation |
305 | /// time. This is especially noticeable if your regex contains character |
306 | /// classes like `\w` that are impacted by whether Unicode is enabled or |
307 | /// not. If Unicode is not necessary, you are encouraged to disable it. |
308 | pub fn unicode(mut self, yes: bool) -> Config { |
309 | self.unicode = yes; |
310 | self |
311 | } |
312 | |
313 | /// When disabled, the builder will permit the construction of a regular |
314 | /// expression that may match invalid UTF-8. |
315 | /// |
316 | /// For example, when [`Config::unicode`] is disabled, then |
317 | /// expressions like `[^a]` may match invalid UTF-8 since they can match |
318 | /// any single byte that is not `a`. By default, these sub-expressions |
319 | /// are disallowed to avoid returning offsets that split a UTF-8 |
320 | /// encoded codepoint. However, in cases where matching at arbitrary |
321 | /// locations is desired, this option can be disabled to permit all such |
322 | /// sub-expressions. |
323 | /// |
324 | /// When enabled (the default), the builder is guaranteed to produce a |
325 | /// regex that will only ever match valid UTF-8 (otherwise, the builder |
326 | /// will return an error). |
327 | pub fn utf8(mut self, yes: bool) -> Config { |
328 | self.utf8 = yes; |
329 | self |
330 | } |
331 | |
332 | /// Set the nesting limit used for the regular expression parser. |
333 | /// |
334 | /// The nesting limit controls how deep the abstract syntax tree is allowed |
335 | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
336 | /// groups), then an error is returned by the parser. |
337 | /// |
338 | /// The purpose of this limit is to act as a heuristic to prevent stack |
339 | /// overflow when building a finite automaton from a regular expression's |
340 | /// abstract syntax tree. In particular, construction currently uses |
341 | /// recursion. In the future, the implementation may stop using recursion |
342 | /// and this option will no longer be necessary. |
343 | /// |
344 | /// This limit is not checked until the entire AST is parsed. Therefore, |
345 | /// if callers want to put a limit on the amount of heap space used, then |
346 | /// they should impose a limit on the length, in bytes, of the concrete |
347 | /// pattern string. In particular, this is viable since the parser will |
348 | /// limit itself to heap space proportional to the length of the pattern |
349 | /// string. |
350 | /// |
351 | /// Note that a nest limit of `0` will return a nest limit error for most |
352 | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
353 | /// not `ab`, since `ab` requires a concatenation AST item, which results |
354 | /// in a nest depth of `1`. In general, a nest limit is not something that |
355 | /// manifests in an obvious way in the concrete syntax, therefore, it |
356 | /// should not be used in a granular way. |
357 | pub fn nest_limit(mut self, limit: u32) -> Config { |
358 | self.nest_limit = limit; |
359 | self |
360 | } |
361 | |
362 | /// Whether to support octal syntax or not. |
363 | /// |
364 | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
365 | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
366 | /// `\141` are all equivalent regular expressions, where the last example |
367 | /// shows octal syntax. |
368 | /// |
369 | /// While supporting octal syntax isn't in and of itself a problem, it does |
370 | /// make good error messages harder. That is, in PCRE based regex engines, |
371 | /// syntax like `\1` invokes a backreference, which is explicitly |
372 | /// unsupported in Rust's regex engine. However, many users expect it to |
373 | /// be supported. Therefore, when octal support is disabled, the error |
374 | /// message will explicitly mention that backreferences aren't supported. |
375 | /// |
376 | /// Octal syntax is disabled by default. |
377 | pub fn octal(mut self, yes: bool) -> Config { |
378 | self.octal = yes; |
379 | self |
380 | } |
381 | |
382 | /// Returns whether "unicode" mode is enabled. |
383 | pub fn get_unicode(&self) -> bool { |
384 | self.unicode |
385 | } |
386 | |
387 | /// Returns whether "case insensitive" mode is enabled. |
388 | pub fn get_case_insensitive(&self) -> bool { |
389 | self.case_insensitive |
390 | } |
391 | |
392 | /// Returns whether "multi line" mode is enabled. |
393 | pub fn get_multi_line(&self) -> bool { |
394 | self.multi_line |
395 | } |
396 | |
397 | /// Returns whether "dot matches new line" mode is enabled. |
398 | pub fn get_dot_matches_new_line(&self) -> bool { |
399 | self.dot_matches_new_line |
400 | } |
401 | |
402 | /// Returns whether "CRLF" mode is enabled. |
403 | pub fn get_crlf(&self) -> bool { |
404 | self.crlf |
405 | } |
406 | |
407 | /// Returns the line terminator in this syntax configuration. |
408 | pub fn get_line_terminator(&self) -> u8 { |
409 | self.line_terminator |
410 | } |
411 | |
412 | /// Returns whether "swap greed" mode is enabled. |
413 | pub fn get_swap_greed(&self) -> bool { |
414 | self.swap_greed |
415 | } |
416 | |
417 | /// Returns whether "ignore whitespace" mode is enabled. |
418 | pub fn get_ignore_whitespace(&self) -> bool { |
419 | self.ignore_whitespace |
420 | } |
421 | |
422 | /// Returns whether UTF-8 mode is enabled. |
423 | pub fn get_utf8(&self) -> bool { |
424 | self.utf8 |
425 | } |
426 | |
427 | /// Returns the "nest limit" setting. |
428 | pub fn get_nest_limit(&self) -> u32 { |
429 | self.nest_limit |
430 | } |
431 | |
432 | /// Returns whether "octal" mode is enabled. |
433 | pub fn get_octal(&self) -> bool { |
434 | self.octal |
435 | } |
436 | |
437 | /// Applies this configuration to the given parser. |
438 | pub(crate) fn apply(&self, builder: &mut ParserBuilder) { |
439 | builder |
440 | .unicode(self.unicode) |
441 | .case_insensitive(self.case_insensitive) |
442 | .multi_line(self.multi_line) |
443 | .dot_matches_new_line(self.dot_matches_new_line) |
444 | .crlf(self.crlf) |
445 | .line_terminator(self.line_terminator) |
446 | .swap_greed(self.swap_greed) |
447 | .ignore_whitespace(self.ignore_whitespace) |
448 | .utf8(self.utf8) |
449 | .nest_limit(self.nest_limit) |
450 | .octal(self.octal); |
451 | } |
452 | |
453 | /// Applies this configuration to the given AST parser. |
454 | pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { |
455 | builder |
456 | .ignore_whitespace(self.ignore_whitespace) |
457 | .nest_limit(self.nest_limit) |
458 | .octal(self.octal); |
459 | } |
460 | |
461 | /// Applies this configuration to the given AST-to-HIR translator. |
462 | pub(crate) fn apply_hir( |
463 | &self, |
464 | builder: &mut hir::translate::TranslatorBuilder, |
465 | ) { |
466 | builder |
467 | .unicode(self.unicode) |
468 | .case_insensitive(self.case_insensitive) |
469 | .multi_line(self.multi_line) |
470 | .crlf(self.crlf) |
471 | .dot_matches_new_line(self.dot_matches_new_line) |
472 | .line_terminator(self.line_terminator) |
473 | .swap_greed(self.swap_greed) |
474 | .utf8(self.utf8); |
475 | } |
476 | } |
477 | |
478 | impl Default for Config { |
479 | fn default() -> Config { |
480 | Config::new() |
481 | } |
482 | } |
483 | |