| 1 | /*! |
| 2 | Utilities for dealing with the syntax of a regular expression. |
| 3 | |
| 4 | This module currently only exposes a [`Config`] type that |
| 5 | itself represents a wrapper around the configuration for a |
| 6 | [`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of |
| 7 | this wrapper is to make configuring syntax options very similar to how other |
| 8 | configuration is done throughout this crate. Namely, instead of duplicating |
| 9 | syntax options across every builder (of which there are many), we instead |
| 10 | create small config objects like this one that can be passed around and |
| 11 | composed. |
| 12 | */ |
| 13 | |
| 14 | use alloc::{vec, vec::Vec}; |
| 15 | |
| 16 | use regex_syntax::{ |
| 17 | ast, |
| 18 | hir::{self, Hir}, |
| 19 | Error, ParserBuilder, |
| 20 | }; |
| 21 | |
| 22 | /// A convenience routine for parsing a pattern into an HIR value with the |
| 23 | /// default configuration. |
| 24 | /// |
| 25 | /// # Example |
| 26 | /// |
| 27 | /// This shows how to parse a pattern into an HIR value: |
| 28 | /// |
| 29 | /// ``` |
| 30 | /// use regex_automata::util::syntax; |
| 31 | /// |
| 32 | /// let hir = syntax::parse(r"([a-z]+)|([0-9]+)" )?; |
| 33 | /// assert_eq!(Some(1), hir.properties().static_explicit_captures_len()); |
| 34 | /// |
| 35 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 36 | /// ``` |
| 37 | pub fn parse(pattern: &str) -> Result<Hir, Error> { |
| 38 | parse_with(pattern, &Config::default()) |
| 39 | } |
| 40 | |
| 41 | /// A convenience routine for parsing many patterns into HIR value with the |
| 42 | /// default configuration. |
| 43 | /// |
| 44 | /// # Example |
| 45 | /// |
| 46 | /// This shows how to parse many patterns into an corresponding HIR values: |
| 47 | /// |
| 48 | /// ``` |
| 49 | /// use { |
| 50 | /// regex_automata::util::syntax, |
| 51 | /// regex_syntax::hir::Properties, |
| 52 | /// }; |
| 53 | /// |
| 54 | /// let hirs = syntax::parse_many(&[ |
| 55 | /// r"([a-z]+)|([0-9]+)" , |
| 56 | /// r"foo(A-Z]+)bar" , |
| 57 | /// ])?; |
| 58 | /// let props = Properties::union(hirs.iter().map(|h| h.properties())); |
| 59 | /// assert_eq!(Some(1), props.static_explicit_captures_len()); |
| 60 | /// |
| 61 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 62 | /// ``` |
| 63 | pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> { |
| 64 | parse_many_with(patterns, &Config::default()) |
| 65 | } |
| 66 | |
| 67 | /// A convenience routine for parsing a pattern into an HIR value using a |
| 68 | /// `Config`. |
| 69 | /// |
| 70 | /// # Example |
| 71 | /// |
| 72 | /// This shows how to parse a pattern into an HIR value with a non-default |
| 73 | /// configuration: |
| 74 | /// |
| 75 | /// ``` |
| 76 | /// use regex_automata::util::syntax; |
| 77 | /// |
| 78 | /// let hir = syntax::parse_with( |
| 79 | /// r"^[a-z]+$" , |
| 80 | /// &syntax::Config::new().multi_line(true).crlf(true), |
| 81 | /// )?; |
| 82 | /// assert!(hir.properties().look_set().contains_anchor_crlf()); |
| 83 | /// |
| 84 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 85 | /// ``` |
| 86 | pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> { |
| 87 | let mut builder: ParserBuilder = ParserBuilder::new(); |
| 88 | config.apply(&mut builder); |
| 89 | builder.build().parse(pattern) |
| 90 | } |
| 91 | |
| 92 | /// A convenience routine for parsing many patterns into HIR values using a |
| 93 | /// `Config`. |
| 94 | /// |
| 95 | /// # Example |
| 96 | /// |
| 97 | /// This shows how to parse many patterns into an corresponding HIR values |
| 98 | /// with a non-default configuration: |
| 99 | /// |
| 100 | /// ``` |
| 101 | /// use { |
| 102 | /// regex_automata::util::syntax, |
| 103 | /// regex_syntax::hir::Properties, |
| 104 | /// }; |
| 105 | /// |
| 106 | /// let patterns = &[ |
| 107 | /// r"([a-z]+)|([0-9]+)" , |
| 108 | /// r"\W" , |
| 109 | /// r"foo(A-Z]+)bar" , |
| 110 | /// ]; |
| 111 | /// let config = syntax::Config::new().unicode(false).utf8(false); |
| 112 | /// let hirs = syntax::parse_many_with(patterns, &config)?; |
| 113 | /// let props = Properties::union(hirs.iter().map(|h| h.properties())); |
| 114 | /// assert!(!props.is_utf8()); |
| 115 | /// |
| 116 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 117 | /// ``` |
| 118 | pub fn parse_many_with<P: AsRef<str>>( |
| 119 | patterns: &[P], |
| 120 | config: &Config, |
| 121 | ) -> Result<Vec<Hir>, Error> { |
| 122 | let mut builder: ParserBuilder = ParserBuilder::new(); |
| 123 | config.apply(&mut builder); |
| 124 | let mut hirs: Vec = vec![]; |
| 125 | for p: &P in patterns.iter() { |
| 126 | hirs.push(builder.build().parse(pattern:p.as_ref())?); |
| 127 | } |
| 128 | Ok(hirs) |
| 129 | } |
| 130 | |
/// A common set of configuration options that apply to the syntax of a regex.
///
/// The options collected here govern how the concrete syntax of a regular
/// expression is interpreted. They are generally forwarded to the
/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
/// in the
/// [`regex-syntax`](https://docs.rs/regex-syntax)
/// crate when building a regex from its concrete syntax directly.
///
/// Since these options apply to every regex engine in this crate, they are
/// bundled into this one type rather than being re-defined on every engine's
/// builder.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    // Default state of the `i` flag.
    case_insensitive: bool,
    // Default state of the `m` flag.
    multi_line: bool,
    // Default state of the `s` flag.
    dot_matches_new_line: bool,
    // Default state of the `R` (CRLF mode) flag.
    crlf: bool,
    // The byte treated as the line terminator.
    line_terminator: u8,
    // Default state of the `U` flag.
    swap_greed: bool,
    // Default state of the `x` (verbose mode) flag.
    ignore_whitespace: bool,
    // Default state of the `u` flag.
    unicode: bool,
    // Whether UTF-8 mode is enabled.
    utf8: bool,
    // The nesting limit handed to the parser.
    nest_limit: u32,
    // Whether octal syntax is supported.
    octal: bool,
}
| 158 | |
| 159 | impl Config { |
| 160 | /// Return a new default syntax configuration. |
| 161 | pub fn new() -> Config { |
| 162 | // These defaults match the ones used in regex-syntax. |
| 163 | Config { |
| 164 | case_insensitive: false, |
| 165 | multi_line: false, |
| 166 | dot_matches_new_line: false, |
| 167 | crlf: false, |
| 168 | line_terminator: b' \n' , |
| 169 | swap_greed: false, |
| 170 | ignore_whitespace: false, |
| 171 | unicode: true, |
| 172 | utf8: true, |
| 173 | nest_limit: 250, |
| 174 | octal: false, |
| 175 | } |
| 176 | } |
| 177 | |
| 178 | /// Enable or disable the case insensitive flag by default. |
| 179 | /// |
| 180 | /// When Unicode mode is enabled, case insensitivity is Unicode-aware. |
| 181 | /// Specifically, it will apply the "simple" case folding rules as |
| 182 | /// specified by Unicode. |
| 183 | /// |
| 184 | /// By default this is disabled. It may alternatively be selectively |
| 185 | /// enabled in the regular expression itself via the `i` flag. |
| 186 | pub fn case_insensitive(mut self, yes: bool) -> Config { |
| 187 | self.case_insensitive = yes; |
| 188 | self |
| 189 | } |
| 190 | |
| 191 | /// Enable or disable the multi-line matching flag by default. |
| 192 | /// |
| 193 | /// When this is enabled, the `^` and `$` look-around assertions will |
| 194 | /// match immediately after and immediately before a new line character, |
| 195 | /// respectively. Note that the `\A` and `\z` look-around assertions are |
| 196 | /// unaffected by this setting and always correspond to matching at the |
| 197 | /// beginning and end of the input. |
| 198 | /// |
| 199 | /// By default this is disabled. It may alternatively be selectively |
| 200 | /// enabled in the regular expression itself via the `m` flag. |
| 201 | pub fn multi_line(mut self, yes: bool) -> Config { |
| 202 | self.multi_line = yes; |
| 203 | self |
| 204 | } |
| 205 | |
| 206 | /// Enable or disable the "dot matches any character" flag by default. |
| 207 | /// |
| 208 | /// When this is enabled, `.` will match any character. When it's disabled, |
| 209 | /// then `.` will match any character except for a new line character. |
| 210 | /// |
| 211 | /// Note that `.` is impacted by whether the "unicode" setting is enabled |
| 212 | /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 |
| 213 | /// encoding of any Unicode scalar value (sans a new line, depending on |
| 214 | /// whether this "dot matches new line" option is enabled). When Unicode |
| 215 | /// mode is disabled, `.` will match any byte instead. Because of this, |
| 216 | /// when Unicode mode is disabled, `.` can only be used when the "allow |
| 217 | /// invalid UTF-8" option is enabled, since `.` could otherwise match |
| 218 | /// invalid UTF-8. |
| 219 | /// |
| 220 | /// By default this is disabled. It may alternatively be selectively |
| 221 | /// enabled in the regular expression itself via the `s` flag. |
| 222 | pub fn dot_matches_new_line(mut self, yes: bool) -> Config { |
| 223 | self.dot_matches_new_line = yes; |
| 224 | self |
| 225 | } |
| 226 | |
| 227 | /// Enable or disable the "CRLF mode" flag by default. |
| 228 | /// |
| 229 | /// By default this is disabled. It may alternatively be selectively |
| 230 | /// enabled in the regular expression itself via the `R` flag. |
| 231 | /// |
| 232 | /// When CRLF mode is enabled, the following happens: |
| 233 | /// |
| 234 | /// * Unless `dot_matches_new_line` is enabled, `.` will match any character |
| 235 | /// except for `\r` and `\n`. |
| 236 | /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, |
| 237 | /// `\r` and `\n` as line terminators. And in particular, neither will |
| 238 | /// match between a `\r` and a `\n`. |
| 239 | pub fn crlf(mut self, yes: bool) -> Config { |
| 240 | self.crlf = yes; |
| 241 | self |
| 242 | } |
| 243 | |
| 244 | /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. |
| 245 | /// |
| 246 | /// Namely, instead of `.` (by default) matching everything except for `\n`, |
| 247 | /// this will cause `.` to match everything except for the byte given. |
| 248 | /// |
| 249 | /// If `.` is used in a context where Unicode mode is enabled and this byte |
| 250 | /// isn't ASCII, then an error will be returned. When Unicode mode is |
| 251 | /// disabled, then any byte is permitted, but will return an error if UTF-8 |
| 252 | /// mode is enabled and it is a non-ASCII byte. |
| 253 | /// |
| 254 | /// In short, any ASCII value for a line terminator is always okay. But a |
| 255 | /// non-ASCII byte might result in an error depending on whether Unicode |
| 256 | /// mode or UTF-8 mode are enabled. |
| 257 | /// |
| 258 | /// Note that if `R` mode is enabled then it always takes precedence and |
| 259 | /// the line terminator will be treated as `\r` and `\n` simultaneously. |
| 260 | /// |
| 261 | /// Note also that this *doesn't* impact the look-around assertions |
| 262 | /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional |
| 263 | /// configuration in the regex engine itself. |
| 264 | pub fn line_terminator(mut self, byte: u8) -> Config { |
| 265 | self.line_terminator = byte; |
| 266 | self |
| 267 | } |
| 268 | |
| 269 | /// Enable or disable the "swap greed" flag by default. |
| 270 | /// |
| 271 | /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` |
| 272 | /// will become greedy. |
| 273 | /// |
| 274 | /// By default this is disabled. It may alternatively be selectively |
| 275 | /// enabled in the regular expression itself via the `U` flag. |
| 276 | pub fn swap_greed(mut self, yes: bool) -> Config { |
| 277 | self.swap_greed = yes; |
| 278 | self |
| 279 | } |
| 280 | |
| 281 | /// Enable verbose mode in the regular expression. |
| 282 | /// |
| 283 | /// When enabled, verbose mode permits insigificant whitespace in many |
| 284 | /// places in the regular expression, as well as comments. Comments are |
| 285 | /// started using `#` and continue until the end of the line. |
| 286 | /// |
| 287 | /// By default, this is disabled. It may be selectively enabled in the |
| 288 | /// regular expression by using the `x` flag regardless of this setting. |
| 289 | pub fn ignore_whitespace(mut self, yes: bool) -> Config { |
| 290 | self.ignore_whitespace = yes; |
| 291 | self |
| 292 | } |
| 293 | |
| 294 | /// Enable or disable the Unicode flag (`u`) by default. |
| 295 | /// |
| 296 | /// By default this is **enabled**. It may alternatively be selectively |
| 297 | /// disabled in the regular expression itself via the `u` flag. |
| 298 | /// |
| 299 | /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by |
| 300 | /// default), a regular expression will fail to parse if Unicode mode is |
| 301 | /// disabled and a sub-expression could possibly match invalid UTF-8. |
| 302 | /// |
| 303 | /// **WARNING**: Unicode mode can greatly increase the size of the compiled |
| 304 | /// DFA, which can noticeably impact both memory usage and compilation |
| 305 | /// time. This is especially noticeable if your regex contains character |
| 306 | /// classes like `\w` that are impacted by whether Unicode is enabled or |
| 307 | /// not. If Unicode is not necessary, you are encouraged to disable it. |
| 308 | pub fn unicode(mut self, yes: bool) -> Config { |
| 309 | self.unicode = yes; |
| 310 | self |
| 311 | } |
| 312 | |
| 313 | /// When disabled, the builder will permit the construction of a regular |
| 314 | /// expression that may match invalid UTF-8. |
| 315 | /// |
| 316 | /// For example, when [`Config::unicode`] is disabled, then |
| 317 | /// expressions like `[^a]` may match invalid UTF-8 since they can match |
| 318 | /// any single byte that is not `a`. By default, these sub-expressions |
| 319 | /// are disallowed to avoid returning offsets that split a UTF-8 |
| 320 | /// encoded codepoint. However, in cases where matching at arbitrary |
| 321 | /// locations is desired, this option can be disabled to permit all such |
| 322 | /// sub-expressions. |
| 323 | /// |
| 324 | /// When enabled (the default), the builder is guaranteed to produce a |
| 325 | /// regex that will only ever match valid UTF-8 (otherwise, the builder |
| 326 | /// will return an error). |
| 327 | pub fn utf8(mut self, yes: bool) -> Config { |
| 328 | self.utf8 = yes; |
| 329 | self |
| 330 | } |
| 331 | |
| 332 | /// Set the nesting limit used for the regular expression parser. |
| 333 | /// |
| 334 | /// The nesting limit controls how deep the abstract syntax tree is allowed |
| 335 | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
| 336 | /// groups), then an error is returned by the parser. |
| 337 | /// |
| 338 | /// The purpose of this limit is to act as a heuristic to prevent stack |
| 339 | /// overflow when building a finite automaton from a regular expression's |
| 340 | /// abstract syntax tree. In particular, construction currently uses |
| 341 | /// recursion. In the future, the implementation may stop using recursion |
| 342 | /// and this option will no longer be necessary. |
| 343 | /// |
| 344 | /// This limit is not checked until the entire AST is parsed. Therefore, |
| 345 | /// if callers want to put a limit on the amount of heap space used, then |
| 346 | /// they should impose a limit on the length, in bytes, of the concrete |
| 347 | /// pattern string. In particular, this is viable since the parser will |
| 348 | /// limit itself to heap space proportional to the length of the pattern |
| 349 | /// string. |
| 350 | /// |
| 351 | /// Note that a nest limit of `0` will return a nest limit error for most |
| 352 | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
| 353 | /// not `ab`, since `ab` requires a concatenation AST item, which results |
| 354 | /// in a nest depth of `1`. In general, a nest limit is not something that |
| 355 | /// manifests in an obvious way in the concrete syntax, therefore, it |
| 356 | /// should not be used in a granular way. |
| 357 | pub fn nest_limit(mut self, limit: u32) -> Config { |
| 358 | self.nest_limit = limit; |
| 359 | self |
| 360 | } |
| 361 | |
| 362 | /// Whether to support octal syntax or not. |
| 363 | /// |
| 364 | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
| 365 | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
| 366 | /// `\141` are all equivalent regular expressions, where the last example |
| 367 | /// shows octal syntax. |
| 368 | /// |
| 369 | /// While supporting octal syntax isn't in and of itself a problem, it does |
| 370 | /// make good error messages harder. That is, in PCRE based regex engines, |
| 371 | /// syntax like `\1` invokes a backreference, which is explicitly |
| 372 | /// unsupported in Rust's regex engine. However, many users expect it to |
| 373 | /// be supported. Therefore, when octal support is disabled, the error |
| 374 | /// message will explicitly mention that backreferences aren't supported. |
| 375 | /// |
| 376 | /// Octal syntax is disabled by default. |
| 377 | pub fn octal(mut self, yes: bool) -> Config { |
| 378 | self.octal = yes; |
| 379 | self |
| 380 | } |
| 381 | |
| 382 | /// Returns whether "unicode" mode is enabled. |
| 383 | pub fn get_unicode(&self) -> bool { |
| 384 | self.unicode |
| 385 | } |
| 386 | |
| 387 | /// Returns whether "case insensitive" mode is enabled. |
| 388 | pub fn get_case_insensitive(&self) -> bool { |
| 389 | self.case_insensitive |
| 390 | } |
| 391 | |
| 392 | /// Returns whether "multi line" mode is enabled. |
| 393 | pub fn get_multi_line(&self) -> bool { |
| 394 | self.multi_line |
| 395 | } |
| 396 | |
| 397 | /// Returns whether "dot matches new line" mode is enabled. |
| 398 | pub fn get_dot_matches_new_line(&self) -> bool { |
| 399 | self.dot_matches_new_line |
| 400 | } |
| 401 | |
| 402 | /// Returns whether "CRLF" mode is enabled. |
| 403 | pub fn get_crlf(&self) -> bool { |
| 404 | self.crlf |
| 405 | } |
| 406 | |
| 407 | /// Returns the line terminator in this syntax configuration. |
| 408 | pub fn get_line_terminator(&self) -> u8 { |
| 409 | self.line_terminator |
| 410 | } |
| 411 | |
| 412 | /// Returns whether "swap greed" mode is enabled. |
| 413 | pub fn get_swap_greed(&self) -> bool { |
| 414 | self.swap_greed |
| 415 | } |
| 416 | |
| 417 | /// Returns whether "ignore whitespace" mode is enabled. |
| 418 | pub fn get_ignore_whitespace(&self) -> bool { |
| 419 | self.ignore_whitespace |
| 420 | } |
| 421 | |
| 422 | /// Returns whether UTF-8 mode is enabled. |
| 423 | pub fn get_utf8(&self) -> bool { |
| 424 | self.utf8 |
| 425 | } |
| 426 | |
| 427 | /// Returns the "nest limit" setting. |
| 428 | pub fn get_nest_limit(&self) -> u32 { |
| 429 | self.nest_limit |
| 430 | } |
| 431 | |
| 432 | /// Returns whether "octal" mode is enabled. |
| 433 | pub fn get_octal(&self) -> bool { |
| 434 | self.octal |
| 435 | } |
| 436 | |
| 437 | /// Applies this configuration to the given parser. |
| 438 | pub(crate) fn apply(&self, builder: &mut ParserBuilder) { |
| 439 | builder |
| 440 | .unicode(self.unicode) |
| 441 | .case_insensitive(self.case_insensitive) |
| 442 | .multi_line(self.multi_line) |
| 443 | .dot_matches_new_line(self.dot_matches_new_line) |
| 444 | .crlf(self.crlf) |
| 445 | .line_terminator(self.line_terminator) |
| 446 | .swap_greed(self.swap_greed) |
| 447 | .ignore_whitespace(self.ignore_whitespace) |
| 448 | .utf8(self.utf8) |
| 449 | .nest_limit(self.nest_limit) |
| 450 | .octal(self.octal); |
| 451 | } |
| 452 | |
| 453 | /// Applies this configuration to the given AST parser. |
| 454 | pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { |
| 455 | builder |
| 456 | .ignore_whitespace(self.ignore_whitespace) |
| 457 | .nest_limit(self.nest_limit) |
| 458 | .octal(self.octal); |
| 459 | } |
| 460 | |
| 461 | /// Applies this configuration to the given AST-to-HIR translator. |
| 462 | pub(crate) fn apply_hir( |
| 463 | &self, |
| 464 | builder: &mut hir::translate::TranslatorBuilder, |
| 465 | ) { |
| 466 | builder |
| 467 | .unicode(self.unicode) |
| 468 | .case_insensitive(self.case_insensitive) |
| 469 | .multi_line(self.multi_line) |
| 470 | .crlf(self.crlf) |
| 471 | .dot_matches_new_line(self.dot_matches_new_line) |
| 472 | .line_terminator(self.line_terminator) |
| 473 | .swap_greed(self.swap_greed) |
| 474 | .utf8(self.utf8); |
| 475 | } |
| 476 | } |
| 477 | |
| 478 | impl Default for Config { |
| 479 | fn default() -> Config { |
| 480 | Config::new() |
| 481 | } |
| 482 | } |
| 483 | |