| 1 | use core::cell::{Cell, RefCell}; |
| 2 | |
| 3 | use alloc::{ |
| 4 | boxed::Box, |
| 5 | string::{String, ToString}, |
| 6 | vec, |
| 7 | vec::Vec, |
| 8 | }; |
| 9 | |
| 10 | use crate::{ |
| 11 | error::Error, |
| 12 | hir::{self, Config, Flags, Hir, HirKind}, |
| 13 | }; |
| 14 | |
| 15 | // These are all of the errors that can occur while parsing a regex. Unlike |
| 16 | // regex-syntax, our errors are not particularly great. They are just enough |
| 17 | // to get a general sense of what went wrong. But in exchange, the error |
| 18 | // reporting mechanism is *much* simpler than what's in regex-syntax. |
| 19 | // |
| 20 | // By convention, we use each of these messages in exactly one place. That |
| 21 | // way, every branch that leads to an error has a unique message. This in turn |
| 22 | // means that given a message, one can precisely identify which part of the |
| 23 | // parser reported it. |
| 24 | // |
| 25 | // Finally, we give names to each message so that we can reference them in |
| 26 | // tests. |
| 27 | const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting" ; |
| 28 | const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups" ; |
| 29 | const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name" ; |
| 30 | const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'" ; |
| 31 | const ERR_UNCLOSED_GROUP_QUESTION: &str = |
| 32 | "expected closing ')', but got end of pattern" ; |
| 33 | const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('" ; |
| 34 | const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported" ; |
| 35 | const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed" ; |
| 36 | const ERR_MISSING_GROUP_NAME: &str = |
| 37 | "expected capture group name, but got end of pattern" ; |
| 38 | const ERR_INVALID_GROUP_NAME: &str = "invalid group name" ; |
| 39 | const ERR_UNCLOSED_GROUP_NAME: &str = |
| 40 | "expected end of capture group name, but got end of pattern" ; |
| 41 | const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed" ; |
| 42 | const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag" ; |
| 43 | const ERR_FLAG_REPEATED_NEGATION: &str = |
| 44 | "inline flag negation cannot be repeated" ; |
| 45 | const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed" ; |
| 46 | const ERR_FLAG_UNEXPECTED_EOF: &str = |
| 47 | "expected ':' or ')' to end inline flags, but got end of pattern" ; |
| 48 | const ERR_FLAG_DANGLING_NEGATION: &str = |
| 49 | "inline flags cannot end with negation directive" ; |
| 50 | const ERR_DECIMAL_NO_DIGITS: &str = |
| 51 | "expected decimal number, but found no digits" ; |
| 52 | const ERR_DECIMAL_INVALID: &str = "got invalid decimal number" ; |
| 53 | const ERR_HEX_BRACE_INVALID_DIGIT: &str = |
| 54 | "expected hexadecimal number in braces, but got non-hex digit" ; |
| 55 | const ERR_HEX_BRACE_UNEXPECTED_EOF: &str = |
| 56 | "expected hexadecimal number, but saw end of pattern before closing brace" ; |
| 57 | const ERR_HEX_BRACE_EMPTY: &str = |
| 58 | "expected hexadecimal number in braces, but got no digits" ; |
| 59 | const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces" ; |
| 60 | const ERR_HEX_FIXED_UNEXPECTED_EOF: &str = |
| 61 | "expected fixed length hexadecimal number, but saw end of pattern first" ; |
| 62 | const ERR_HEX_FIXED_INVALID_DIGIT: &str = |
| 63 | "expected fixed length hexadecimal number, but got non-hex digit" ; |
| 64 | const ERR_HEX_FIXED_INVALID: &str = |
| 65 | "got invalid fixed length hexadecimal number" ; |
| 66 | const ERR_HEX_UNEXPECTED_EOF: &str = |
| 67 | "expected hexadecimal number, but saw end of pattern first" ; |
| 68 | const ERR_ESCAPE_UNEXPECTED_EOF: &str = |
| 69 | "saw start of escape sequence, but saw end of pattern before it finished" ; |
| 70 | const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported" ; |
| 71 | const ERR_UNICODE_CLASS_UNSUPPORTED: &str = |
| 72 | "Unicode character classes are not supported" ; |
| 73 | const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence" ; |
| 74 | const ERR_POSIX_CLASS_UNRECOGNIZED: &str = |
| 75 | "unrecognized POSIX character class" ; |
| 76 | const ERR_UNCOUNTED_REP_SUB_MISSING: &str = |
| 77 | "uncounted repetition operator must be applied to a sub-expression" ; |
| 78 | const ERR_COUNTED_REP_SUB_MISSING: &str = |
| 79 | "counted repetition operator must be applied to a sub-expression" ; |
| 80 | const ERR_COUNTED_REP_UNCLOSED: &str = |
| 81 | "found unclosed counted repetition operator" ; |
| 82 | const ERR_COUNTED_REP_MIN_UNCLOSED: &str = |
| 83 | "found incomplete and unclosed counted repetition operator" ; |
| 84 | const ERR_COUNTED_REP_COMMA_UNCLOSED: &str = |
| 85 | "found counted repetition operator with a comma that is unclosed" ; |
| 86 | const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str = |
| 87 | "found counted repetition with min and max that is unclosed" ; |
| 88 | const ERR_COUNTED_REP_INVALID: &str = |
| 89 | "expected closing brace for counted repetition, but got something else" ; |
| 90 | const ERR_COUNTED_REP_INVALID_RANGE: &str = |
| 91 | "found counted repetition with a min bigger than its max" ; |
| 92 | const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str = |
| 93 | "non-empty character class has no closing bracket" ; |
| 94 | const ERR_CLASS_INVALID_RANGE_ITEM: &str = |
| 95 | "character class ranges must start and end with a single character" ; |
| 96 | const ERR_CLASS_INVALID_ITEM: &str = |
| 97 | "invalid escape sequence in character class" ; |
| 98 | const ERR_CLASS_UNCLOSED_AFTER_DASH: &str = |
| 99 | "non-empty character class has no closing bracket after dash" ; |
| 100 | const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str = |
| 101 | "negated character class has no closing bracket" ; |
| 102 | const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str = |
| 103 | "character class begins with literal ']' but has no closing bracket" ; |
| 104 | const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class" ; |
| 105 | const ERR_CLASS_UNCLOSED: &str = "found unclosed character class" ; |
| 106 | const ERR_CLASS_NEST_UNSUPPORTED: &str = |
| 107 | "nested character classes are not supported" ; |
| 108 | const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str = |
| 109 | "character class intersection is not supported" ; |
| 110 | const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = |
| 111 | "character class difference is not supported" ; |
| 112 | const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = |
| 113 | "character class symmetric difference is not supported" ; |
| 114 | const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = |
| 115 | "special word boundary assertion is unclosed or has an invalid character" ; |
| 116 | const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = |
| 117 | "special word boundary assertion is unrecognized" ; |
| 118 | const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = |
| 119 | "found start of special word boundary or repetition without an end" ; |
| 120 | |
| 121 | /// A regular expression parser. |
| 122 | /// |
| 123 | /// This parses a string representation of a regular expression into an |
| 124 | /// abstract syntax tree. The size of the tree is proportional to the length |
| 125 | /// of the regular expression pattern. |
| 126 | /// |
| 127 | /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. |
| 128 | #[derive (Clone, Debug)] |
| 129 | pub(super) struct Parser<'a> { |
| 130 | /// The configuration of the parser as given by the caller. |
| 131 | config: Config, |
| 132 | /// The pattern we're parsing as given by the caller. |
| 133 | pattern: &'a str, |
| 134 | /// The call depth of the parser. This is incremented for each |
| 135 | /// sub-expression parsed. Its peak value is the maximum nesting of the |
| 136 | /// pattern. |
| 137 | depth: Cell<u32>, |
| 138 | /// The current position of the parser. |
| 139 | pos: Cell<usize>, |
| 140 | /// The current codepoint of the parser. The codepoint corresponds to the |
| 141 | /// codepoint encoded in `pattern` beginning at `pos`. |
| 142 | /// |
| 143 | /// This is `None` if and only if `pos == pattern.len()`. |
| 144 | char: Cell<Option<char>>, |
| 145 | /// The current capture index. |
| 146 | capture_index: Cell<u32>, |
| 147 | /// The flags that are currently set. |
| 148 | flags: RefCell<Flags>, |
| 149 | /// A sorted sequence of capture names. This is used to detect duplicate |
| 150 | /// capture names and report an error if one is detected. |
| 151 | capture_names: RefCell<Vec<String>>, |
| 152 | } |
| 153 | |
| 154 | /// The constructor and a variety of helper routines. |
| 155 | impl<'a> Parser<'a> { |
| 156 | /// Build a parser from this configuration with the given pattern. |
| 157 | pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> { |
| 158 | Parser { |
| 159 | config, |
| 160 | pattern, |
| 161 | depth: Cell::new(0), |
| 162 | pos: Cell::new(0), |
| 163 | char: Cell::new(pattern.chars().next()), |
| 164 | capture_index: Cell::new(0), |
| 165 | flags: RefCell::new(config.flags), |
| 166 | capture_names: RefCell::new(vec![]), |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | /// Returns the full pattern string that we're parsing. |
| 171 | fn pattern(&self) -> &str { |
| 172 | self.pattern |
| 173 | } |
| 174 | |
| 175 | /// Return the current byte offset of the parser. |
| 176 | /// |
| 177 | /// The offset starts at `0` from the beginning of the regular expression |
| 178 | /// pattern string. |
| 179 | fn pos(&self) -> usize { |
| 180 | self.pos.get() |
| 181 | } |
| 182 | |
| 183 | /// Increments the call depth of the parser. |
| 184 | /// |
| 185 | /// If the call depth would exceed the configured nest limit, then this |
| 186 | /// returns an error. |
| 187 | /// |
| 188 | /// This returns the old depth. |
| 189 | fn increment_depth(&self) -> Result<u32, Error> { |
| 190 | let old = self.depth.get(); |
| 191 | if old > self.config.nest_limit { |
| 192 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
| 193 | } |
| 194 | // OK because our depth starts at 0, and we return an error if it |
| 195 | // ever reaches the limit. So the call depth can never exceed u32::MAX. |
| 196 | let new = old.checked_add(1).unwrap(); |
| 197 | self.depth.set(new); |
| 198 | Ok(old) |
| 199 | } |
| 200 | |
| 201 | /// Decrements the call depth of the parser. |
| 202 | /// |
| 203 | /// This panics if the current depth is 0. |
| 204 | fn decrement_depth(&self) { |
| 205 | let old = self.depth.get(); |
| 206 | // If this fails then the caller has a bug in how they're incrementing |
| 207 | // and decrementing the depth of the parser's call stack. |
| 208 | let new = old.checked_sub(1).unwrap(); |
| 209 | self.depth.set(new); |
| 210 | } |
| 211 | |
| 212 | /// Return the codepoint at the current position of the parser. |
| 213 | /// |
| 214 | /// This panics if the parser is positioned at the end of the pattern. |
| 215 | fn char(&self) -> char { |
| 216 | self.char.get().expect("codepoint, but parser is done" ) |
| 217 | } |
| 218 | |
| 219 | /// Returns true if the next call to `bump` would return false. |
| 220 | fn is_done(&self) -> bool { |
| 221 | self.pos() == self.pattern.len() |
| 222 | } |
| 223 | |
| 224 | /// Returns the flags that are current set for this regex. |
| 225 | fn flags(&self) -> Flags { |
| 226 | *self.flags.borrow() |
| 227 | } |
| 228 | |
| 229 | /// Bump the parser to the next Unicode scalar value. |
| 230 | /// |
| 231 | /// If the end of the input has been reached, then `false` is returned. |
| 232 | fn bump(&self) -> bool { |
| 233 | if self.is_done() { |
| 234 | return false; |
| 235 | } |
| 236 | self.pos.set(self.pos() + self.char().len_utf8()); |
| 237 | self.char.set(self.pattern()[self.pos()..].chars().next()); |
| 238 | self.char.get().is_some() |
| 239 | } |
| 240 | |
| 241 | /// If the substring starting at the current position of the parser has |
| 242 | /// the given prefix, then bump the parser to the character immediately |
| 243 | /// following the prefix and return true. Otherwise, don't bump the parser |
| 244 | /// and return false. |
| 245 | fn bump_if(&self, prefix: &str) -> bool { |
| 246 | if self.pattern()[self.pos()..].starts_with(prefix) { |
| 247 | for _ in 0..prefix.chars().count() { |
| 248 | self.bump(); |
| 249 | } |
| 250 | true |
| 251 | } else { |
| 252 | false |
| 253 | } |
| 254 | } |
| 255 | |
| 256 | /// Bump the parser, and if the `x` flag is enabled, bump through any |
| 257 | /// subsequent spaces. Return true if and only if the parser is not done. |
| 258 | fn bump_and_bump_space(&self) -> bool { |
| 259 | if !self.bump() { |
| 260 | return false; |
| 261 | } |
| 262 | self.bump_space(); |
| 263 | !self.is_done() |
| 264 | } |
| 265 | |
| 266 | /// If the `x` flag is enabled (i.e., whitespace insensitivity with |
| 267 | /// comments), then this will advance the parser through all whitespace |
| 268 | /// and comments to the next non-whitespace non-comment byte. |
| 269 | /// |
| 270 | /// If the `x` flag is disabled, then this is a no-op. |
| 271 | /// |
| 272 | /// This should be used selectively throughout the parser where |
| 273 | /// arbitrary whitespace is permitted when the `x` flag is enabled. For |
| 274 | /// example, `{ 5 , 6}` is equivalent to `{5,6}`. |
| 275 | fn bump_space(&self) { |
| 276 | if !self.flags().ignore_whitespace { |
| 277 | return; |
| 278 | } |
| 279 | while !self.is_done() { |
| 280 | if self.char().is_whitespace() { |
| 281 | self.bump(); |
| 282 | } else if self.char() == '#' { |
| 283 | self.bump(); |
| 284 | while !self.is_done() { |
| 285 | let c = self.char(); |
| 286 | self.bump(); |
| 287 | if c == ' \n' { |
| 288 | break; |
| 289 | } |
| 290 | } |
| 291 | } else { |
| 292 | break; |
| 293 | } |
| 294 | } |
| 295 | } |
| 296 | |
| 297 | /// Peek at the next character in the input without advancing the parser. |
| 298 | /// |
| 299 | /// If the input has been exhausted, then this returns `None`. |
| 300 | fn peek(&self) -> Option<char> { |
| 301 | if self.is_done() { |
| 302 | return None; |
| 303 | } |
| 304 | self.pattern()[self.pos() + self.char().len_utf8()..].chars().next() |
| 305 | } |
| 306 | |
| 307 | /// Peeks at the next character in the pattern from the current offset, and |
| 308 | /// will ignore spaces when the parser is in whitespace insensitive mode. |
| 309 | fn peek_space(&self) -> Option<char> { |
| 310 | if !self.flags().ignore_whitespace { |
| 311 | return self.peek(); |
| 312 | } |
| 313 | if self.is_done() { |
| 314 | return None; |
| 315 | } |
| 316 | let mut start = self.pos() + self.char().len_utf8(); |
| 317 | let mut in_comment = false; |
| 318 | for (i, ch) in self.pattern()[start..].char_indices() { |
| 319 | if ch.is_whitespace() { |
| 320 | continue; |
| 321 | } else if !in_comment && ch == '#' { |
| 322 | in_comment = true; |
| 323 | } else if in_comment && ch == ' \n' { |
| 324 | in_comment = false; |
| 325 | } else { |
| 326 | start += i; |
| 327 | break; |
| 328 | } |
| 329 | } |
| 330 | self.pattern()[start..].chars().next() |
| 331 | } |
| 332 | |
| 333 | /// Return the next capturing index. Each subsequent call increments the |
| 334 | /// internal index. Since the way capture indices are computed is a public |
| 335 | /// API guarantee, use of this routine depends on the parser being depth |
| 336 | /// first and left-to-right. |
| 337 | /// |
| 338 | /// If the capture limit is exceeded, then an error is returned. |
| 339 | fn next_capture_index(&self) -> Result<u32, Error> { |
| 340 | let current = self.capture_index.get(); |
| 341 | let next = current |
| 342 | .checked_add(1) |
| 343 | .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?; |
| 344 | self.capture_index.set(next); |
| 345 | Ok(next) |
| 346 | } |
| 347 | |
| 348 | /// Adds the given capture name to this parser. If this capture name has |
| 349 | /// already been used, then an error is returned. |
| 350 | fn add_capture_name(&self, name: &str) -> Result<(), Error> { |
| 351 | let mut names = self.capture_names.borrow_mut(); |
| 352 | match names.binary_search_by(|n| name.cmp(n)) { |
| 353 | Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)), |
| 354 | Err(i) => { |
| 355 | names.insert(i, name.to_string()); |
| 356 | Ok(()) |
| 357 | } |
| 358 | } |
| 359 | } |
| 360 | |
| 361 | /// Returns true if and only if the parser is positioned at a look-around |
| 362 | /// prefix. The conditions under which this returns true must always |
| 363 | /// correspond to a regular expression that would otherwise be consider |
| 364 | /// invalid. |
| 365 | /// |
| 366 | /// This should only be called immediately after parsing the opening of |
| 367 | /// a group or a set of flags. |
| 368 | fn is_lookaround_prefix(&self) -> bool { |
| 369 | self.bump_if("?=" ) |
| 370 | || self.bump_if("?!" ) |
| 371 | || self.bump_if("?<=" ) |
| 372 | || self.bump_if("?<!" ) |
| 373 | } |
| 374 | } |
| 375 | |
| 376 | /// The actual parser. We try to break out each kind of regex syntax into its |
| 377 | /// own routine. |
| 378 | impl<'a> Parser<'a> { |
| 379 | pub(super) fn parse(&self) -> Result<Hir, Error> { |
| 380 | let hir = self.parse_inner()?; |
| 381 | // While we also check nesting during parsing, that only checks the |
| 382 | // number of recursive parse calls. It does not necessarily cover |
| 383 | // all possible recursive nestings of the Hir itself. For example, |
| 384 | // repetition operators don't require recursive parse calls. So one |
| 385 | // can stack them arbitrarily without overflowing the stack in the |
| 386 | // *parser*. But then if one recurses over the resulting Hir, a stack |
| 387 | // overflow is possible. So here we check the Hir nesting level |
| 388 | // thoroughly to ensure it isn't nested too deeply. |
| 389 | // |
| 390 | // Note that we do still need the nesting limit check in the parser as |
| 391 | // well, since that will avoid overflowing the stack during parse time |
| 392 | // before the complete Hir value is constructed. |
| 393 | check_hir_nesting(&hir, self.config.nest_limit)?; |
| 394 | Ok(hir) |
| 395 | } |
| 396 | |
| 397 | fn parse_inner(&self) -> Result<Hir, Error> { |
| 398 | let depth = self.increment_depth()?; |
| 399 | let mut alternates = vec![]; |
| 400 | let mut concat = vec![]; |
| 401 | loop { |
| 402 | self.bump_space(); |
| 403 | if self.is_done() { |
| 404 | break; |
| 405 | } |
| 406 | match self.char() { |
| 407 | '(' => { |
| 408 | // Save the old flags and reset them only when we close |
| 409 | // the group. |
| 410 | let oldflags = *self.flags.borrow(); |
| 411 | if let Some(sub) = self.parse_group()? { |
| 412 | concat.push(sub); |
| 413 | // We only reset them here because if 'parse_group' |
| 414 | // returns None, then that means it handled a flag |
| 415 | // directive, e.g., '(?ism)'. And the whole point is |
| 416 | // that those flags remain active until either disabled |
| 417 | // or the end of the pattern or current group. |
| 418 | *self.flags.borrow_mut() = oldflags; |
| 419 | } |
| 420 | if self.char.get() != Some(')' ) { |
| 421 | return Err(Error::new(ERR_UNCLOSED_GROUP)); |
| 422 | } |
| 423 | self.bump(); |
| 424 | } |
| 425 | ')' => { |
| 426 | if depth == 0 { |
| 427 | return Err(Error::new(ERR_UNOPENED_GROUP)); |
| 428 | } |
| 429 | break; |
| 430 | } |
| 431 | '|' => { |
| 432 | alternates.push(Hir::concat(core::mem::take(&mut concat))); |
| 433 | self.bump(); |
| 434 | } |
| 435 | '[' => concat.push(self.parse_class()?), |
| 436 | '?' | '*' | '+' => { |
| 437 | concat = self.parse_uncounted_repetition(concat)?; |
| 438 | } |
| 439 | '{' => { |
| 440 | concat = self.parse_counted_repetition(concat)?; |
| 441 | } |
| 442 | _ => concat.push(self.parse_primitive()?), |
| 443 | } |
| 444 | } |
| 445 | self.decrement_depth(); |
| 446 | alternates.push(Hir::concat(concat)); |
| 447 | // N.B. This strips off the "alternation" if there's only one branch. |
| 448 | Ok(Hir::alternation(alternates)) |
| 449 | } |
| 450 | |
| 451 | /// Parses a "primitive" pattern. A primitive is any expression that does |
| 452 | /// not contain any sub-expressions. |
| 453 | /// |
| 454 | /// This assumes the parser is pointing at the beginning of the primitive. |
| 455 | fn parse_primitive(&self) -> Result<Hir, Error> { |
| 456 | let ch = self.char(); |
| 457 | self.bump(); |
| 458 | match ch { |
| 459 | ' \\' => self.parse_escape(), |
| 460 | '.' => Ok(self.hir_dot()), |
| 461 | '^' => Ok(self.hir_anchor_start()), |
| 462 | '$' => Ok(self.hir_anchor_end()), |
| 463 | ch => Ok(self.hir_char(ch)), |
| 464 | } |
| 465 | } |
| 466 | |
| 467 | /// Parse an escape sequence. This always results in a "primitive" HIR, |
| 468 | /// that is, an HIR with no sub-expressions. |
| 469 | /// |
| 470 | /// This assumes the parser is positioned at the start of the sequence, |
| 471 | /// immediately *after* the `\`. It advances the parser to the first |
| 472 | /// position immediately following the escape sequence. |
| 473 | fn parse_escape(&self) -> Result<Hir, Error> { |
| 474 | if self.is_done() { |
| 475 | return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF)); |
| 476 | } |
| 477 | let ch = self.char(); |
| 478 | // Put some of the more complicated routines into helpers. |
| 479 | match ch { |
| 480 | '0' ..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)), |
| 481 | 'p' | 'P' => { |
| 482 | return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED)) |
| 483 | } |
| 484 | 'x' | 'u' | 'U' => return self.parse_hex(), |
| 485 | 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { |
| 486 | return Ok(self.parse_perl_class()); |
| 487 | } |
| 488 | _ => {} |
| 489 | } |
| 490 | |
| 491 | // Handle all of the one letter sequences inline. |
| 492 | self.bump(); |
| 493 | if hir::is_meta_character(ch) || hir::is_escapeable_character(ch) { |
| 494 | return Ok(self.hir_char(ch)); |
| 495 | } |
| 496 | let special = |ch| Ok(self.hir_char(ch)); |
| 497 | match ch { |
| 498 | 'a' => special(' \x07' ), |
| 499 | 'f' => special(' \x0C' ), |
| 500 | 't' => special(' \t' ), |
| 501 | 'n' => special(' \n' ), |
| 502 | 'r' => special(' \r' ), |
| 503 | 'v' => special(' \x0B' ), |
| 504 | 'A' => Ok(Hir::look(hir::Look::Start)), |
| 505 | 'z' => Ok(Hir::look(hir::Look::End)), |
| 506 | 'b' => { |
| 507 | let mut hir = Hir::look(hir::Look::Word); |
| 508 | if !self.is_done() && self.char() == '{' { |
| 509 | if let Some(special) = |
| 510 | self.maybe_parse_special_word_boundary()? |
| 511 | { |
| 512 | hir = special; |
| 513 | } |
| 514 | } |
| 515 | Ok(hir) |
| 516 | } |
| 517 | 'B' => Ok(Hir::look(hir::Look::WordNegate)), |
| 518 | '<' => Ok(Hir::look(hir::Look::WordStart)), |
| 519 | '>' => Ok(Hir::look(hir::Look::WordEnd)), |
| 520 | _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)), |
| 521 | } |
| 522 | } |
| 523 | |
| 524 | /// Attempt to parse a specialty word boundary. That is, `\b{start}`, |
| 525 | /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. |
| 526 | /// |
| 527 | /// This is similar to `maybe_parse_ascii_class` in that, in most cases, |
| 528 | /// if it fails it will just return `None` with no error. This is done |
| 529 | /// because `\b{5}` is a valid expression and we want to let that be parsed |
| 530 | /// by the existing counted repetition parsing code. (I thought about just |
| 531 | /// invoking the counted repetition code from here, but it seemed a little |
| 532 | /// ham-fisted.) |
| 533 | /// |
| 534 | /// Unlike `maybe_parse_ascii_class` though, this can return an error. |
| 535 | /// Namely, if we definitely know it isn't a counted repetition, then we |
| 536 | /// return an error specific to the specialty word boundaries. |
| 537 | /// |
| 538 | /// This assumes the parser is positioned at a `{` immediately following |
| 539 | /// a `\b`. When `None` is returned, the parser is returned to the position |
| 540 | /// at which it started: pointing at a `{`. |
| 541 | /// |
| 542 | /// The position given should correspond to the start of the `\b`. |
| 543 | fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> { |
| 544 | assert_eq!(self.char(), '{' ); |
| 545 | |
| 546 | let is_valid_char = |c| match c { |
| 547 | 'A' ..='Z' | 'a' ..='z' | '-' => true, |
| 548 | _ => false, |
| 549 | }; |
| 550 | let start = self.pos(); |
| 551 | if !self.bump_and_bump_space() { |
| 552 | return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF)); |
| 553 | } |
| 554 | // This is one of the critical bits: if the first non-whitespace |
| 555 | // character isn't in [-A-Za-z] (i.e., this can't be a special word |
| 556 | // boundary), then we bail and let the counted repetition parser deal |
| 557 | // with this. |
| 558 | if !is_valid_char(self.char()) { |
| 559 | self.pos.set(start); |
| 560 | self.char.set(Some('{' )); |
| 561 | return Ok(None); |
| 562 | } |
| 563 | |
| 564 | // Now collect up our chars until we see a '}'. |
| 565 | let mut scratch = String::new(); |
| 566 | while !self.is_done() && is_valid_char(self.char()) { |
| 567 | scratch.push(self.char()); |
| 568 | self.bump_and_bump_space(); |
| 569 | } |
| 570 | if self.is_done() || self.char() != '}' { |
| 571 | return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED)); |
| 572 | } |
| 573 | self.bump(); |
| 574 | let kind = match scratch.as_str() { |
| 575 | "start" => hir::Look::WordStart, |
| 576 | "end" => hir::Look::WordEnd, |
| 577 | "start-half" => hir::Look::WordStartHalf, |
| 578 | "end-half" => hir::Look::WordEndHalf, |
| 579 | _ => { |
| 580 | return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED)) |
| 581 | } |
| 582 | }; |
| 583 | Ok(Some(Hir::look(kind))) |
| 584 | } |
| 585 | |
| 586 | /// Parse a hex representation of a Unicode codepoint. This handles both |
| 587 | /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to |
| 588 | /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to |
| 589 | /// the first character immediately following the hexadecimal literal. |
| 590 | fn parse_hex(&self) -> Result<Hir, Error> { |
| 591 | let digit_len = match self.char() { |
| 592 | 'x' => 2, |
| 593 | 'u' => 4, |
| 594 | 'U' => 8, |
| 595 | unk => unreachable!( |
| 596 | "invalid start of fixed length hexadecimal number {}" , |
| 597 | unk |
| 598 | ), |
| 599 | }; |
| 600 | if !self.bump_and_bump_space() { |
| 601 | return Err(Error::new(ERR_HEX_UNEXPECTED_EOF)); |
| 602 | } |
| 603 | if self.char() == '{' { |
| 604 | self.parse_hex_brace() |
| 605 | } else { |
| 606 | self.parse_hex_digits(digit_len) |
| 607 | } |
| 608 | } |
| 609 | |
| 610 | /// Parse an N-digit hex representation of a Unicode codepoint. This |
| 611 | /// expects the parser to be positioned at the first digit and will advance |
| 612 | /// the parser to the first character immediately following the escape |
| 613 | /// sequence. |
| 614 | /// |
| 615 | /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) |
| 616 | /// or 8 (for `\UNNNNNNNN`). |
| 617 | fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> { |
| 618 | let mut scratch = String::new(); |
| 619 | for i in 0..digit_len { |
| 620 | if i > 0 && !self.bump_and_bump_space() { |
| 621 | return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF)); |
| 622 | } |
| 623 | if !is_hex(self.char()) { |
| 624 | return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT)); |
| 625 | } |
| 626 | scratch.push(self.char()); |
| 627 | } |
| 628 | // The final bump just moves the parser past the literal, which may |
| 629 | // be EOF. |
| 630 | self.bump_and_bump_space(); |
| 631 | match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
| 632 | None => Err(Error::new(ERR_HEX_FIXED_INVALID)), |
| 633 | Some(ch) => Ok(self.hir_char(ch)), |
| 634 | } |
| 635 | } |
| 636 | |
| 637 | /// Parse a hex representation of any Unicode scalar value. This expects |
| 638 | /// the parser to be positioned at the opening brace `{` and will advance |
| 639 | /// the parser to the first character following the closing brace `}`. |
| 640 | fn parse_hex_brace(&self) -> Result<Hir, Error> { |
| 641 | let mut scratch = String::new(); |
| 642 | while self.bump_and_bump_space() && self.char() != '}' { |
| 643 | if !is_hex(self.char()) { |
| 644 | return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT)); |
| 645 | } |
| 646 | scratch.push(self.char()); |
| 647 | } |
| 648 | if self.is_done() { |
| 649 | return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF)); |
| 650 | } |
| 651 | assert_eq!(self.char(), '}' ); |
| 652 | self.bump_and_bump_space(); |
| 653 | |
| 654 | if scratch.is_empty() { |
| 655 | return Err(Error::new(ERR_HEX_BRACE_EMPTY)); |
| 656 | } |
| 657 | match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) { |
| 658 | None => Err(Error::new(ERR_HEX_BRACE_INVALID)), |
| 659 | Some(ch) => Ok(self.hir_char(ch)), |
| 660 | } |
| 661 | } |
| 662 | |
| 663 | /// Parse a decimal number into a u32 while trimming leading and trailing |
| 664 | /// whitespace. |
| 665 | /// |
| 666 | /// This expects the parser to be positioned at the first position where |
| 667 | /// a decimal digit could occur. This will advance the parser to the byte |
| 668 | /// immediately following the last contiguous decimal digit. |
| 669 | /// |
| 670 | /// If no decimal digit could be found or if there was a problem parsing |
| 671 | /// the complete set of digits into a u32, then an error is returned. |
| 672 | fn parse_decimal(&self) -> Result<u32, Error> { |
| 673 | let mut scratch = String::new(); |
| 674 | while !self.is_done() && self.char().is_whitespace() { |
| 675 | self.bump(); |
| 676 | } |
| 677 | while !self.is_done() && '0' <= self.char() && self.char() <= '9' { |
| 678 | scratch.push(self.char()); |
| 679 | self.bump_and_bump_space(); |
| 680 | } |
| 681 | while !self.is_done() && self.char().is_whitespace() { |
| 682 | self.bump_and_bump_space(); |
| 683 | } |
| 684 | let digits = scratch.as_str(); |
| 685 | if digits.is_empty() { |
| 686 | return Err(Error::new(ERR_DECIMAL_NO_DIGITS)); |
| 687 | } |
| 688 | match u32::from_str_radix(digits, 10).ok() { |
| 689 | Some(n) => Ok(n), |
| 690 | None => Err(Error::new(ERR_DECIMAL_INVALID)), |
| 691 | } |
| 692 | } |
| 693 | |
| 694 | /// Parses an uncounted repetition operator. An uncounted repetition |
| 695 | /// operator includes `?`, `*` and `+`, but does not include the `{m,n}` |
| 696 | /// syntax. The current character should be one of `?`, `*` or `+`. Any |
| 697 | /// other character will result in a panic. |
| 698 | /// |
| 699 | /// This assumes that the parser is currently positioned at the repetition |
| 700 | /// operator and advances the parser to the first character after the |
| 701 | /// operator. (Note that the operator may include a single additional `?`, |
| 702 | /// which makes the operator ungreedy.) |
| 703 | /// |
| 704 | /// The caller should include the concatenation that is being built. The |
| 705 | /// concatenation returned includes the repetition operator applied to the |
| 706 | /// last expression in the given concatenation. |
| 707 | /// |
| 708 | /// If the concatenation is empty, then this returns an error. |
| 709 | fn parse_uncounted_repetition( |
| 710 | &self, |
| 711 | mut concat: Vec<Hir>, |
| 712 | ) -> Result<Vec<Hir>, Error> { |
| 713 | let sub = match concat.pop() { |
| 714 | Some(hir) => Box::new(hir), |
| 715 | None => { |
| 716 | return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING)); |
| 717 | } |
| 718 | }; |
| 719 | let (min, max) = match self.char() { |
| 720 | '?' => (0, Some(1)), |
| 721 | '*' => (0, None), |
| 722 | '+' => (1, None), |
| 723 | unk => unreachable!("unrecognized repetition operator ' {}'" , unk), |
| 724 | }; |
| 725 | let mut greedy = true; |
| 726 | if self.bump() && self.char() == '?' { |
| 727 | greedy = false; |
| 728 | self.bump(); |
| 729 | } |
| 730 | if self.flags().swap_greed { |
| 731 | greedy = !greedy; |
| 732 | } |
| 733 | concat.push(Hir::repetition(hir::Repetition { |
| 734 | min, |
| 735 | max, |
| 736 | greedy, |
| 737 | sub, |
| 738 | })); |
| 739 | Ok(concat) |
| 740 | } |
| 741 | |
| 742 | /// Parses a counted repetition operation. A counted repetition operator |
| 743 | /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or |
| 744 | /// `+` operators. |
| 745 | /// |
| 746 | /// This assumes that the parser is currently at the opening `{` and |
| 747 | /// advances the parser to the first character after the operator. (Note |
| 748 | /// that the operator may include a single additional `?`, which makes the |
| 749 | /// operator ungreedy.) |
| 750 | /// |
| 751 | /// The caller should include the concatenation that is being built. The |
| 752 | /// concatenation returned includes the repetition operator applied to the |
| 753 | /// last expression in the given concatenation. |
| 754 | /// |
| 755 | /// If the concatenation is empty, then this returns an error. |
| 756 | fn parse_counted_repetition( |
| 757 | &self, |
| 758 | mut concat: Vec<Hir>, |
| 759 | ) -> Result<Vec<Hir>, Error> { |
| 760 | assert_eq!(self.char(), '{' , "expected opening brace" ); |
| 761 | let sub = match concat.pop() { |
| 762 | Some(hir) => Box::new(hir), |
| 763 | None => { |
| 764 | return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING)); |
| 765 | } |
| 766 | }; |
| 767 | if !self.bump_and_bump_space() { |
| 768 | return Err(Error::new(ERR_COUNTED_REP_UNCLOSED)); |
| 769 | } |
| 770 | let min = self.parse_decimal()?; |
| 771 | let mut max = Some(min); |
| 772 | if self.is_done() { |
| 773 | return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED)); |
| 774 | } |
| 775 | if self.char() == ',' { |
| 776 | if !self.bump_and_bump_space() { |
| 777 | return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED)); |
| 778 | } |
| 779 | if self.char() != '}' { |
| 780 | max = Some(self.parse_decimal()?); |
| 781 | } else { |
| 782 | max = None; |
| 783 | } |
| 784 | if self.is_done() { |
| 785 | return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED)); |
| 786 | } |
| 787 | } |
| 788 | if self.char() != '}' { |
| 789 | return Err(Error::new(ERR_COUNTED_REP_INVALID)); |
| 790 | } |
| 791 | |
| 792 | let mut greedy = true; |
| 793 | if self.bump_and_bump_space() && self.char() == '?' { |
| 794 | greedy = false; |
| 795 | self.bump(); |
| 796 | } |
| 797 | if self.flags().swap_greed { |
| 798 | greedy = !greedy; |
| 799 | } |
| 800 | |
| 801 | if max.map_or(false, |max| min > max) { |
| 802 | return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE)); |
| 803 | } |
| 804 | concat.push(Hir::repetition(hir::Repetition { |
| 805 | min, |
| 806 | max, |
| 807 | greedy, |
| 808 | sub, |
| 809 | })); |
| 810 | Ok(concat) |
| 811 | } |
| 812 | |
| 813 | /// Parses the part of a pattern that starts with a `(`. This is usually |
| 814 | /// a group sub-expression, but might just be a directive that enables |
| 815 | /// (or disables) certain flags. |
| 816 | /// |
| 817 | /// This assumes the parser is pointing at the opening `(`. |
| 818 | fn parse_group(&self) -> Result<Option<Hir>, Error> { |
| 819 | assert_eq!(self.char(), '(' ); |
| 820 | self.bump_and_bump_space(); |
| 821 | if self.is_lookaround_prefix() { |
| 822 | return Err(Error::new(ERR_LOOK_UNSUPPORTED)); |
| 823 | } |
| 824 | if self.bump_if("?P<" ) || self.bump_if("?<" ) { |
| 825 | let index = self.next_capture_index()?; |
| 826 | let name = Some(Box::from(self.parse_capture_name()?)); |
| 827 | let sub = Box::new(self.parse_inner()?); |
| 828 | let cap = hir::Capture { index, name, sub }; |
| 829 | Ok(Some(Hir::capture(cap))) |
| 830 | } else if self.bump_if("?" ) { |
| 831 | if self.is_done() { |
| 832 | return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION)); |
| 833 | } |
| 834 | let start = self.pos(); |
| 835 | // The flags get reset in the top-level 'parse' routine. |
| 836 | *self.flags.borrow_mut() = self.parse_flags()?; |
| 837 | let consumed = self.pos() - start; |
| 838 | if self.char() == ')' { |
| 839 | // We don't allow empty flags, e.g., `(?)`. |
| 840 | if consumed == 0 { |
| 841 | return Err(Error::new(ERR_EMPTY_FLAGS)); |
| 842 | } |
| 843 | Ok(None) |
| 844 | } else { |
| 845 | assert_eq!(':' , self.char()); |
| 846 | self.bump(); |
| 847 | self.parse_inner().map(Some) |
| 848 | } |
| 849 | } else { |
| 850 | let index = self.next_capture_index()?; |
| 851 | let sub = Box::new(self.parse_inner()?); |
| 852 | let cap = hir::Capture { index, name: None, sub }; |
| 853 | Ok(Some(Hir::capture(cap))) |
| 854 | } |
| 855 | } |
| 856 | |
| 857 | /// Parses a capture group name. Assumes that the parser is positioned at |
| 858 | /// the first character in the name following the opening `<` (and may |
| 859 | /// possibly be EOF). This advances the parser to the first character |
| 860 | /// following the closing `>`. |
| 861 | fn parse_capture_name(&self) -> Result<&str, Error> { |
| 862 | if self.is_done() { |
| 863 | return Err(Error::new(ERR_MISSING_GROUP_NAME)); |
| 864 | } |
| 865 | let start = self.pos(); |
| 866 | loop { |
| 867 | if self.char() == '>' { |
| 868 | break; |
| 869 | } |
| 870 | if !is_capture_char(self.char(), self.pos() == start) { |
| 871 | return Err(Error::new(ERR_INVALID_GROUP_NAME)); |
| 872 | } |
| 873 | if !self.bump() { |
| 874 | break; |
| 875 | } |
| 876 | } |
| 877 | let end = self.pos(); |
| 878 | if self.is_done() { |
| 879 | return Err(Error::new(ERR_UNCLOSED_GROUP_NAME)); |
| 880 | } |
| 881 | assert_eq!(self.char(), '>' ); |
| 882 | self.bump(); |
| 883 | let name = &self.pattern()[start..end]; |
| 884 | if name.is_empty() { |
| 885 | return Err(Error::new(ERR_EMPTY_GROUP_NAME)); |
| 886 | } |
| 887 | self.add_capture_name(name)?; |
| 888 | Ok(name) |
| 889 | } |
| 890 | |
| 891 | /// Parse a sequence of flags starting at the current character. |
| 892 | /// |
| 893 | /// This advances the parser to the character immediately following the |
| 894 | /// flags, which is guaranteed to be either `:` or `)`. |
| 895 | /// |
| 896 | /// # Errors |
| 897 | /// |
| 898 | /// If any flags are duplicated, then an error is returned. |
| 899 | /// |
| 900 | /// If the negation operator is used more than once, then an error is |
| 901 | /// returned. |
| 902 | /// |
| 903 | /// If no flags could be found or if the negation operation is not followed |
| 904 | /// by any flags, then an error is returned. |
| 905 | fn parse_flags(&self) -> Result<Flags, Error> { |
| 906 | let mut flags = *self.flags.borrow(); |
| 907 | let mut negate = false; |
| 908 | // Keeps track of whether the previous flag item was a '-'. We use this |
| 909 | // to detect whether there is a dangling '-', which is invalid. |
| 910 | let mut last_was_negation = false; |
| 911 | // A set to keep track of the flags we've seen. Since all flags are |
| 912 | // ASCII, we only need 128 bytes. |
| 913 | let mut seen = [false; 128]; |
| 914 | while self.char() != ':' && self.char() != ')' { |
| 915 | if self.char() == '-' { |
| 916 | last_was_negation = true; |
| 917 | if negate { |
| 918 | return Err(Error::new(ERR_FLAG_REPEATED_NEGATION)); |
| 919 | } |
| 920 | negate = true; |
| 921 | } else { |
| 922 | last_was_negation = false; |
| 923 | self.parse_flag(&mut flags, negate)?; |
| 924 | // OK because every valid flag is ASCII, and we're only here if |
| 925 | // the flag is valid. |
| 926 | let flag_byte = u8::try_from(self.char()).unwrap(); |
| 927 | if seen[usize::from(flag_byte)] { |
| 928 | return Err(Error::new(ERR_FLAG_DUPLICATE)); |
| 929 | } |
| 930 | seen[usize::from(flag_byte)] = true; |
| 931 | } |
| 932 | if !self.bump() { |
| 933 | return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF)); |
| 934 | } |
| 935 | } |
| 936 | if last_was_negation { |
| 937 | return Err(Error::new(ERR_FLAG_DANGLING_NEGATION)); |
| 938 | } |
| 939 | Ok(flags) |
| 940 | } |
| 941 | |
| 942 | /// Parse the current character as a flag. Do not advance the parser. |
| 943 | /// |
| 944 | /// This sets the appropriate boolean value in place on the set of flags |
| 945 | /// given. The boolean is inverted when `negate` is true. |
| 946 | /// |
| 947 | /// # Errors |
| 948 | /// |
| 949 | /// If the flag is not recognized, then an error is returned. |
| 950 | fn parse_flag( |
| 951 | &self, |
| 952 | flags: &mut Flags, |
| 953 | negate: bool, |
| 954 | ) -> Result<(), Error> { |
| 955 | let enabled = !negate; |
| 956 | match self.char() { |
| 957 | 'i' => flags.case_insensitive = enabled, |
| 958 | 'm' => flags.multi_line = enabled, |
| 959 | 's' => flags.dot_matches_new_line = enabled, |
| 960 | 'U' => flags.swap_greed = enabled, |
| 961 | 'R' => flags.crlf = enabled, |
| 962 | 'x' => flags.ignore_whitespace = enabled, |
| 963 | // We make a special exception for this flag where we let it |
| 964 | // through as a recognized flag, but treat it as a no-op. This in |
| 965 | // practice retains some compatibility with the regex crate. It is |
| 966 | // a little suspect to do this, but for example, '(?-u:\b).+' in |
| 967 | // the regex crate is equivalent to '\b.+' in regex-lite. |
| 968 | 'u' => {} |
| 969 | _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)), |
| 970 | } |
| 971 | Ok(()) |
| 972 | } |
| 973 | |
| 974 | /// Parse a standard character class consisting primarily of characters or |
| 975 | /// character ranges. |
| 976 | /// |
| 977 | /// This assumes the parser is positioned at the opening `[`. If parsing |
| 978 | /// is successful, then the parser is advanced to the position immediately |
| 979 | /// following the closing `]`. |
| 980 | fn parse_class(&self) -> Result<Hir, Error> { |
| 981 | assert_eq!(self.char(), '[' ); |
| 982 | |
| 983 | let mut union = vec![]; |
| 984 | if !self.bump_and_bump_space() { |
| 985 | return Err(Error::new(ERR_CLASS_UNCLOSED)); |
| 986 | } |
| 987 | // Determine whether the class is negated or not. |
| 988 | let negate = if self.char() != '^' { |
| 989 | false |
| 990 | } else { |
| 991 | if !self.bump_and_bump_space() { |
| 992 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION)); |
| 993 | } |
| 994 | true |
| 995 | }; |
| 996 | // Accept any number of `-` as literal `-`. |
| 997 | while self.char() == '-' { |
| 998 | union.push(hir::ClassRange { start: '-' , end: '-' }); |
| 999 | if !self.bump_and_bump_space() { |
| 1000 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
| 1001 | } |
| 1002 | } |
| 1003 | // If `]` is the *first* char in a set, then interpret it as a literal |
| 1004 | // `]`. That is, an empty class is impossible to write. |
| 1005 | if union.is_empty() && self.char() == ']' { |
| 1006 | union.push(hir::ClassRange { start: ']' , end: ']' }); |
| 1007 | if !self.bump_and_bump_space() { |
| 1008 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING)); |
| 1009 | } |
| 1010 | } |
| 1011 | loop { |
| 1012 | self.bump_space(); |
| 1013 | if self.is_done() { |
| 1014 | return Err(Error::new(ERR_CLASS_UNCLOSED)); |
| 1015 | } |
| 1016 | match self.char() { |
| 1017 | '[' => { |
| 1018 | // Attempt to treat this as the beginning of a POSIX class. |
| 1019 | // If POSIX class parsing fails, then the parser backs up |
| 1020 | // to `[`. |
| 1021 | if let Some(class) = self.maybe_parse_posix_class() { |
| 1022 | union.extend_from_slice(&class.ranges); |
| 1023 | continue; |
| 1024 | } |
| 1025 | // ... otherwise we don't support nested classes. |
| 1026 | return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED)); |
| 1027 | } |
| 1028 | ']' => { |
| 1029 | self.bump(); |
| 1030 | let mut class = hir::Class::new(union); |
| 1031 | // Note that we must apply case folding before negation! |
| 1032 | // Consider `(?i)[^x]`. If we applied negation first, then |
| 1033 | // the result would be the character class that matched any |
| 1034 | // Unicode scalar value. |
| 1035 | if self.flags().case_insensitive { |
| 1036 | class.ascii_case_fold(); |
| 1037 | } |
| 1038 | if negate { |
| 1039 | class.negate(); |
| 1040 | } |
| 1041 | return Ok(Hir::class(class)); |
| 1042 | } |
| 1043 | '&' if self.peek() == Some('&' ) => { |
| 1044 | return Err(Error::new( |
| 1045 | ERR_CLASS_INTERSECTION_UNSUPPORTED, |
| 1046 | )); |
| 1047 | } |
| 1048 | '-' if self.peek() == Some('-' ) => { |
| 1049 | return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED)); |
| 1050 | } |
| 1051 | '~' if self.peek() == Some('~' ) => { |
| 1052 | return Err(Error::new( |
| 1053 | ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, |
| 1054 | )); |
| 1055 | } |
| 1056 | _ => self.parse_class_range(&mut union)?, |
| 1057 | } |
| 1058 | } |
| 1059 | } |
| 1060 | |
| 1061 | /// Parse a single primitive item in a character class set. The item to |
| 1062 | /// be parsed can either be one of a simple literal character, a range |
| 1063 | /// between two simple literal characters or a "primitive" character |
| 1064 | /// class like `\w`. |
| 1065 | /// |
| 1066 | /// If an invalid escape is found, or if a character class is found where |
| 1067 | /// a simple literal is expected (e.g., in a range), then an error is |
| 1068 | /// returned. |
| 1069 | /// |
| 1070 | /// Otherwise, the range (or ranges) are appended to the given union of |
| 1071 | /// ranges. |
| 1072 | fn parse_class_range( |
| 1073 | &self, |
| 1074 | union: &mut Vec<hir::ClassRange>, |
| 1075 | ) -> Result<(), Error> { |
| 1076 | let prim1 = self.parse_class_item()?; |
| 1077 | self.bump_space(); |
| 1078 | if self.is_done() { |
| 1079 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM)); |
| 1080 | } |
| 1081 | // If the next char isn't a `-`, then we don't have a range. |
| 1082 | // There are two exceptions. If the char after a `-` is a `]`, then |
| 1083 | // `-` is interpreted as a literal `-`. Alternatively, if the char |
| 1084 | // after a `-` is a `-`, then `--` corresponds to a "difference" |
| 1085 | // operation. (Which we don't support in regex-lite, but error about |
| 1086 | // specifically in an effort to be loud about differences between the |
| 1087 | // main regex crate where possible.) |
| 1088 | if self.char() != '-' |
| 1089 | || self.peek_space() == Some(']' ) |
| 1090 | || self.peek_space() == Some('-' ) |
| 1091 | { |
| 1092 | union.extend_from_slice(&into_class_item_ranges(prim1)?); |
| 1093 | return Ok(()); |
| 1094 | } |
| 1095 | // OK, now we're parsing a range, so bump past the `-` and parse the |
| 1096 | // second half of the range. |
| 1097 | if !self.bump_and_bump_space() { |
| 1098 | return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH)); |
| 1099 | } |
| 1100 | let prim2 = self.parse_class_item()?; |
| 1101 | let range = hir::ClassRange { |
| 1102 | start: into_class_item_range(prim1)?, |
| 1103 | end: into_class_item_range(prim2)?, |
| 1104 | }; |
| 1105 | if range.start > range.end { |
| 1106 | return Err(Error::new(ERR_CLASS_INVALID_RANGE)); |
| 1107 | } |
| 1108 | union.push(range); |
| 1109 | Ok(()) |
| 1110 | } |
| 1111 | |
| 1112 | /// Parse a single item in a character class as a primitive, where the |
| 1113 | /// primitive either consists of a verbatim literal or a single escape |
| 1114 | /// sequence. |
| 1115 | /// |
| 1116 | /// This assumes the parser is positioned at the beginning of a primitive, |
| 1117 | /// and advances the parser to the first position after the primitive if |
| 1118 | /// successful. |
| 1119 | /// |
| 1120 | /// Note that it is the caller's responsibility to report an error if an |
| 1121 | /// illegal primitive was parsed. |
| 1122 | fn parse_class_item(&self) -> Result<Hir, Error> { |
| 1123 | let ch = self.char(); |
| 1124 | self.bump(); |
| 1125 | if ch == ' \\' { |
| 1126 | self.parse_escape() |
| 1127 | } else { |
| 1128 | Ok(Hir::char(ch)) |
| 1129 | } |
| 1130 | } |
| 1131 | |
| 1132 | /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`. |
| 1133 | /// |
| 1134 | /// This assumes the parser is positioned at the opening `[`. |
| 1135 | /// |
| 1136 | /// If no valid POSIX character class could be found, then this does not |
| 1137 | /// advance the parser and `None` is returned. Otherwise, the parser is |
| 1138 | /// advanced to the first byte following the closing `]` and the |
| 1139 | /// corresponding POSIX class is returned. |
| 1140 | fn maybe_parse_posix_class(&self) -> Option<hir::Class> { |
| 1141 | // POSIX character classes are interesting from a parsing perspective |
| 1142 | // because parsing cannot fail with any interesting error. For example, |
| 1143 | // in order to use an POSIX character class, it must be enclosed in |
| 1144 | // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think |
| 1145 | // of it as "POSIX character classes have the syntax `[:NAME:]` which |
| 1146 | // can only appear within character brackets." This means that things |
| 1147 | // like `[[:lower:]A]` are legal constructs. |
| 1148 | // |
| 1149 | // However, if one types an incorrect POSIX character class, e.g., |
| 1150 | // `[[:loower:]]`, then we treat that as if it were normal nested |
| 1151 | // character class containing the characters `:elorw`. (Which isn't |
| 1152 | // supported and results in an error in regex-lite.) One might argue |
| 1153 | // that we should return an error instead since the repeated colons |
| 1154 | // give away the intent to write an POSIX class. But what if the user |
| 1155 | // typed `[[:lower]]` instead? How can we tell that was intended to be |
| 1156 | // a POSXI class and not just a normal nested class? |
| 1157 | // |
| 1158 | // Reasonable people can probably disagree over this, but for better |
| 1159 | // or worse, we implement semantics that never fails at the expense of |
| 1160 | // better failure modes. |
| 1161 | assert_eq!(self.char(), '[' ); |
| 1162 | |
| 1163 | // If parsing fails, then we back up the parser to this starting point. |
| 1164 | let start_pos = self.pos(); |
| 1165 | let start_char = self.char.get(); |
| 1166 | let reset = || { |
| 1167 | self.pos.set(start_pos); |
| 1168 | self.char.set(start_char); |
| 1169 | }; |
| 1170 | |
| 1171 | let mut negated = false; |
| 1172 | if !self.bump() || self.char() != ':' { |
| 1173 | reset(); |
| 1174 | return None; |
| 1175 | } |
| 1176 | if !self.bump() { |
| 1177 | reset(); |
| 1178 | return None; |
| 1179 | } |
| 1180 | if self.char() == '^' { |
| 1181 | negated = true; |
| 1182 | if !self.bump() { |
| 1183 | reset(); |
| 1184 | return None; |
| 1185 | } |
| 1186 | } |
| 1187 | let name_start = self.pos(); |
| 1188 | while self.char() != ':' && self.bump() {} |
| 1189 | if self.is_done() { |
| 1190 | reset(); |
| 1191 | return None; |
| 1192 | } |
| 1193 | let name = &self.pattern()[name_start..self.pos()]; |
| 1194 | if !self.bump_if(":]" ) { |
| 1195 | reset(); |
| 1196 | return None; |
| 1197 | } |
| 1198 | if let Ok(ranges) = posix_class(name) { |
| 1199 | let mut class = hir::Class::new(ranges); |
| 1200 | if negated { |
| 1201 | class.negate(); |
| 1202 | } |
| 1203 | return Some(class); |
| 1204 | } |
| 1205 | reset(); |
| 1206 | None |
| 1207 | } |
| 1208 | |
| 1209 | /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the |
| 1210 | /// parser is currently at a valid character class name and will be |
| 1211 | /// advanced to the character immediately following the class. |
| 1212 | fn parse_perl_class(&self) -> Hir { |
| 1213 | let ch = self.char(); |
| 1214 | self.bump(); |
| 1215 | let mut class = hir::Class::new(match ch { |
| 1216 | 'd' | 'D' => posix_class("digit" ).unwrap(), |
| 1217 | 's' | 'S' => posix_class("space" ).unwrap(), |
| 1218 | 'w' | 'W' => posix_class("word" ).unwrap(), |
| 1219 | unk => unreachable!("invalid Perl class \\{}" , unk), |
| 1220 | }); |
| 1221 | if ch.is_ascii_uppercase() { |
| 1222 | class.negate(); |
| 1223 | } |
| 1224 | Hir::class(class) |
| 1225 | } |
| 1226 | |
| 1227 | fn hir_dot(&self) -> Hir { |
| 1228 | if self.flags().dot_matches_new_line { |
| 1229 | Hir::class(hir::Class::new([hir::ClassRange { |
| 1230 | start: ' \x00' , |
| 1231 | end: ' \u{10FFFF}' , |
| 1232 | }])) |
| 1233 | } else if self.flags().crlf { |
| 1234 | Hir::class(hir::Class::new([ |
| 1235 | hir::ClassRange { start: ' \x00' , end: ' \x09' }, |
| 1236 | hir::ClassRange { start: ' \x0B' , end: ' \x0C' }, |
| 1237 | hir::ClassRange { start: ' \x0E' , end: ' \u{10FFFF}' }, |
| 1238 | ])) |
| 1239 | } else { |
| 1240 | Hir::class(hir::Class::new([ |
| 1241 | hir::ClassRange { start: ' \x00' , end: ' \x09' }, |
| 1242 | hir::ClassRange { start: ' \x0B' , end: ' \u{10FFFF}' }, |
| 1243 | ])) |
| 1244 | } |
| 1245 | } |
| 1246 | |
| 1247 | fn hir_anchor_start(&self) -> Hir { |
| 1248 | let look = if self.flags().multi_line { |
| 1249 | if self.flags().crlf { |
| 1250 | hir::Look::StartCRLF |
| 1251 | } else { |
| 1252 | hir::Look::StartLF |
| 1253 | } |
| 1254 | } else { |
| 1255 | hir::Look::Start |
| 1256 | }; |
| 1257 | Hir::look(look) |
| 1258 | } |
| 1259 | |
| 1260 | fn hir_anchor_end(&self) -> Hir { |
| 1261 | let look = if self.flags().multi_line { |
| 1262 | if self.flags().crlf { |
| 1263 | hir::Look::EndCRLF |
| 1264 | } else { |
| 1265 | hir::Look::EndLF |
| 1266 | } |
| 1267 | } else { |
| 1268 | hir::Look::End |
| 1269 | }; |
| 1270 | Hir::look(look) |
| 1271 | } |
| 1272 | |
| 1273 | fn hir_char(&self, ch: char) -> Hir { |
| 1274 | if self.flags().case_insensitive { |
| 1275 | let this = hir::ClassRange { start: ch, end: ch }; |
| 1276 | if let Some(folded) = this.ascii_case_fold() { |
| 1277 | return Hir::class(hir::Class::new([this, folded])); |
| 1278 | } |
| 1279 | } |
| 1280 | Hir::char(ch) |
| 1281 | } |
| 1282 | } |
| 1283 | |
| 1284 | /// This checks the depth of the given `Hir` value, and if it exceeds the given |
| 1285 | /// limit, then an error is returned. |
| 1286 | fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { |
| 1287 | fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { |
| 1288 | if depth > limit { |
| 1289 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
| 1290 | } |
| 1291 | let Some(next_depth) = depth.checked_add(1) else { |
| 1292 | return Err(Error::new(ERR_TOO_MUCH_NESTING)); |
| 1293 | }; |
| 1294 | match *hir.kind() { |
| 1295 | HirKind::Empty |
| 1296 | | HirKind::Char(_) |
| 1297 | | HirKind::Class(_) |
| 1298 | | HirKind::Look(_) => Ok(()), |
| 1299 | HirKind::Repetition(hir::Repetition { ref sub, .. }) => { |
| 1300 | recurse(sub, limit, next_depth) |
| 1301 | } |
| 1302 | HirKind::Capture(hir::Capture { ref sub, .. }) => { |
| 1303 | recurse(sub, limit, next_depth) |
| 1304 | } |
| 1305 | HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { |
| 1306 | for sub in subs.iter() { |
| 1307 | recurse(sub, limit, next_depth)?; |
| 1308 | } |
| 1309 | Ok(()) |
| 1310 | } |
| 1311 | } |
| 1312 | } |
| 1313 | recurse(hir, limit, 0) |
| 1314 | } |
| 1315 | |
| 1316 | /// Converts the given Hir to a literal char if the Hir is just a single |
| 1317 | /// character. Otherwise this returns an error. |
| 1318 | /// |
| 1319 | /// This is useful in contexts where you can only accept a single character, |
| 1320 | /// but where it is convenient to parse something more general. For example, |
| 1321 | /// parsing a single part of a character class range. It's useful to reuse |
| 1322 | /// the literal parsing code, but that code can itself return entire classes |
| 1323 | /// which can't be used as the start/end of a class range. |
| 1324 | fn into_class_item_range(hir: Hir) -> Result<char, Error> { |
| 1325 | match hir.kind { |
| 1326 | HirKind::Char(ch: char) => Ok(ch), |
| 1327 | _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)), |
| 1328 | } |
| 1329 | } |
| 1330 | |
| 1331 | fn into_class_item_ranges( |
| 1332 | mut hir: Hir, |
| 1333 | ) -> Result<Vec<hir::ClassRange>, Error> { |
| 1334 | match core::mem::replace(&mut hir.kind, src:HirKind::Empty) { |
| 1335 | HirKind::Char(ch: char) => Ok(vec![hir::ClassRange { start: ch, end: ch }]), |
| 1336 | HirKind::Class(hir::Class { ranges: Vec }) => Ok(ranges), |
| 1337 | _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)), |
| 1338 | } |
| 1339 | } |
| 1340 | |
| 1341 | /// Returns an iterator of character class ranges for the given named POSIX |
| 1342 | /// character class. If no such character class exists for the name given, then |
| 1343 | /// an error is returned. |
| 1344 | fn posix_class( |
| 1345 | kind: &str, |
| 1346 | ) -> Result<impl Iterator<Item = hir::ClassRange>, Error> { |
| 1347 | let slice: &'static [(u8, u8)] = match kind { |
| 1348 | "alnum" => &[(b'0' , b'9' ), (b'A' , b'Z' ), (b'a' , b'z' )], |
| 1349 | "alpha" => &[(b'A' , b'Z' ), (b'a' , b'z' )], |
| 1350 | "ascii" => &[(b' \x00' , b' \x7F' )], |
| 1351 | "blank" => &[(b' \t' , b' \t' ), (b' ' , b' ' )], |
| 1352 | "cntrl" => &[(b' \x00' , b' \x1F' ), (b' \x7F' , b' \x7F' )], |
| 1353 | "digit" => &[(b'0' , b'9' )], |
| 1354 | "graph" => &[(b'!' , b'~' )], |
| 1355 | "lower" => &[(b'a' , b'z' )], |
| 1356 | "print" => &[(b' ' , b'~' )], |
| 1357 | "punct" => &[(b'!' , b'/' ), (b':' , b'@' ), (b'[' , b'`' ), (b'{' , b'~' )], |
| 1358 | "space" => &[ |
| 1359 | (b' \t' , b' \t' ), |
| 1360 | (b' \n' , b' \n' ), |
| 1361 | (b' \x0B' , b' \x0B' ), |
| 1362 | (b' \x0C' , b' \x0C' ), |
| 1363 | (b' \r' , b' \r' ), |
| 1364 | (b' ' , b' ' ), |
| 1365 | ], |
| 1366 | "upper" => &[(b'A' , b'Z' )], |
| 1367 | "word" => &[(b'0' , b'9' ), (b'A' , b'Z' ), (b'_' , b'_' ), (b'a' , b'z' )], |
| 1368 | "xdigit" => &[(b'0' , b'9' ), (b'A' , b'F' ), (b'a' , b'f' )], |
| 1369 | _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)), |
| 1370 | }; |
| 1371 | Ok(slice.iter().map(|&(start, end)| hir::ClassRange { |
| 1372 | start: char::from(start), |
| 1373 | end: char::from(end), |
| 1374 | })) |
| 1375 | } |
| 1376 | |
| 1377 | /// Returns true if the given character is a hexadecimal digit. |
| 1378 | fn is_hex(c: char) -> bool { |
| 1379 | ('0' <= c && c <= '9' ) || ('a' <= c && c <= 'f' ) || ('A' <= c && c <= 'F' ) |
| 1380 | } |
| 1381 | |
| 1382 | /// Returns true if the given character is a valid in a capture group name. |
| 1383 | /// |
| 1384 | /// If `first` is true, then `c` is treated as the first character in the |
| 1385 | /// group name (which must be alphabetic or underscore). |
| 1386 | fn is_capture_char(c: char, first: bool) -> bool { |
| 1387 | if first { |
| 1388 | c == '_' || c.is_alphabetic() |
| 1389 | } else { |
| 1390 | c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() |
| 1391 | } |
| 1392 | } |
| 1393 | |
| 1394 | #[cfg (test)] |
| 1395 | mod tests { |
| 1396 | use super::*; |
| 1397 | |
| 1398 | fn p(pattern: &str) -> Hir { |
| 1399 | Parser::new(Config::default(), pattern).parse_inner().unwrap() |
| 1400 | } |
| 1401 | |
| 1402 | fn perr(pattern: &str) -> String { |
| 1403 | Parser::new(Config::default(), pattern) |
| 1404 | .parse_inner() |
| 1405 | .unwrap_err() |
| 1406 | .to_string() |
| 1407 | } |
| 1408 | |
| 1409 | fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir { |
| 1410 | Hir::class(hir::Class::new( |
| 1411 | it.into_iter().map(|(start, end)| hir::ClassRange { start, end }), |
| 1412 | )) |
| 1413 | } |
| 1414 | |
| 1415 | fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir { |
| 1416 | Hir::class(hir::Class::new( |
| 1417 | it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }), |
| 1418 | )) |
| 1419 | } |
| 1420 | |
| 1421 | fn posix(name: &str) -> Hir { |
| 1422 | Hir::class(hir::Class::new(posix_class(name).unwrap())) |
| 1423 | } |
| 1424 | |
| 1425 | fn cap(index: u32, sub: Hir) -> Hir { |
| 1426 | Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) }) |
| 1427 | } |
| 1428 | |
| 1429 | fn named_cap(index: u32, name: &str, sub: Hir) -> Hir { |
| 1430 | Hir::capture(hir::Capture { |
| 1431 | index, |
| 1432 | name: Some(Box::from(name)), |
| 1433 | sub: Box::new(sub), |
| 1434 | }) |
| 1435 | } |
| 1436 | |
| 1437 | #[test ] |
| 1438 | fn ok_literal() { |
| 1439 | assert_eq!(p("a" ), Hir::char('a' )); |
| 1440 | assert_eq!(p("ab" ), Hir::concat(vec![Hir::char('a' ), Hir::char('b' )])); |
| 1441 | assert_eq!(p("💩" ), Hir::char('💩' )); |
| 1442 | } |
| 1443 | |
| 1444 | #[test ] |
| 1445 | fn ok_meta_escapes() { |
| 1446 | assert_eq!(p(r"\*" ), Hir::char('*' )); |
| 1447 | assert_eq!(p(r"\+" ), Hir::char('+' )); |
| 1448 | assert_eq!(p(r"\?" ), Hir::char('?' )); |
| 1449 | assert_eq!(p(r"\|" ), Hir::char('|' )); |
| 1450 | assert_eq!(p(r"\(" ), Hir::char('(' )); |
| 1451 | assert_eq!(p(r"\)" ), Hir::char(')' )); |
| 1452 | assert_eq!(p(r"\^" ), Hir::char('^' )); |
| 1453 | assert_eq!(p(r"\$" ), Hir::char('$' )); |
| 1454 | assert_eq!(p(r"\[" ), Hir::char('[' )); |
| 1455 | assert_eq!(p(r"\]" ), Hir::char(']' )); |
| 1456 | } |
| 1457 | |
| 1458 | #[test ] |
| 1459 | fn ok_special_escapes() { |
| 1460 | assert_eq!(p(r"\a" ), Hir::char(' \x07' )); |
| 1461 | assert_eq!(p(r"\f" ), Hir::char(' \x0C' )); |
| 1462 | assert_eq!(p(r"\t" ), Hir::char(' \t' )); |
| 1463 | assert_eq!(p(r"\n" ), Hir::char(' \n' )); |
| 1464 | assert_eq!(p(r"\r" ), Hir::char(' \r' )); |
| 1465 | assert_eq!(p(r"\v" ), Hir::char(' \x0B' )); |
| 1466 | assert_eq!(p(r"\A" ), Hir::look(hir::Look::Start)); |
| 1467 | assert_eq!(p(r"\z" ), Hir::look(hir::Look::End)); |
| 1468 | assert_eq!(p(r"\b" ), Hir::look(hir::Look::Word)); |
| 1469 | assert_eq!(p(r"\B" ), Hir::look(hir::Look::WordNegate)); |
| 1470 | } |
| 1471 | |
| 1472 | #[test ] |
| 1473 | fn ok_hex() { |
| 1474 | // fixed length |
| 1475 | assert_eq!(p(r"\x41" ), Hir::char('A' )); |
| 1476 | assert_eq!(p(r"\u2603" ), Hir::char('☃' )); |
| 1477 | assert_eq!(p(r"\U0001F4A9" ), Hir::char('💩' )); |
| 1478 | // braces |
| 1479 | assert_eq!(p(r"\x{1F4A9}" ), Hir::char('💩' )); |
| 1480 | assert_eq!(p(r"\u{1F4A9}" ), Hir::char('💩' )); |
| 1481 | assert_eq!(p(r"\U{1F4A9}" ), Hir::char('💩' )); |
| 1482 | } |
| 1483 | |
| 1484 | #[test ] |
| 1485 | fn ok_perl() { |
| 1486 | assert_eq!(p(r"\d" ), posix("digit" )); |
| 1487 | assert_eq!(p(r"\s" ), posix("space" )); |
| 1488 | assert_eq!(p(r"\w" ), posix("word" )); |
| 1489 | |
| 1490 | let negated = |name| { |
| 1491 | let mut class = hir::Class::new(posix_class(name).unwrap()); |
| 1492 | class.negate(); |
| 1493 | Hir::class(class) |
| 1494 | }; |
| 1495 | assert_eq!(p(r"\D" ), negated("digit" )); |
| 1496 | assert_eq!(p(r"\S" ), negated("space" )); |
| 1497 | assert_eq!(p(r"\W" ), negated("word" )); |
| 1498 | } |
| 1499 | |
| 1500 | #[test ] |
| 1501 | fn ok_flags_and_primitives() { |
| 1502 | assert_eq!(p(r"a" ), Hir::char('a' )); |
| 1503 | assert_eq!(p(r"(?i:a)" ), singles(['A' , 'a' ])); |
| 1504 | |
| 1505 | assert_eq!(p(r"^" ), Hir::look(hir::Look::Start)); |
| 1506 | assert_eq!(p(r"(?m:^)" ), Hir::look(hir::Look::StartLF)); |
| 1507 | assert_eq!(p(r"(?mR:^)" ), Hir::look(hir::Look::StartCRLF)); |
| 1508 | |
| 1509 | assert_eq!(p(r"$" ), Hir::look(hir::Look::End)); |
| 1510 | assert_eq!(p(r"(?m:$)" ), Hir::look(hir::Look::EndLF)); |
| 1511 | assert_eq!(p(r"(?mR:$)" ), Hir::look(hir::Look::EndCRLF)); |
| 1512 | |
| 1513 | assert_eq!(p(r"." ), class([(' \x00' , ' \x09' ), (' \x0B' , ' \u{10FFFF}' )])); |
| 1514 | assert_eq!( |
| 1515 | p(r"(?R:.)" ), |
| 1516 | class([ |
| 1517 | (' \x00' , ' \x09' ), |
| 1518 | (' \x0B' , ' \x0C' ), |
| 1519 | (' \x0E' , ' \u{10FFFF}' ), |
| 1520 | ]) |
| 1521 | ); |
| 1522 | assert_eq!(p(r"(?s:.)" ), class([(' \x00' , ' \u{10FFFF}' )])); |
| 1523 | assert_eq!(p(r"(?sR:.)" ), class([(' \x00' , ' \u{10FFFF}' )])); |
| 1524 | } |
| 1525 | |
| 1526 | #[test ] |
| 1527 | fn ok_alternate() { |
| 1528 | assert_eq!( |
| 1529 | p(r"a|b" ), |
| 1530 | Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )]) |
| 1531 | ); |
| 1532 | assert_eq!( |
| 1533 | p(r"(?:a|b)" ), |
| 1534 | Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )]) |
| 1535 | ); |
| 1536 | |
| 1537 | assert_eq!( |
| 1538 | p(r"(a|b)" ), |
| 1539 | cap(1, Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )])) |
| 1540 | ); |
| 1541 | assert_eq!( |
| 1542 | p(r"(?<foo>a|b)" ), |
| 1543 | named_cap( |
| 1544 | 1, |
| 1545 | "foo" , |
| 1546 | Hir::alternation(vec![Hir::char('a' ), Hir::char('b' )]) |
| 1547 | ) |
| 1548 | ); |
| 1549 | |
| 1550 | assert_eq!( |
| 1551 | p(r"a|b|c" ), |
| 1552 | Hir::alternation(vec![ |
| 1553 | Hir::char('a' ), |
| 1554 | Hir::char('b' ), |
| 1555 | Hir::char('c' ) |
| 1556 | ]) |
| 1557 | ); |
| 1558 | |
| 1559 | assert_eq!( |
| 1560 | p(r"ax|by|cz" ), |
| 1561 | Hir::alternation(vec![ |
| 1562 | Hir::concat(vec![Hir::char('a' ), Hir::char('x' )]), |
| 1563 | Hir::concat(vec![Hir::char('b' ), Hir::char('y' )]), |
| 1564 | Hir::concat(vec![Hir::char('c' ), Hir::char('z' )]), |
| 1565 | ]) |
| 1566 | ); |
| 1567 | assert_eq!( |
| 1568 | p(r"(ax|(by|(cz)))" ), |
| 1569 | cap( |
| 1570 | 1, |
| 1571 | Hir::alternation(vec![ |
| 1572 | Hir::concat(vec![Hir::char('a' ), Hir::char('x' )]), |
| 1573 | cap( |
| 1574 | 2, |
| 1575 | Hir::alternation(vec![ |
| 1576 | Hir::concat(vec![Hir::char('b' ), Hir::char('y' )]), |
| 1577 | cap( |
| 1578 | 3, |
| 1579 | Hir::concat(vec![ |
| 1580 | Hir::char('c' ), |
| 1581 | Hir::char('z' ) |
| 1582 | ]) |
| 1583 | ), |
| 1584 | ]) |
| 1585 | ), |
| 1586 | ]) |
| 1587 | ) |
| 1588 | ); |
| 1589 | |
| 1590 | assert_eq!( |
| 1591 | p(r"|" ), |
| 1592 | Hir::alternation(vec![Hir::empty(), Hir::empty()]) |
| 1593 | ); |
| 1594 | assert_eq!( |
| 1595 | p(r"||" ), |
| 1596 | Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()]) |
| 1597 | ); |
| 1598 | |
| 1599 | assert_eq!( |
| 1600 | p(r"a|" ), |
| 1601 | Hir::alternation(vec![Hir::char('a' ), Hir::empty()]) |
| 1602 | ); |
| 1603 | assert_eq!( |
| 1604 | p(r"|a" ), |
| 1605 | Hir::alternation(vec![Hir::empty(), Hir::char('a' )]) |
| 1606 | ); |
| 1607 | |
| 1608 | assert_eq!( |
| 1609 | p(r"(|)" ), |
| 1610 | cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()])) |
| 1611 | ); |
| 1612 | assert_eq!( |
| 1613 | p(r"(a|)" ), |
| 1614 | cap(1, Hir::alternation(vec![Hir::char('a' ), Hir::empty()])) |
| 1615 | ); |
| 1616 | assert_eq!( |
| 1617 | p(r"(|a)" ), |
| 1618 | cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a' )])) |
| 1619 | ); |
| 1620 | } |
| 1621 | |
| 1622 | #[test ] |
| 1623 | fn ok_flag_group() { |
| 1624 | assert_eq!( |
| 1625 | p("a(?i:b)" ), |
| 1626 | Hir::concat(vec![Hir::char('a' ), singles(['B' , 'b' ])]) |
| 1627 | ); |
| 1628 | } |
| 1629 | |
| 1630 | #[test ] |
| 1631 | fn ok_flag_directive() { |
| 1632 | assert_eq!(p("(?i)a" ), singles(['A' , 'a' ])); |
| 1633 | assert_eq!(p("a(?i)" ), Hir::char('a' )); |
| 1634 | assert_eq!( |
| 1635 | p("a(?i)b" ), |
| 1636 | Hir::concat(vec![Hir::char('a' ), singles(['B' , 'b' ])]) |
| 1637 | ); |
| 1638 | assert_eq!( |
| 1639 | p("a(?i)a(?-i)a" ), |
| 1640 | Hir::concat(vec![ |
| 1641 | Hir::char('a' ), |
| 1642 | singles(['A' , 'a' ]), |
| 1643 | Hir::char('a' ), |
| 1644 | ]) |
| 1645 | ); |
| 1646 | assert_eq!( |
| 1647 | p("a(?:(?i)a)a" ), |
| 1648 | Hir::concat(vec![ |
| 1649 | Hir::char('a' ), |
| 1650 | singles(['A' , 'a' ]), |
| 1651 | Hir::char('a' ), |
| 1652 | ]) |
| 1653 | ); |
| 1654 | assert_eq!( |
| 1655 | p("a((?i)a)a" ), |
| 1656 | Hir::concat(vec![ |
| 1657 | Hir::char('a' ), |
| 1658 | cap(1, singles(['A' , 'a' ])), |
| 1659 | Hir::char('a' ), |
| 1660 | ]) |
| 1661 | ); |
| 1662 | } |
| 1663 | |
| 1664 | #[test ] |
| 1665 | fn ok_uncounted_repetition() { |
| 1666 | assert_eq!( |
| 1667 | p(r"a?" ), |
| 1668 | Hir::repetition(hir::Repetition { |
| 1669 | min: 0, |
| 1670 | max: Some(1), |
| 1671 | greedy: true, |
| 1672 | sub: Box::new(Hir::char('a' )), |
| 1673 | }), |
| 1674 | ); |
| 1675 | assert_eq!( |
| 1676 | p(r"a*" ), |
| 1677 | Hir::repetition(hir::Repetition { |
| 1678 | min: 0, |
| 1679 | max: None, |
| 1680 | greedy: true, |
| 1681 | sub: Box::new(Hir::char('a' )), |
| 1682 | }), |
| 1683 | ); |
| 1684 | assert_eq!( |
| 1685 | p(r"a+" ), |
| 1686 | Hir::repetition(hir::Repetition { |
| 1687 | min: 1, |
| 1688 | max: None, |
| 1689 | greedy: true, |
| 1690 | sub: Box::new(Hir::char('a' )), |
| 1691 | }), |
| 1692 | ); |
| 1693 | |
| 1694 | assert_eq!( |
| 1695 | p(r"a??" ), |
| 1696 | Hir::repetition(hir::Repetition { |
| 1697 | min: 0, |
| 1698 | max: Some(1), |
| 1699 | greedy: false, |
| 1700 | sub: Box::new(Hir::char('a' )), |
| 1701 | }), |
| 1702 | ); |
| 1703 | assert_eq!( |
| 1704 | p(r"a*?" ), |
| 1705 | Hir::repetition(hir::Repetition { |
| 1706 | min: 0, |
| 1707 | max: None, |
| 1708 | greedy: false, |
| 1709 | sub: Box::new(Hir::char('a' )), |
| 1710 | }), |
| 1711 | ); |
| 1712 | assert_eq!( |
| 1713 | p(r"a+?" ), |
| 1714 | Hir::repetition(hir::Repetition { |
| 1715 | min: 1, |
| 1716 | max: None, |
| 1717 | greedy: false, |
| 1718 | sub: Box::new(Hir::char('a' )), |
| 1719 | }), |
| 1720 | ); |
| 1721 | |
| 1722 | assert_eq!( |
| 1723 | p(r"a?b" ), |
| 1724 | Hir::concat(vec![ |
| 1725 | Hir::repetition(hir::Repetition { |
| 1726 | min: 0, |
| 1727 | max: Some(1), |
| 1728 | greedy: true, |
| 1729 | sub: Box::new(Hir::char('a' )), |
| 1730 | }), |
| 1731 | Hir::char('b' ), |
| 1732 | ]), |
| 1733 | ); |
| 1734 | |
| 1735 | assert_eq!( |
| 1736 | p(r"ab?" ), |
| 1737 | Hir::concat(vec![ |
| 1738 | Hir::char('a' ), |
| 1739 | Hir::repetition(hir::Repetition { |
| 1740 | min: 0, |
| 1741 | max: Some(1), |
| 1742 | greedy: true, |
| 1743 | sub: Box::new(Hir::char('b' )), |
| 1744 | }), |
| 1745 | ]), |
| 1746 | ); |
| 1747 | |
| 1748 | assert_eq!( |
| 1749 | p(r"(?:ab)?" ), |
| 1750 | Hir::repetition(hir::Repetition { |
| 1751 | min: 0, |
| 1752 | max: Some(1), |
| 1753 | greedy: true, |
| 1754 | sub: Box::new(Hir::concat(vec![ |
| 1755 | Hir::char('a' ), |
| 1756 | Hir::char('b' ) |
| 1757 | ])), |
| 1758 | }), |
| 1759 | ); |
| 1760 | |
| 1761 | assert_eq!( |
| 1762 | p(r"(ab)?" ), |
| 1763 | Hir::repetition(hir::Repetition { |
| 1764 | min: 0, |
| 1765 | max: Some(1), |
| 1766 | greedy: true, |
| 1767 | sub: Box::new(cap( |
| 1768 | 1, |
| 1769 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
| 1770 | )), |
| 1771 | }), |
| 1772 | ); |
| 1773 | |
| 1774 | assert_eq!( |
| 1775 | p(r"|a?" ), |
| 1776 | Hir::alternation(vec![ |
| 1777 | Hir::empty(), |
| 1778 | Hir::repetition(hir::Repetition { |
| 1779 | min: 0, |
| 1780 | max: Some(1), |
| 1781 | greedy: true, |
| 1782 | sub: Box::new(Hir::char('a' )), |
| 1783 | }) |
| 1784 | ]), |
| 1785 | ); |
| 1786 | } |
| 1787 | |
| 1788 | #[test ] |
| 1789 | fn ok_counted_repetition() { |
| 1790 | assert_eq!( |
| 1791 | p(r"a{5}" ), |
| 1792 | Hir::repetition(hir::Repetition { |
| 1793 | min: 5, |
| 1794 | max: Some(5), |
| 1795 | greedy: true, |
| 1796 | sub: Box::new(Hir::char('a' )), |
| 1797 | }), |
| 1798 | ); |
| 1799 | assert_eq!( |
| 1800 | p(r"a{5}?" ), |
| 1801 | Hir::repetition(hir::Repetition { |
| 1802 | min: 5, |
| 1803 | max: Some(5), |
| 1804 | greedy: false, |
| 1805 | sub: Box::new(Hir::char('a' )), |
| 1806 | }), |
| 1807 | ); |
| 1808 | |
| 1809 | assert_eq!( |
| 1810 | p(r"a{5,}" ), |
| 1811 | Hir::repetition(hir::Repetition { |
| 1812 | min: 5, |
| 1813 | max: None, |
| 1814 | greedy: true, |
| 1815 | sub: Box::new(Hir::char('a' )), |
| 1816 | }), |
| 1817 | ); |
| 1818 | |
| 1819 | assert_eq!( |
| 1820 | p(r"a{5,9}" ), |
| 1821 | Hir::repetition(hir::Repetition { |
| 1822 | min: 5, |
| 1823 | max: Some(9), |
| 1824 | greedy: true, |
| 1825 | sub: Box::new(Hir::char('a' )), |
| 1826 | }), |
| 1827 | ); |
| 1828 | |
| 1829 | assert_eq!( |
| 1830 | p(r"ab{5}c" ), |
| 1831 | Hir::concat(vec![ |
| 1832 | Hir::char('a' ), |
| 1833 | Hir::repetition(hir::Repetition { |
| 1834 | min: 5, |
| 1835 | max: Some(5), |
| 1836 | greedy: true, |
| 1837 | sub: Box::new(Hir::char('b' )), |
| 1838 | }), |
| 1839 | Hir::char('c' ), |
| 1840 | ]), |
| 1841 | ); |
| 1842 | |
| 1843 | assert_eq!( |
| 1844 | p(r"a{ 5 }" ), |
| 1845 | Hir::repetition(hir::Repetition { |
| 1846 | min: 5, |
| 1847 | max: Some(5), |
| 1848 | greedy: true, |
| 1849 | sub: Box::new(Hir::char('a' )), |
| 1850 | }), |
| 1851 | ); |
| 1852 | assert_eq!( |
| 1853 | p(r"a{ 5 , 9 }" ), |
| 1854 | Hir::repetition(hir::Repetition { |
| 1855 | min: 5, |
| 1856 | max: Some(9), |
| 1857 | greedy: true, |
| 1858 | sub: Box::new(Hir::char('a' )), |
| 1859 | }), |
| 1860 | ); |
| 1861 | } |
| 1862 | |
| 1863 | #[test ] |
| 1864 | fn ok_group_unnamed() { |
| 1865 | assert_eq!(p("(a)" ), cap(1, Hir::char('a' ))); |
| 1866 | assert_eq!( |
| 1867 | p("(ab)" ), |
| 1868 | cap(1, Hir::concat(vec![Hir::char('a' ), Hir::char('b' )])) |
| 1869 | ); |
| 1870 | } |
| 1871 | |
| 1872 | #[test ] |
| 1873 | fn ok_group_named() { |
| 1874 | assert_eq!(p("(?P<foo>a)" ), named_cap(1, "foo" , Hir::char('a' ))); |
| 1875 | assert_eq!(p("(?<foo>a)" ), named_cap(1, "foo" , Hir::char('a' ))); |
| 1876 | |
| 1877 | assert_eq!( |
| 1878 | p("(?P<foo>ab)" ), |
| 1879 | named_cap( |
| 1880 | 1, |
| 1881 | "foo" , |
| 1882 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
| 1883 | ) |
| 1884 | ); |
| 1885 | assert_eq!( |
| 1886 | p("(?<foo>ab)" ), |
| 1887 | named_cap( |
| 1888 | 1, |
| 1889 | "foo" , |
| 1890 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
| 1891 | ) |
| 1892 | ); |
| 1893 | |
| 1894 | assert_eq!(p(r"(?<a>z)" ), named_cap(1, "a" , Hir::char('z' ))); |
| 1895 | assert_eq!(p(r"(?P<a>z)" ), named_cap(1, "a" , Hir::char('z' ))); |
| 1896 | |
| 1897 | assert_eq!(p(r"(?<a_1>z)" ), named_cap(1, "a_1" , Hir::char('z' ))); |
| 1898 | assert_eq!(p(r"(?P<a_1>z)" ), named_cap(1, "a_1" , Hir::char('z' ))); |
| 1899 | |
| 1900 | assert_eq!(p(r"(?<a.1>z)" ), named_cap(1, "a.1" , Hir::char('z' ))); |
| 1901 | assert_eq!(p(r"(?P<a.1>z)" ), named_cap(1, "a.1" , Hir::char('z' ))); |
| 1902 | |
| 1903 | assert_eq!(p(r"(?<a[1]>z)" ), named_cap(1, "a[1]" , Hir::char('z' ))); |
| 1904 | assert_eq!(p(r"(?P<a[1]>z)" ), named_cap(1, "a[1]" , Hir::char('z' ))); |
| 1905 | |
| 1906 | assert_eq!(p(r"(?<a¾>z)" ), named_cap(1, "a¾" , Hir::char('z' ))); |
| 1907 | assert_eq!(p(r"(?P<a¾>z)" ), named_cap(1, "a¾" , Hir::char('z' ))); |
| 1908 | |
| 1909 | assert_eq!(p(r"(?<名字>z)" ), named_cap(1, "名字" , Hir::char('z' ))); |
| 1910 | assert_eq!(p(r"(?P<名字>z)" ), named_cap(1, "名字" , Hir::char('z' ))); |
| 1911 | } |
| 1912 | |
| 1913 | #[test ] |
| 1914 | fn ok_class() { |
| 1915 | assert_eq!(p(r"[a]" ), singles(['a' ])); |
| 1916 | assert_eq!(p(r"[a\]]" ), singles(['a' , ']' ])); |
| 1917 | assert_eq!(p(r"[a\-z]" ), singles(['a' , '-' , 'z' ])); |
| 1918 | assert_eq!(p(r"[ab]" ), class([('a' , 'b' )])); |
| 1919 | assert_eq!(p(r"[a-]" ), singles(['a' , '-' ])); |
| 1920 | assert_eq!(p(r"[-a]" ), singles(['a' , '-' ])); |
| 1921 | assert_eq!(p(r"[--a]" ), singles(['a' , '-' ])); |
| 1922 | assert_eq!(p(r"[---a]" ), singles(['a' , '-' ])); |
| 1923 | assert_eq!(p(r"[[:alnum:]]" ), posix("alnum" )); |
| 1924 | assert_eq!(p(r"[\w]" ), posix("word" )); |
| 1925 | assert_eq!(p(r"[a\wz]" ), posix("word" )); |
| 1926 | assert_eq!(p(r"[\s\S]" ), class([(' \x00' , ' \u{10FFFF}' )])); |
| 1927 | assert_eq!(p(r"[^\s\S]" ), Hir::fail()); |
| 1928 | assert_eq!(p(r"[a-cx-z]" ), class([('a' , 'c' ), ('x' , 'z' )])); |
| 1929 | assert_eq!(p(r"[☃-⛄]" ), class([('☃' , '⛄' )])); |
| 1930 | assert_eq!(p(r"[]]" ), singles([']' ])); |
| 1931 | assert_eq!(p(r"[]a]" ), singles([']' , 'a' ])); |
| 1932 | assert_eq!(p(r"[]\[]" ), singles(['[' , ']' ])); |
| 1933 | assert_eq!(p(r"[\[]" ), singles(['[' ])); |
| 1934 | |
| 1935 | assert_eq!(p(r"(?i)[a]" ), singles(['A' , 'a' ])); |
| 1936 | assert_eq!(p(r"(?i)[A]" ), singles(['A' , 'a' ])); |
| 1937 | assert_eq!(p(r"(?i)[k]" ), singles(['K' , 'k' ])); |
| 1938 | assert_eq!(p(r"(?i)[s]" ), singles(['S' , 's' ])); |
| 1939 | assert_eq!(p(r"(?i)[β]" ), singles(['β' ])); |
| 1940 | |
| 1941 | assert_eq!(p(r"[^^]" ), class([(' \x00' , ']' ), ('_' , ' \u{10FFFF}' )])); |
| 1942 | assert_eq!( |
| 1943 | p(r"[^-a]" ), |
| 1944 | class([(' \x00' , ',' ), ('.' , '`' ), ('b' , ' \u{10FFFF}' )]) |
| 1945 | ); |
| 1946 | |
| 1947 | assert_eq!( |
| 1948 | p(r"[-]a]" ), |
| 1949 | Hir::concat(vec![singles(['-' ]), Hir::char('a' ), Hir::char(']' )]) |
| 1950 | ); |
| 1951 | } |
| 1952 | |
| 1953 | #[test ] |
| 1954 | fn ok_verbatim() { |
| 1955 | assert_eq!( |
| 1956 | p(r"(?x)a{5,9} ?" ), |
| 1957 | Hir::repetition(hir::Repetition { |
| 1958 | min: 5, |
| 1959 | max: Some(9), |
| 1960 | greedy: false, |
| 1961 | sub: Box::new(Hir::char('a' )), |
| 1962 | }) |
| 1963 | ); |
| 1964 | assert_eq!(p(r"(?x)[ a]" ), singles(['a' ])); |
| 1965 | assert_eq!( |
| 1966 | p(r"(?x)[ ^ a]" ), |
| 1967 | class([(' \x00' , '`' ), ('b' , ' \u{10FFFF}' )]) |
| 1968 | ); |
| 1969 | assert_eq!(p(r"(?x)[ - a]" ), singles(['a' , '-' ])); |
| 1970 | assert_eq!(p(r"(?x)[ ] a]" ), singles([']' , 'a' ])); |
| 1971 | |
| 1972 | assert_eq!( |
| 1973 | p(r"(?x)a b" ), |
| 1974 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]) |
| 1975 | ); |
| 1976 | assert_eq!( |
| 1977 | p(r"(?x)a b(?-x)a b" ), |
| 1978 | Hir::concat(vec![ |
| 1979 | Hir::char('a' ), |
| 1980 | Hir::char('b' ), |
| 1981 | Hir::char('a' ), |
| 1982 | Hir::char(' ' ), |
| 1983 | Hir::char('b' ), |
| 1984 | ]) |
| 1985 | ); |
| 1986 | assert_eq!( |
| 1987 | p(r"a (?x:a )a " ), |
| 1988 | Hir::concat(vec![ |
| 1989 | Hir::char('a' ), |
| 1990 | Hir::char(' ' ), |
| 1991 | Hir::char('a' ), |
| 1992 | Hir::char('a' ), |
| 1993 | Hir::char(' ' ), |
| 1994 | ]) |
| 1995 | ); |
| 1996 | assert_eq!( |
| 1997 | p(r"(?x)( ?P<foo> a )" ), |
| 1998 | named_cap(1, "foo" , Hir::char('a' )), |
| 1999 | ); |
| 2000 | assert_eq!(p(r"(?x)( a )" ), cap(1, Hir::char('a' ))); |
| 2001 | assert_eq!(p(r"(?x)( ?: a )" ), Hir::char('a' )); |
| 2002 | assert_eq!(p(r"(?x)\x { 53 }" ), Hir::char(' \x53' )); |
| 2003 | assert_eq!(p(r"(?x)\ " ), Hir::char(' ' )); |
| 2004 | } |
| 2005 | |
| 2006 | #[test ] |
| 2007 | fn ok_comments() { |
| 2008 | let pat = "(?x) |
| 2009 | # This is comment 1. |
| 2010 | foo # This is comment 2. |
| 2011 | # This is comment 3. |
| 2012 | bar |
| 2013 | # This is comment 4." ; |
| 2014 | assert_eq!( |
| 2015 | p(pat), |
| 2016 | Hir::concat(vec![ |
| 2017 | Hir::char('f' ), |
| 2018 | Hir::char('o' ), |
| 2019 | Hir::char('o' ), |
| 2020 | Hir::char('b' ), |
| 2021 | Hir::char('a' ), |
| 2022 | Hir::char('r' ), |
| 2023 | ]) |
| 2024 | ); |
| 2025 | } |
| 2026 | |
| 2027 | #[test ] |
| 2028 | fn err_standard() { |
| 2029 | assert_eq!( |
| 2030 | ERR_TOO_MUCH_NESTING, |
| 2031 | perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))" ), |
| 2032 | ); |
| 2033 | // This one is tricky, because the only way it can happen is if the |
| 2034 | // number of captures overflows u32. Perhaps we should allow setting a |
| 2035 | // lower limit? |
| 2036 | // assert_eq!(ERR_TOO_MANY_CAPTURES, perr("")); |
| 2037 | assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)" )); |
| 2038 | assert_eq!(ERR_UNCLOSED_GROUP, perr("(" )); |
| 2039 | assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?" )); |
| 2040 | assert_eq!(ERR_UNOPENED_GROUP, perr(")" )); |
| 2041 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)" )); |
| 2042 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)" )); |
| 2043 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)" )); |
| 2044 | assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)" )); |
| 2045 | assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)" )); |
| 2046 | assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<" )); |
| 2047 | assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<" )); |
| 2048 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)" )); |
| 2049 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)" )); |
| 2050 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)" )); |
| 2051 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)" )); |
| 2052 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)" )); |
| 2053 | assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)" )); |
| 2054 | assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo" )); |
| 2055 | assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo" )); |
| 2056 | assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)" )); |
| 2057 | assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)" )); |
| 2058 | assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)" )); |
| 2059 | assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)" )); |
| 2060 | assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)" )); |
| 2061 | assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)" )); |
| 2062 | assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is" )); |
| 2063 | assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)" )); |
| 2064 | assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}" )); |
| 2065 | assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{" )); |
| 2066 | assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A" )); |
| 2067 | assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}" )); |
| 2068 | assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}" )); |
| 2069 | assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA" )); |
| 2070 | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ" )); |
| 2071 | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA" )); |
| 2072 | assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ" )); |
| 2073 | assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800" )); |
| 2074 | assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF" )); |
| 2075 | assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x" )); |
| 2076 | assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\" )); |
| 2077 | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0" )); |
| 2078 | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1" )); |
| 2079 | assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8" )); |
| 2080 | assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL" )); |
| 2081 | assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}" )); |
| 2082 | assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i" )); |
| 2083 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?" )); |
| 2084 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*" )); |
| 2085 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+" )); |
| 2086 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)" )); |
| 2087 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?" )); |
| 2088 | assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?" )); |
| 2089 | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}" )); |
| 2090 | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})" )); |
| 2091 | assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}" )); |
| 2092 | assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{" )); |
| 2093 | assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5" )); |
| 2094 | assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5," )); |
| 2095 | assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6" )); |
| 2096 | assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z" )); |
| 2097 | assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}" )); |
| 2098 | assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}" )); |
| 2099 | assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}" )); |
| 2100 | assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}" )); |
| 2101 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a" )); |
| 2102 | assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]" )); |
| 2103 | assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]" )); |
| 2104 | assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]" )); |
| 2105 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-" )); |
| 2106 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^" )); |
| 2107 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]" )); |
| 2108 | assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]" )); |
| 2109 | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[" )); |
| 2110 | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z" )); |
| 2111 | assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]" )); |
| 2112 | assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]" )); |
| 2113 | assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]" )); |
| 2114 | assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]" )); |
| 2115 | assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]" )); |
| 2116 | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo" )); |
| 2117 | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}" )); |
| 2118 | assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}" )); |
| 2119 | assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{" )); |
| 2120 | assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ " )); |
| 2121 | } |
| 2122 | |
| 2123 | #[test ] |
| 2124 | fn err_verbatim() { |
| 2125 | // See: https://github.com/rust-lang/regex/issues/792 |
| 2126 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]" )); |
| 2127 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a " )); |
| 2128 | assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- " )); |
| 2129 | assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ " )); |
| 2130 | } |
| 2131 | |
| 2132 | // This tests a bug fix where the nest limit checker wasn't decrementing |
| 2133 | // its depth during post-traversal, which causes long regexes to trip |
| 2134 | // the default limit too aggressively. |
| 2135 | #[test ] |
| 2136 | fn regression_454_nest_too_big() { |
| 2137 | let pattern = r#" |
| 2138 | 2(?: |
| 2139 | [45]\d{3}| |
| 2140 | 7(?: |
| 2141 | 1[0-267]| |
| 2142 | 2[0-289]| |
| 2143 | 3[0-29]| |
| 2144 | 4[01]| |
| 2145 | 5[1-3]| |
| 2146 | 6[013]| |
| 2147 | 7[0178]| |
| 2148 | 91 |
| 2149 | )| |
| 2150 | 8(?: |
| 2151 | 0[125]| |
| 2152 | [139][1-6]| |
| 2153 | 2[0157-9]| |
| 2154 | 41| |
| 2155 | 6[1-35]| |
| 2156 | 7[1-5]| |
| 2157 | 8[1-8]| |
| 2158 | 90 |
| 2159 | )| |
| 2160 | 9(?: |
| 2161 | 0[0-2]| |
| 2162 | 1[0-4]| |
| 2163 | 2[568]| |
| 2164 | 3[3-6]| |
| 2165 | 5[5-7]| |
| 2166 | 6[0167]| |
| 2167 | 7[15]| |
| 2168 | 8[0146-9] |
| 2169 | ) |
| 2170 | )\d{4} |
| 2171 | "# ; |
| 2172 | p(pattern); |
| 2173 | } |
| 2174 | |
| 2175 | // This tests that we treat a trailing `-` in a character class as a |
| 2176 | // literal `-` even when whitespace mode is enabled and there is whitespace |
| 2177 | // after the trailing `-`. |
| 2178 | #[test ] |
| 2179 | fn regression_455_trailing_dash_ignore_whitespace() { |
| 2180 | p("(?x)[ / - ]" ); |
| 2181 | p("(?x)[ a - ]" ); |
| 2182 | p("(?x)[ |
| 2183 | a |
| 2184 | - ] |
| 2185 | " ); |
| 2186 | p("(?x)[ |
| 2187 | a # wat |
| 2188 | - ] |
| 2189 | " ); |
| 2190 | |
| 2191 | perr("(?x)[ / -" ); |
| 2192 | perr("(?x)[ / - " ); |
| 2193 | perr( |
| 2194 | "(?x)[ |
| 2195 | / - |
| 2196 | " , |
| 2197 | ); |
| 2198 | perr( |
| 2199 | "(?x)[ |
| 2200 | / - # wat |
| 2201 | " , |
| 2202 | ); |
| 2203 | } |
| 2204 | |
| 2205 | #[test ] |
| 2206 | fn regression_capture_indices() { |
| 2207 | let got = p(r"(a|ab|c|bcd){4,10}(d*)" ); |
| 2208 | assert_eq!( |
| 2209 | got, |
| 2210 | Hir::concat(vec![ |
| 2211 | Hir::repetition(hir::Repetition { |
| 2212 | min: 4, |
| 2213 | max: Some(10), |
| 2214 | greedy: true, |
| 2215 | sub: Box::new(cap( |
| 2216 | 1, |
| 2217 | Hir::alternation(vec![ |
| 2218 | Hir::char('a' ), |
| 2219 | Hir::concat(vec![Hir::char('a' ), Hir::char('b' )]), |
| 2220 | Hir::char('c' ), |
| 2221 | Hir::concat(vec![ |
| 2222 | Hir::char('b' ), |
| 2223 | Hir::char('c' ), |
| 2224 | Hir::char('d' ) |
| 2225 | ]), |
| 2226 | ]) |
| 2227 | )) |
| 2228 | }), |
| 2229 | cap( |
| 2230 | 2, |
| 2231 | Hir::repetition(hir::Repetition { |
| 2232 | min: 0, |
| 2233 | max: None, |
| 2234 | greedy: true, |
| 2235 | sub: Box::new(Hir::char('d' )), |
| 2236 | }) |
| 2237 | ), |
| 2238 | ]) |
| 2239 | ); |
| 2240 | } |
| 2241 | } |
| 2242 | |