1use core::cell::{Cell, RefCell};
2
3use alloc::{
4 boxed::Box,
5 string::{String, ToString},
6 vec,
7 vec::Vec,
8};
9
10use crate::{
11 error::Error,
12 hir::{self, Config, Flags, Hir, HirKind},
13};
14
15// These are all of the errors that can occur while parsing a regex. Unlike
16// regex-syntax, our errors are not particularly great. They are just enough
17// to get a general sense of what went wrong. But in exchange, the error
18// reporting mechanism is *much* simpler than what's in regex-syntax.
19//
20// By convention, we use each of these messages in exactly one place. That
21// way, every branch that leads to an error has a unique message. This in turn
22// means that given a message, one can precisely identify which part of the
23// parser reported it.
24//
25// Finally, we give names to each message so that we can reference them in
26// tests.
27const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting";
28const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups";
29const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name";
30const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'";
31const ERR_UNCLOSED_GROUP_QUESTION: &str =
32 "expected closing ')', but got end of pattern";
33const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('";
34const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported";
35const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed";
36const ERR_MISSING_GROUP_NAME: &str =
37 "expected capture group name, but got end of pattern";
38const ERR_INVALID_GROUP_NAME: &str = "invalid group name";
39const ERR_UNCLOSED_GROUP_NAME: &str =
40 "expected end of capture group name, but got end of pattern";
41const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed";
42const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag";
43const ERR_FLAG_REPEATED_NEGATION: &str =
44 "inline flag negation cannot be repeated";
45const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed";
46const ERR_FLAG_UNEXPECTED_EOF: &str =
47 "expected ':' or ')' to end inline flags, but got end of pattern";
48const ERR_FLAG_DANGLING_NEGATION: &str =
49 "inline flags cannot end with negation directive";
50const ERR_DECIMAL_NO_DIGITS: &str =
51 "expected decimal number, but found no digits";
52const ERR_DECIMAL_INVALID: &str = "got invalid decimal number";
53const ERR_HEX_BRACE_INVALID_DIGIT: &str =
54 "expected hexadecimal number in braces, but got non-hex digit";
55const ERR_HEX_BRACE_UNEXPECTED_EOF: &str =
56 "expected hexadecimal number, but saw end of pattern before closing brace";
57const ERR_HEX_BRACE_EMPTY: &str =
58 "expected hexadecimal number in braces, but got no digits";
59const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces";
60const ERR_HEX_FIXED_UNEXPECTED_EOF: &str =
61 "expected fixed length hexadecimal number, but saw end of pattern first";
62const ERR_HEX_FIXED_INVALID_DIGIT: &str =
63 "expected fixed length hexadecimal number, but got non-hex digit";
64const ERR_HEX_FIXED_INVALID: &str =
65 "got invalid fixed length hexadecimal number";
66const ERR_HEX_UNEXPECTED_EOF: &str =
67 "expected hexadecimal number, but saw end of pattern first";
68const ERR_ESCAPE_UNEXPECTED_EOF: &str =
69 "saw start of escape sequence, but saw end of pattern before it finished";
70const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported";
71const ERR_UNICODE_CLASS_UNSUPPORTED: &str =
72 "Unicode character classes are not supported";
73const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence";
74const ERR_POSIX_CLASS_UNRECOGNIZED: &str =
75 "unrecognized POSIX character class";
76const ERR_UNCOUNTED_REP_SUB_MISSING: &str =
77 "uncounted repetition operator must be applied to a sub-expression";
78const ERR_COUNTED_REP_SUB_MISSING: &str =
79 "counted repetition operator must be applied to a sub-expression";
80const ERR_COUNTED_REP_UNCLOSED: &str =
81 "found unclosed counted repetition operator";
82const ERR_COUNTED_REP_MIN_UNCLOSED: &str =
83 "found incomplete and unclosed counted repetition operator";
84const ERR_COUNTED_REP_COMMA_UNCLOSED: &str =
85 "found counted repetition operator with a comma that is unclosed";
86const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str =
87 "found counted repetition with min and max that is unclosed";
88const ERR_COUNTED_REP_INVALID: &str =
89 "expected closing brace for counted repetition, but got something else";
90const ERR_COUNTED_REP_INVALID_RANGE: &str =
91 "found counted repetition with a min bigger than its max";
92const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str =
93 "non-empty character class has no closing bracket";
94const ERR_CLASS_INVALID_RANGE_ITEM: &str =
95 "character class ranges must start and end with a single character";
96const ERR_CLASS_INVALID_ITEM: &str =
97 "invalid escape sequence in character class";
98const ERR_CLASS_UNCLOSED_AFTER_DASH: &str =
99 "non-empty character class has no closing bracket after dash";
100const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str =
101 "negated character class has no closing bracket";
102const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str =
103 "character class begins with literal ']' but has no closing bracket";
104const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class";
105const ERR_CLASS_UNCLOSED: &str = "found unclosed character class";
106const ERR_CLASS_NEST_UNSUPPORTED: &str =
107 "nested character classes are not supported";
108const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str =
109 "character class intersection is not supported";
110const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111 "character class difference is not supported";
112const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
113 "character class symmetric difference is not supported";
114const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
115 "special word boundary assertion is unclosed or has an invalid character";
116const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
117 "special word boundary assertion is unrecognized";
118const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
119 "found start of special word boundary or repetition without an end";
120
121/// A regular expression parser.
122///
123/// This parses a string representation of a regular expression into an
124/// abstract syntax tree. The size of the tree is proportional to the length
125/// of the regular expression pattern.
126///
127/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
128#[derive(Clone, Debug)]
129pub(super) struct Parser<'a> {
130 /// The configuration of the parser as given by the caller.
131 config: Config,
132 /// The pattern we're parsing as given by the caller.
133 pattern: &'a str,
134 /// The call depth of the parser. This is incremented for each
135 /// sub-expression parsed. Its peak value is the maximum nesting of the
136 /// pattern.
137 depth: Cell<u32>,
138 /// The current position of the parser.
139 pos: Cell<usize>,
140 /// The current codepoint of the parser. The codepoint corresponds to the
141 /// codepoint encoded in `pattern` beginning at `pos`.
142 ///
143 /// This is `None` if and only if `pos == pattern.len()`.
144 char: Cell<Option<char>>,
145 /// The current capture index.
146 capture_index: Cell<u32>,
147 /// The flags that are currently set.
148 flags: RefCell<Flags>,
149 /// A sorted sequence of capture names. This is used to detect duplicate
150 /// capture names and report an error if one is detected.
151 capture_names: RefCell<Vec<String>>,
152}
153
154/// The constructor and a variety of helper routines.
155impl<'a> Parser<'a> {
156 /// Build a parser from this configuration with the given pattern.
157 pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> {
158 Parser {
159 config,
160 pattern,
161 depth: Cell::new(0),
162 pos: Cell::new(0),
163 char: Cell::new(pattern.chars().next()),
164 capture_index: Cell::new(0),
165 flags: RefCell::new(config.flags),
166 capture_names: RefCell::new(vec![]),
167 }
168 }
169
170 /// Returns the full pattern string that we're parsing.
171 fn pattern(&self) -> &str {
172 self.pattern
173 }
174
175 /// Return the current byte offset of the parser.
176 ///
177 /// The offset starts at `0` from the beginning of the regular expression
178 /// pattern string.
179 fn pos(&self) -> usize {
180 self.pos.get()
181 }
182
183 /// Increments the call depth of the parser.
184 ///
185 /// If the call depth would exceed the configured nest limit, then this
186 /// returns an error.
187 ///
188 /// This returns the old depth.
189 fn increment_depth(&self) -> Result<u32, Error> {
190 let old = self.depth.get();
191 if old > self.config.nest_limit {
192 return Err(Error::new(ERR_TOO_MUCH_NESTING));
193 }
194 // OK because our depth starts at 0, and we return an error if it
195 // ever reaches the limit. So the call depth can never exceed u32::MAX.
196 let new = old.checked_add(1).unwrap();
197 self.depth.set(new);
198 Ok(old)
199 }
200
201 /// Decrements the call depth of the parser.
202 ///
203 /// This panics if the current depth is 0.
204 fn decrement_depth(&self) {
205 let old = self.depth.get();
206 // If this fails then the caller has a bug in how they're incrementing
207 // and decrementing the depth of the parser's call stack.
208 let new = old.checked_sub(1).unwrap();
209 self.depth.set(new);
210 }
211
212 /// Return the codepoint at the current position of the parser.
213 ///
214 /// This panics if the parser is positioned at the end of the pattern.
215 fn char(&self) -> char {
216 self.char.get().expect("codepoint, but parser is done")
217 }
218
219 /// Returns true if the next call to `bump` would return false.
220 fn is_done(&self) -> bool {
221 self.pos() == self.pattern.len()
222 }
223
224 /// Returns the flags that are current set for this regex.
225 fn flags(&self) -> Flags {
226 *self.flags.borrow()
227 }
228
229 /// Bump the parser to the next Unicode scalar value.
230 ///
231 /// If the end of the input has been reached, then `false` is returned.
232 fn bump(&self) -> bool {
233 if self.is_done() {
234 return false;
235 }
236 self.pos.set(self.pos() + self.char().len_utf8());
237 self.char.set(self.pattern()[self.pos()..].chars().next());
238 self.char.get().is_some()
239 }
240
241 /// If the substring starting at the current position of the parser has
242 /// the given prefix, then bump the parser to the character immediately
243 /// following the prefix and return true. Otherwise, don't bump the parser
244 /// and return false.
245 fn bump_if(&self, prefix: &str) -> bool {
246 if self.pattern()[self.pos()..].starts_with(prefix) {
247 for _ in 0..prefix.chars().count() {
248 self.bump();
249 }
250 true
251 } else {
252 false
253 }
254 }
255
256 /// Bump the parser, and if the `x` flag is enabled, bump through any
257 /// subsequent spaces. Return true if and only if the parser is not done.
258 fn bump_and_bump_space(&self) -> bool {
259 if !self.bump() {
260 return false;
261 }
262 self.bump_space();
263 !self.is_done()
264 }
265
266 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
267 /// comments), then this will advance the parser through all whitespace
268 /// and comments to the next non-whitespace non-comment byte.
269 ///
270 /// If the `x` flag is disabled, then this is a no-op.
271 ///
272 /// This should be used selectively throughout the parser where
273 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
274 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
275 fn bump_space(&self) {
276 if !self.flags().ignore_whitespace {
277 return;
278 }
279 while !self.is_done() {
280 if self.char().is_whitespace() {
281 self.bump();
282 } else if self.char() == '#' {
283 self.bump();
284 while !self.is_done() {
285 let c = self.char();
286 self.bump();
287 if c == '\n' {
288 break;
289 }
290 }
291 } else {
292 break;
293 }
294 }
295 }
296
297 /// Peek at the next character in the input without advancing the parser.
298 ///
299 /// If the input has been exhausted, then this returns `None`.
300 fn peek(&self) -> Option<char> {
301 if self.is_done() {
302 return None;
303 }
304 self.pattern()[self.pos() + self.char().len_utf8()..].chars().next()
305 }
306
307 /// Peeks at the next character in the pattern from the current offset, and
308 /// will ignore spaces when the parser is in whitespace insensitive mode.
309 fn peek_space(&self) -> Option<char> {
310 if !self.flags().ignore_whitespace {
311 return self.peek();
312 }
313 if self.is_done() {
314 return None;
315 }
316 let mut start = self.pos() + self.char().len_utf8();
317 let mut in_comment = false;
318 for (i, ch) in self.pattern()[start..].char_indices() {
319 if ch.is_whitespace() {
320 continue;
321 } else if !in_comment && ch == '#' {
322 in_comment = true;
323 } else if in_comment && ch == '\n' {
324 in_comment = false;
325 } else {
326 start += i;
327 break;
328 }
329 }
330 self.pattern()[start..].chars().next()
331 }
332
333 /// Return the next capturing index. Each subsequent call increments the
334 /// internal index. Since the way capture indices are computed is a public
335 /// API guarantee, use of this routine depends on the parser being depth
336 /// first and left-to-right.
337 ///
338 /// If the capture limit is exceeded, then an error is returned.
339 fn next_capture_index(&self) -> Result<u32, Error> {
340 let current = self.capture_index.get();
341 let next = current
342 .checked_add(1)
343 .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?;
344 self.capture_index.set(next);
345 Ok(next)
346 }
347
348 /// Adds the given capture name to this parser. If this capture name has
349 /// already been used, then an error is returned.
350 fn add_capture_name(&self, name: &str) -> Result<(), Error> {
351 let mut names = self.capture_names.borrow_mut();
352 match names.binary_search_by(|n| name.cmp(n)) {
353 Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)),
354 Err(i) => {
355 names.insert(i, name.to_string());
356 Ok(())
357 }
358 }
359 }
360
361 /// Returns true if and only if the parser is positioned at a look-around
362 /// prefix. The conditions under which this returns true must always
363 /// correspond to a regular expression that would otherwise be consider
364 /// invalid.
365 ///
366 /// This should only be called immediately after parsing the opening of
367 /// a group or a set of flags.
368 fn is_lookaround_prefix(&self) -> bool {
369 self.bump_if("?=")
370 || self.bump_if("?!")
371 || self.bump_if("?<=")
372 || self.bump_if("?<!")
373 }
374}
375
376/// The actual parser. We try to break out each kind of regex syntax into its
377/// own routine.
378impl<'a> Parser<'a> {
379 pub(super) fn parse(&self) -> Result<Hir, Error> {
380 let hir = self.parse_inner()?;
381 // While we also check nesting during parsing, that only checks the
382 // number of recursive parse calls. It does not necessarily cover
383 // all possible recursive nestings of the Hir itself. For example,
384 // repetition operators don't require recursive parse calls. So one
385 // can stack them arbitrarily without overflowing the stack in the
386 // *parser*. But then if one recurses over the resulting Hir, a stack
387 // overflow is possible. So here we check the Hir nesting level
388 // thoroughly to ensure it isn't nested too deeply.
389 //
390 // Note that we do still need the nesting limit check in the parser as
391 // well, since that will avoid overflowing the stack during parse time
392 // before the complete Hir value is constructed.
393 check_hir_nesting(&hir, self.config.nest_limit)?;
394 Ok(hir)
395 }
396
397 fn parse_inner(&self) -> Result<Hir, Error> {
398 let depth = self.increment_depth()?;
399 let mut alternates = vec![];
400 let mut concat = vec![];
401 loop {
402 self.bump_space();
403 if self.is_done() {
404 break;
405 }
406 match self.char() {
407 '(' => {
408 // Save the old flags and reset them only when we close
409 // the group.
410 let oldflags = *self.flags.borrow();
411 if let Some(sub) = self.parse_group()? {
412 concat.push(sub);
413 // We only reset them here because if 'parse_group'
414 // returns None, then that means it handled a flag
415 // directive, e.g., '(?ism)'. And the whole point is
416 // that those flags remain active until either disabled
417 // or the end of the pattern or current group.
418 *self.flags.borrow_mut() = oldflags;
419 }
420 if self.char.get() != Some(')') {
421 return Err(Error::new(ERR_UNCLOSED_GROUP));
422 }
423 self.bump();
424 }
425 ')' => {
426 if depth == 0 {
427 return Err(Error::new(ERR_UNOPENED_GROUP));
428 }
429 break;
430 }
431 '|' => {
432 alternates.push(Hir::concat(core::mem::take(&mut concat)));
433 self.bump();
434 }
435 '[' => concat.push(self.parse_class()?),
436 '?' | '*' | '+' => {
437 concat = self.parse_uncounted_repetition(concat)?;
438 }
439 '{' => {
440 concat = self.parse_counted_repetition(concat)?;
441 }
442 _ => concat.push(self.parse_primitive()?),
443 }
444 }
445 self.decrement_depth();
446 alternates.push(Hir::concat(concat));
447 // N.B. This strips off the "alternation" if there's only one branch.
448 Ok(Hir::alternation(alternates))
449 }
450
451 /// Parses a "primitive" pattern. A primitive is any expression that does
452 /// not contain any sub-expressions.
453 ///
454 /// This assumes the parser is pointing at the beginning of the primitive.
455 fn parse_primitive(&self) -> Result<Hir, Error> {
456 let ch = self.char();
457 self.bump();
458 match ch {
459 '\\' => self.parse_escape(),
460 '.' => Ok(self.hir_dot()),
461 '^' => Ok(self.hir_anchor_start()),
462 '$' => Ok(self.hir_anchor_end()),
463 ch => Ok(self.hir_char(ch)),
464 }
465 }
466
467 /// Parse an escape sequence. This always results in a "primitive" HIR,
468 /// that is, an HIR with no sub-expressions.
469 ///
470 /// This assumes the parser is positioned at the start of the sequence,
471 /// immediately *after* the `\`. It advances the parser to the first
472 /// position immediately following the escape sequence.
473 fn parse_escape(&self) -> Result<Hir, Error> {
474 if self.is_done() {
475 return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF));
476 }
477 let ch = self.char();
478 // Put some of the more complicated routines into helpers.
479 match ch {
480 '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)),
481 'p' | 'P' => {
482 return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED))
483 }
484 'x' | 'u' | 'U' => return self.parse_hex(),
485 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
486 return Ok(self.parse_perl_class());
487 }
488 _ => {}
489 }
490
491 // Handle all of the one letter sequences inline.
492 self.bump();
493 if hir::is_meta_character(ch) || hir::is_escapeable_character(ch) {
494 return Ok(self.hir_char(ch));
495 }
496 let special = |ch| Ok(self.hir_char(ch));
497 match ch {
498 'a' => special('\x07'),
499 'f' => special('\x0C'),
500 't' => special('\t'),
501 'n' => special('\n'),
502 'r' => special('\r'),
503 'v' => special('\x0B'),
504 'A' => Ok(Hir::look(hir::Look::Start)),
505 'z' => Ok(Hir::look(hir::Look::End)),
506 'b' => {
507 let mut hir = Hir::look(hir::Look::Word);
508 if !self.is_done() && self.char() == '{' {
509 if let Some(special) =
510 self.maybe_parse_special_word_boundary()?
511 {
512 hir = special;
513 }
514 }
515 Ok(hir)
516 }
517 'B' => Ok(Hir::look(hir::Look::WordNegate)),
518 '<' => Ok(Hir::look(hir::Look::WordStart)),
519 '>' => Ok(Hir::look(hir::Look::WordEnd)),
520 _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
521 }
522 }
523
524 /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
525 /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
526 ///
527 /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
528 /// if it fails it will just return `None` with no error. This is done
529 /// because `\b{5}` is a valid expression and we want to let that be parsed
530 /// by the existing counted repetition parsing code. (I thought about just
531 /// invoking the counted repetition code from here, but it seemed a little
532 /// ham-fisted.)
533 ///
534 /// Unlike `maybe_parse_ascii_class` though, this can return an error.
535 /// Namely, if we definitely know it isn't a counted repetition, then we
536 /// return an error specific to the specialty word boundaries.
537 ///
538 /// This assumes the parser is positioned at a `{` immediately following
539 /// a `\b`. When `None` is returned, the parser is returned to the position
540 /// at which it started: pointing at a `{`.
541 ///
542 /// The position given should correspond to the start of the `\b`.
543 fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
544 assert_eq!(self.char(), '{');
545
546 let is_valid_char = |c| match c {
547 'A'..='Z' | 'a'..='z' | '-' => true,
548 _ => false,
549 };
550 let start = self.pos();
551 if !self.bump_and_bump_space() {
552 return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
553 }
554 // This is one of the critical bits: if the first non-whitespace
555 // character isn't in [-A-Za-z] (i.e., this can't be a special word
556 // boundary), then we bail and let the counted repetition parser deal
557 // with this.
558 if !is_valid_char(self.char()) {
559 self.pos.set(start);
560 self.char.set(Some('{'));
561 return Ok(None);
562 }
563
564 // Now collect up our chars until we see a '}'.
565 let mut scratch = String::new();
566 while !self.is_done() && is_valid_char(self.char()) {
567 scratch.push(self.char());
568 self.bump_and_bump_space();
569 }
570 if self.is_done() || self.char() != '}' {
571 return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
572 }
573 self.bump();
574 let kind = match scratch.as_str() {
575 "start" => hir::Look::WordStart,
576 "end" => hir::Look::WordEnd,
577 "start-half" => hir::Look::WordStartHalf,
578 "end-half" => hir::Look::WordEndHalf,
579 _ => {
580 return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
581 }
582 };
583 Ok(Some(Hir::look(kind)))
584 }
585
586 /// Parse a hex representation of a Unicode codepoint. This handles both
587 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
588 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
589 /// the first character immediately following the hexadecimal literal.
590 fn parse_hex(&self) -> Result<Hir, Error> {
591 let digit_len = match self.char() {
592 'x' => 2,
593 'u' => 4,
594 'U' => 8,
595 unk => unreachable!(
596 "invalid start of fixed length hexadecimal number {}",
597 unk
598 ),
599 };
600 if !self.bump_and_bump_space() {
601 return Err(Error::new(ERR_HEX_UNEXPECTED_EOF));
602 }
603 if self.char() == '{' {
604 self.parse_hex_brace()
605 } else {
606 self.parse_hex_digits(digit_len)
607 }
608 }
609
610 /// Parse an N-digit hex representation of a Unicode codepoint. This
611 /// expects the parser to be positioned at the first digit and will advance
612 /// the parser to the first character immediately following the escape
613 /// sequence.
614 ///
615 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
616 /// or 8 (for `\UNNNNNNNN`).
617 fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> {
618 let mut scratch = String::new();
619 for i in 0..digit_len {
620 if i > 0 && !self.bump_and_bump_space() {
621 return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF));
622 }
623 if !is_hex(self.char()) {
624 return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT));
625 }
626 scratch.push(self.char());
627 }
628 // The final bump just moves the parser past the literal, which may
629 // be EOF.
630 self.bump_and_bump_space();
631 match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
632 None => Err(Error::new(ERR_HEX_FIXED_INVALID)),
633 Some(ch) => Ok(self.hir_char(ch)),
634 }
635 }
636
637 /// Parse a hex representation of any Unicode scalar value. This expects
638 /// the parser to be positioned at the opening brace `{` and will advance
639 /// the parser to the first character following the closing brace `}`.
640 fn parse_hex_brace(&self) -> Result<Hir, Error> {
641 let mut scratch = String::new();
642 while self.bump_and_bump_space() && self.char() != '}' {
643 if !is_hex(self.char()) {
644 return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT));
645 }
646 scratch.push(self.char());
647 }
648 if self.is_done() {
649 return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF));
650 }
651 assert_eq!(self.char(), '}');
652 self.bump_and_bump_space();
653
654 if scratch.is_empty() {
655 return Err(Error::new(ERR_HEX_BRACE_EMPTY));
656 }
657 match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
658 None => Err(Error::new(ERR_HEX_BRACE_INVALID)),
659 Some(ch) => Ok(self.hir_char(ch)),
660 }
661 }
662
663 /// Parse a decimal number into a u32 while trimming leading and trailing
664 /// whitespace.
665 ///
666 /// This expects the parser to be positioned at the first position where
667 /// a decimal digit could occur. This will advance the parser to the byte
668 /// immediately following the last contiguous decimal digit.
669 ///
670 /// If no decimal digit could be found or if there was a problem parsing
671 /// the complete set of digits into a u32, then an error is returned.
672 fn parse_decimal(&self) -> Result<u32, Error> {
673 let mut scratch = String::new();
674 while !self.is_done() && self.char().is_whitespace() {
675 self.bump();
676 }
677 while !self.is_done() && '0' <= self.char() && self.char() <= '9' {
678 scratch.push(self.char());
679 self.bump_and_bump_space();
680 }
681 while !self.is_done() && self.char().is_whitespace() {
682 self.bump_and_bump_space();
683 }
684 let digits = scratch.as_str();
685 if digits.is_empty() {
686 return Err(Error::new(ERR_DECIMAL_NO_DIGITS));
687 }
688 match u32::from_str_radix(digits, 10).ok() {
689 Some(n) => Ok(n),
690 None => Err(Error::new(ERR_DECIMAL_INVALID)),
691 }
692 }
693
694 /// Parses an uncounted repetition operator. An uncounted repetition
695 /// operator includes `?`, `*` and `+`, but does not include the `{m,n}`
696 /// syntax. The current character should be one of `?`, `*` or `+`. Any
697 /// other character will result in a panic.
698 ///
699 /// This assumes that the parser is currently positioned at the repetition
700 /// operator and advances the parser to the first character after the
701 /// operator. (Note that the operator may include a single additional `?`,
702 /// which makes the operator ungreedy.)
703 ///
704 /// The caller should include the concatenation that is being built. The
705 /// concatenation returned includes the repetition operator applied to the
706 /// last expression in the given concatenation.
707 ///
708 /// If the concatenation is empty, then this returns an error.
709 fn parse_uncounted_repetition(
710 &self,
711 mut concat: Vec<Hir>,
712 ) -> Result<Vec<Hir>, Error> {
713 let sub = match concat.pop() {
714 Some(hir) => Box::new(hir),
715 None => {
716 return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING));
717 }
718 };
719 let (min, max) = match self.char() {
720 '?' => (0, Some(1)),
721 '*' => (0, None),
722 '+' => (1, None),
723 unk => unreachable!("unrecognized repetition operator '{}'", unk),
724 };
725 let mut greedy = true;
726 if self.bump() && self.char() == '?' {
727 greedy = false;
728 self.bump();
729 }
730 if self.flags().swap_greed {
731 greedy = !greedy;
732 }
733 concat.push(Hir::repetition(hir::Repetition {
734 min,
735 max,
736 greedy,
737 sub,
738 }));
739 Ok(concat)
740 }
741
742 /// Parses a counted repetition operation. A counted repetition operator
743 /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*` or
744 /// `+` operators.
745 ///
746 /// This assumes that the parser is currently at the opening `{` and
747 /// advances the parser to the first character after the operator. (Note
748 /// that the operator may include a single additional `?`, which makes the
749 /// operator ungreedy.)
750 ///
751 /// The caller should include the concatenation that is being built. The
752 /// concatenation returned includes the repetition operator applied to the
753 /// last expression in the given concatenation.
754 ///
755 /// If the concatenation is empty, then this returns an error.
756 fn parse_counted_repetition(
757 &self,
758 mut concat: Vec<Hir>,
759 ) -> Result<Vec<Hir>, Error> {
760 assert_eq!(self.char(), '{', "expected opening brace");
761 let sub = match concat.pop() {
762 Some(hir) => Box::new(hir),
763 None => {
764 return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING));
765 }
766 };
767 if !self.bump_and_bump_space() {
768 return Err(Error::new(ERR_COUNTED_REP_UNCLOSED));
769 }
770 let min = self.parse_decimal()?;
771 let mut max = Some(min);
772 if self.is_done() {
773 return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED));
774 }
775 if self.char() == ',' {
776 if !self.bump_and_bump_space() {
777 return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED));
778 }
779 if self.char() != '}' {
780 max = Some(self.parse_decimal()?);
781 } else {
782 max = None;
783 }
784 if self.is_done() {
785 return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED));
786 }
787 }
788 if self.char() != '}' {
789 return Err(Error::new(ERR_COUNTED_REP_INVALID));
790 }
791
792 let mut greedy = true;
793 if self.bump_and_bump_space() && self.char() == '?' {
794 greedy = false;
795 self.bump();
796 }
797 if self.flags().swap_greed {
798 greedy = !greedy;
799 }
800
801 if max.map_or(false, |max| min > max) {
802 return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE));
803 }
804 concat.push(Hir::repetition(hir::Repetition {
805 min,
806 max,
807 greedy,
808 sub,
809 }));
810 Ok(concat)
811 }
812
813 /// Parses the part of a pattern that starts with a `(`. This is usually
814 /// a group sub-expression, but might just be a directive that enables
815 /// (or disables) certain flags.
816 ///
817 /// This assumes the parser is pointing at the opening `(`.
818 fn parse_group(&self) -> Result<Option<Hir>, Error> {
819 assert_eq!(self.char(), '(');
820 self.bump_and_bump_space();
821 if self.is_lookaround_prefix() {
822 return Err(Error::new(ERR_LOOK_UNSUPPORTED));
823 }
824 if self.bump_if("?P<") || self.bump_if("?<") {
825 let index = self.next_capture_index()?;
826 let name = Some(Box::from(self.parse_capture_name()?));
827 let sub = Box::new(self.parse_inner()?);
828 let cap = hir::Capture { index, name, sub };
829 Ok(Some(Hir::capture(cap)))
830 } else if self.bump_if("?") {
831 if self.is_done() {
832 return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION));
833 }
834 let start = self.pos();
835 // The flags get reset in the top-level 'parse' routine.
836 *self.flags.borrow_mut() = self.parse_flags()?;
837 let consumed = self.pos() - start;
838 if self.char() == ')' {
839 // We don't allow empty flags, e.g., `(?)`.
840 if consumed == 0 {
841 return Err(Error::new(ERR_EMPTY_FLAGS));
842 }
843 Ok(None)
844 } else {
845 assert_eq!(':', self.char());
846 self.bump();
847 self.parse_inner().map(Some)
848 }
849 } else {
850 let index = self.next_capture_index()?;
851 let sub = Box::new(self.parse_inner()?);
852 let cap = hir::Capture { index, name: None, sub };
853 Ok(Some(Hir::capture(cap)))
854 }
855 }
856
857 /// Parses a capture group name. Assumes that the parser is positioned at
858 /// the first character in the name following the opening `<` (and may
859 /// possibly be EOF). This advances the parser to the first character
860 /// following the closing `>`.
861 fn parse_capture_name(&self) -> Result<&str, Error> {
862 if self.is_done() {
863 return Err(Error::new(ERR_MISSING_GROUP_NAME));
864 }
865 let start = self.pos();
866 loop {
867 if self.char() == '>' {
868 break;
869 }
870 if !is_capture_char(self.char(), self.pos() == start) {
871 return Err(Error::new(ERR_INVALID_GROUP_NAME));
872 }
873 if !self.bump() {
874 break;
875 }
876 }
877 let end = self.pos();
878 if self.is_done() {
879 return Err(Error::new(ERR_UNCLOSED_GROUP_NAME));
880 }
881 assert_eq!(self.char(), '>');
882 self.bump();
883 let name = &self.pattern()[start..end];
884 if name.is_empty() {
885 return Err(Error::new(ERR_EMPTY_GROUP_NAME));
886 }
887 self.add_capture_name(name)?;
888 Ok(name)
889 }
890
891 /// Parse a sequence of flags starting at the current character.
892 ///
893 /// This advances the parser to the character immediately following the
894 /// flags, which is guaranteed to be either `:` or `)`.
895 ///
896 /// # Errors
897 ///
898 /// If any flags are duplicated, then an error is returned.
899 ///
900 /// If the negation operator is used more than once, then an error is
901 /// returned.
902 ///
903 /// If no flags could be found or if the negation operation is not followed
904 /// by any flags, then an error is returned.
905 fn parse_flags(&self) -> Result<Flags, Error> {
906 let mut flags = *self.flags.borrow();
907 let mut negate = false;
908 // Keeps track of whether the previous flag item was a '-'. We use this
909 // to detect whether there is a dangling '-', which is invalid.
910 let mut last_was_negation = false;
911 // A set to keep track of the flags we've seen. Since all flags are
912 // ASCII, we only need 128 bytes.
913 let mut seen = [false; 128];
914 while self.char() != ':' && self.char() != ')' {
915 if self.char() == '-' {
916 last_was_negation = true;
917 if negate {
918 return Err(Error::new(ERR_FLAG_REPEATED_NEGATION));
919 }
920 negate = true;
921 } else {
922 last_was_negation = false;
923 self.parse_flag(&mut flags, negate)?;
924 // OK because every valid flag is ASCII, and we're only here if
925 // the flag is valid.
926 let flag_byte = u8::try_from(self.char()).unwrap();
927 if seen[usize::from(flag_byte)] {
928 return Err(Error::new(ERR_FLAG_DUPLICATE));
929 }
930 seen[usize::from(flag_byte)] = true;
931 }
932 if !self.bump() {
933 return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF));
934 }
935 }
936 if last_was_negation {
937 return Err(Error::new(ERR_FLAG_DANGLING_NEGATION));
938 }
939 Ok(flags)
940 }
941
942 /// Parse the current character as a flag. Do not advance the parser.
943 ///
944 /// This sets the appropriate boolean value in place on the set of flags
945 /// given. The boolean is inverted when `negate` is true.
946 ///
947 /// # Errors
948 ///
949 /// If the flag is not recognized, then an error is returned.
950 fn parse_flag(
951 &self,
952 flags: &mut Flags,
953 negate: bool,
954 ) -> Result<(), Error> {
955 let enabled = !negate;
956 match self.char() {
957 'i' => flags.case_insensitive = enabled,
958 'm' => flags.multi_line = enabled,
959 's' => flags.dot_matches_new_line = enabled,
960 'U' => flags.swap_greed = enabled,
961 'R' => flags.crlf = enabled,
962 'x' => flags.ignore_whitespace = enabled,
963 // We make a special exception for this flag where we let it
964 // through as a recognized flag, but treat it as a no-op. This in
965 // practice retains some compatibility with the regex crate. It is
966 // a little suspect to do this, but for example, '(?-u:\b).+' in
967 // the regex crate is equivalent to '\b.+' in regex-lite.
968 'u' => {}
969 _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)),
970 }
971 Ok(())
972 }
973
974 /// Parse a standard character class consisting primarily of characters or
975 /// character ranges.
976 ///
977 /// This assumes the parser is positioned at the opening `[`. If parsing
978 /// is successful, then the parser is advanced to the position immediately
979 /// following the closing `]`.
980 fn parse_class(&self) -> Result<Hir, Error> {
981 assert_eq!(self.char(), '[');
982
983 let mut union = vec![];
984 if !self.bump_and_bump_space() {
985 return Err(Error::new(ERR_CLASS_UNCLOSED));
986 }
987 // Determine whether the class is negated or not.
988 let negate = if self.char() != '^' {
989 false
990 } else {
991 if !self.bump_and_bump_space() {
992 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION));
993 }
994 true
995 };
996 // Accept any number of `-` as literal `-`.
997 while self.char() == '-' {
998 union.push(hir::ClassRange { start: '-', end: '-' });
999 if !self.bump_and_bump_space() {
1000 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1001 }
1002 }
1003 // If `]` is the *first* char in a set, then interpret it as a literal
1004 // `]`. That is, an empty class is impossible to write.
1005 if union.is_empty() && self.char() == ']' {
1006 union.push(hir::ClassRange { start: ']', end: ']' });
1007 if !self.bump_and_bump_space() {
1008 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING));
1009 }
1010 }
1011 loop {
1012 self.bump_space();
1013 if self.is_done() {
1014 return Err(Error::new(ERR_CLASS_UNCLOSED));
1015 }
1016 match self.char() {
1017 '[' => {
1018 // Attempt to treat this as the beginning of a POSIX class.
1019 // If POSIX class parsing fails, then the parser backs up
1020 // to `[`.
1021 if let Some(class) = self.maybe_parse_posix_class() {
1022 union.extend_from_slice(&class.ranges);
1023 continue;
1024 }
1025 // ... otherwise we don't support nested classes.
1026 return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED));
1027 }
1028 ']' => {
1029 self.bump();
1030 let mut class = hir::Class::new(union);
1031 // Note that we must apply case folding before negation!
1032 // Consider `(?i)[^x]`. If we applied negation first, then
1033 // the result would be the character class that matched any
1034 // Unicode scalar value.
1035 if self.flags().case_insensitive {
1036 class.ascii_case_fold();
1037 }
1038 if negate {
1039 class.negate();
1040 }
1041 return Ok(Hir::class(class));
1042 }
1043 '&' if self.peek() == Some('&') => {
1044 return Err(Error::new(
1045 ERR_CLASS_INTERSECTION_UNSUPPORTED,
1046 ));
1047 }
1048 '-' if self.peek() == Some('-') => {
1049 return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED));
1050 }
1051 '~' if self.peek() == Some('~') => {
1052 return Err(Error::new(
1053 ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED,
1054 ));
1055 }
1056 _ => self.parse_class_range(&mut union)?,
1057 }
1058 }
1059 }
1060
1061 /// Parse a single primitive item in a character class set. The item to
1062 /// be parsed can either be one of a simple literal character, a range
1063 /// between two simple literal characters or a "primitive" character
1064 /// class like `\w`.
1065 ///
1066 /// If an invalid escape is found, or if a character class is found where
1067 /// a simple literal is expected (e.g., in a range), then an error is
1068 /// returned.
1069 ///
1070 /// Otherwise, the range (or ranges) are appended to the given union of
1071 /// ranges.
1072 fn parse_class_range(
1073 &self,
1074 union: &mut Vec<hir::ClassRange>,
1075 ) -> Result<(), Error> {
1076 let prim1 = self.parse_class_item()?;
1077 self.bump_space();
1078 if self.is_done() {
1079 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM));
1080 }
1081 // If the next char isn't a `-`, then we don't have a range.
1082 // There are two exceptions. If the char after a `-` is a `]`, then
1083 // `-` is interpreted as a literal `-`. Alternatively, if the char
1084 // after a `-` is a `-`, then `--` corresponds to a "difference"
1085 // operation. (Which we don't support in regex-lite, but error about
1086 // specifically in an effort to be loud about differences between the
1087 // main regex crate where possible.)
1088 if self.char() != '-'
1089 || self.peek_space() == Some(']')
1090 || self.peek_space() == Some('-')
1091 {
1092 union.extend_from_slice(&into_class_item_ranges(prim1)?);
1093 return Ok(());
1094 }
1095 // OK, now we're parsing a range, so bump past the `-` and parse the
1096 // second half of the range.
1097 if !self.bump_and_bump_space() {
1098 return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1099 }
1100 let prim2 = self.parse_class_item()?;
1101 let range = hir::ClassRange {
1102 start: into_class_item_range(prim1)?,
1103 end: into_class_item_range(prim2)?,
1104 };
1105 if range.start > range.end {
1106 return Err(Error::new(ERR_CLASS_INVALID_RANGE));
1107 }
1108 union.push(range);
1109 Ok(())
1110 }
1111
1112 /// Parse a single item in a character class as a primitive, where the
1113 /// primitive either consists of a verbatim literal or a single escape
1114 /// sequence.
1115 ///
1116 /// This assumes the parser is positioned at the beginning of a primitive,
1117 /// and advances the parser to the first position after the primitive if
1118 /// successful.
1119 ///
1120 /// Note that it is the caller's responsibility to report an error if an
1121 /// illegal primitive was parsed.
1122 fn parse_class_item(&self) -> Result<Hir, Error> {
1123 let ch = self.char();
1124 self.bump();
1125 if ch == '\\' {
1126 self.parse_escape()
1127 } else {
1128 Ok(Hir::char(ch))
1129 }
1130 }
1131
1132 /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`.
1133 ///
1134 /// This assumes the parser is positioned at the opening `[`.
1135 ///
1136 /// If no valid POSIX character class could be found, then this does not
1137 /// advance the parser and `None` is returned. Otherwise, the parser is
1138 /// advanced to the first byte following the closing `]` and the
1139 /// corresponding POSIX class is returned.
1140 fn maybe_parse_posix_class(&self) -> Option<hir::Class> {
1141 // POSIX character classes are interesting from a parsing perspective
1142 // because parsing cannot fail with any interesting error. For example,
1143 // in order to use an POSIX character class, it must be enclosed in
1144 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1145 // of it as "POSIX character classes have the syntax `[:NAME:]` which
1146 // can only appear within character brackets." This means that things
1147 // like `[[:lower:]A]` are legal constructs.
1148 //
1149 // However, if one types an incorrect POSIX character class, e.g.,
1150 // `[[:loower:]]`, then we treat that as if it were normal nested
1151 // character class containing the characters `:elorw`. (Which isn't
1152 // supported and results in an error in regex-lite.) One might argue
1153 // that we should return an error instead since the repeated colons
1154 // give away the intent to write an POSIX class. But what if the user
1155 // typed `[[:lower]]` instead? How can we tell that was intended to be
1156 // a POSXI class and not just a normal nested class?
1157 //
1158 // Reasonable people can probably disagree over this, but for better
1159 // or worse, we implement semantics that never fails at the expense of
1160 // better failure modes.
1161 assert_eq!(self.char(), '[');
1162
1163 // If parsing fails, then we back up the parser to this starting point.
1164 let start_pos = self.pos();
1165 let start_char = self.char.get();
1166 let reset = || {
1167 self.pos.set(start_pos);
1168 self.char.set(start_char);
1169 };
1170
1171 let mut negated = false;
1172 if !self.bump() || self.char() != ':' {
1173 reset();
1174 return None;
1175 }
1176 if !self.bump() {
1177 reset();
1178 return None;
1179 }
1180 if self.char() == '^' {
1181 negated = true;
1182 if !self.bump() {
1183 reset();
1184 return None;
1185 }
1186 }
1187 let name_start = self.pos();
1188 while self.char() != ':' && self.bump() {}
1189 if self.is_done() {
1190 reset();
1191 return None;
1192 }
1193 let name = &self.pattern()[name_start..self.pos()];
1194 if !self.bump_if(":]") {
1195 reset();
1196 return None;
1197 }
1198 if let Ok(ranges) = posix_class(name) {
1199 let mut class = hir::Class::new(ranges);
1200 if negated {
1201 class.negate();
1202 }
1203 return Some(class);
1204 }
1205 reset();
1206 None
1207 }
1208
1209 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
1210 /// parser is currently at a valid character class name and will be
1211 /// advanced to the character immediately following the class.
1212 fn parse_perl_class(&self) -> Hir {
1213 let ch = self.char();
1214 self.bump();
1215 let mut class = hir::Class::new(match ch {
1216 'd' | 'D' => posix_class("digit").unwrap(),
1217 's' | 'S' => posix_class("space").unwrap(),
1218 'w' | 'W' => posix_class("word").unwrap(),
1219 unk => unreachable!("invalid Perl class \\{}", unk),
1220 });
1221 if ch.is_ascii_uppercase() {
1222 class.negate();
1223 }
1224 Hir::class(class)
1225 }
1226
1227 fn hir_dot(&self) -> Hir {
1228 if self.flags().dot_matches_new_line {
1229 Hir::class(hir::Class::new([hir::ClassRange {
1230 start: '\x00',
1231 end: '\u{10FFFF}',
1232 }]))
1233 } else if self.flags().crlf {
1234 Hir::class(hir::Class::new([
1235 hir::ClassRange { start: '\x00', end: '\x09' },
1236 hir::ClassRange { start: '\x0B', end: '\x0C' },
1237 hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' },
1238 ]))
1239 } else {
1240 Hir::class(hir::Class::new([
1241 hir::ClassRange { start: '\x00', end: '\x09' },
1242 hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' },
1243 ]))
1244 }
1245 }
1246
1247 fn hir_anchor_start(&self) -> Hir {
1248 let look = if self.flags().multi_line {
1249 if self.flags().crlf {
1250 hir::Look::StartCRLF
1251 } else {
1252 hir::Look::StartLF
1253 }
1254 } else {
1255 hir::Look::Start
1256 };
1257 Hir::look(look)
1258 }
1259
1260 fn hir_anchor_end(&self) -> Hir {
1261 let look = if self.flags().multi_line {
1262 if self.flags().crlf {
1263 hir::Look::EndCRLF
1264 } else {
1265 hir::Look::EndLF
1266 }
1267 } else {
1268 hir::Look::End
1269 };
1270 Hir::look(look)
1271 }
1272
1273 fn hir_char(&self, ch: char) -> Hir {
1274 if self.flags().case_insensitive {
1275 let this = hir::ClassRange { start: ch, end: ch };
1276 if let Some(folded) = this.ascii_case_fold() {
1277 return Hir::class(hir::Class::new([this, folded]));
1278 }
1279 }
1280 Hir::char(ch)
1281 }
1282}
1283
1284/// This checks the depth of the given `Hir` value, and if it exceeds the given
1285/// limit, then an error is returned.
1286fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
1287 fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
1288 if depth > limit {
1289 return Err(Error::new(ERR_TOO_MUCH_NESTING));
1290 }
1291 let Some(next_depth) = depth.checked_add(1) else {
1292 return Err(Error::new(ERR_TOO_MUCH_NESTING));
1293 };
1294 match *hir.kind() {
1295 HirKind::Empty
1296 | HirKind::Char(_)
1297 | HirKind::Class(_)
1298 | HirKind::Look(_) => Ok(()),
1299 HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
1300 recurse(sub, limit, next_depth)
1301 }
1302 HirKind::Capture(hir::Capture { ref sub, .. }) => {
1303 recurse(sub, limit, next_depth)
1304 }
1305 HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
1306 for sub in subs.iter() {
1307 recurse(sub, limit, next_depth)?;
1308 }
1309 Ok(())
1310 }
1311 }
1312 }
1313 recurse(hir, limit, 0)
1314}
1315
1316/// Converts the given Hir to a literal char if the Hir is just a single
1317/// character. Otherwise this returns an error.
1318///
1319/// This is useful in contexts where you can only accept a single character,
1320/// but where it is convenient to parse something more general. For example,
1321/// parsing a single part of a character class range. It's useful to reuse
1322/// the literal parsing code, but that code can itself return entire classes
1323/// which can't be used as the start/end of a class range.
1324fn into_class_item_range(hir: Hir) -> Result<char, Error> {
1325 match hir.kind {
1326 HirKind::Char(ch: char) => Ok(ch),
1327 _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)),
1328 }
1329}
1330
1331fn into_class_item_ranges(
1332 mut hir: Hir,
1333) -> Result<Vec<hir::ClassRange>, Error> {
1334 match core::mem::replace(&mut hir.kind, src:HirKind::Empty) {
1335 HirKind::Char(ch: char) => Ok(vec![hir::ClassRange { start: ch, end: ch }]),
1336 HirKind::Class(hir::Class { ranges: Vec }) => Ok(ranges),
1337 _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)),
1338 }
1339}
1340
1341/// Returns an iterator of character class ranges for the given named POSIX
1342/// character class. If no such character class exists for the name given, then
1343/// an error is returned.
1344fn posix_class(
1345 kind: &str,
1346) -> Result<impl Iterator<Item = hir::ClassRange>, Error> {
1347 let slice: &'static [(u8, u8)] = match kind {
1348 "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1349 "alpha" => &[(b'A', b'Z'), (b'a', b'z')],
1350 "ascii" => &[(b'\x00', b'\x7F')],
1351 "blank" => &[(b'\t', b'\t'), (b' ', b' ')],
1352 "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1353 "digit" => &[(b'0', b'9')],
1354 "graph" => &[(b'!', b'~')],
1355 "lower" => &[(b'a', b'z')],
1356 "print" => &[(b' ', b'~')],
1357 "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1358 "space" => &[
1359 (b'\t', b'\t'),
1360 (b'\n', b'\n'),
1361 (b'\x0B', b'\x0B'),
1362 (b'\x0C', b'\x0C'),
1363 (b'\r', b'\r'),
1364 (b' ', b' '),
1365 ],
1366 "upper" => &[(b'A', b'Z')],
1367 "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1368 "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1369 _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)),
1370 };
1371 Ok(slice.iter().map(|&(start, end)| hir::ClassRange {
1372 start: char::from(start),
1373 end: char::from(end),
1374 }))
1375}
1376
1377/// Returns true if the given character is a hexadecimal digit.
1378fn is_hex(c: char) -> bool {
1379 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
1380}
1381
1382/// Returns true if the given character is a valid in a capture group name.
1383///
1384/// If `first` is true, then `c` is treated as the first character in the
1385/// group name (which must be alphabetic or underscore).
1386fn is_capture_char(c: char, first: bool) -> bool {
1387 if first {
1388 c == '_' || c.is_alphabetic()
1389 } else {
1390 c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
1391 }
1392}
1393
1394#[cfg(test)]
1395mod tests {
1396 use super::*;
1397
1398 fn p(pattern: &str) -> Hir {
1399 Parser::new(Config::default(), pattern).parse_inner().unwrap()
1400 }
1401
1402 fn perr(pattern: &str) -> String {
1403 Parser::new(Config::default(), pattern)
1404 .parse_inner()
1405 .unwrap_err()
1406 .to_string()
1407 }
1408
1409 fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir {
1410 Hir::class(hir::Class::new(
1411 it.into_iter().map(|(start, end)| hir::ClassRange { start, end }),
1412 ))
1413 }
1414
1415 fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir {
1416 Hir::class(hir::Class::new(
1417 it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }),
1418 ))
1419 }
1420
1421 fn posix(name: &str) -> Hir {
1422 Hir::class(hir::Class::new(posix_class(name).unwrap()))
1423 }
1424
1425 fn cap(index: u32, sub: Hir) -> Hir {
1426 Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) })
1427 }
1428
1429 fn named_cap(index: u32, name: &str, sub: Hir) -> Hir {
1430 Hir::capture(hir::Capture {
1431 index,
1432 name: Some(Box::from(name)),
1433 sub: Box::new(sub),
1434 })
1435 }
1436
1437 #[test]
1438 fn ok_literal() {
1439 assert_eq!(p("a"), Hir::char('a'));
1440 assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')]));
1441 assert_eq!(p("💩"), Hir::char('💩'));
1442 }
1443
1444 #[test]
1445 fn ok_meta_escapes() {
1446 assert_eq!(p(r"\*"), Hir::char('*'));
1447 assert_eq!(p(r"\+"), Hir::char('+'));
1448 assert_eq!(p(r"\?"), Hir::char('?'));
1449 assert_eq!(p(r"\|"), Hir::char('|'));
1450 assert_eq!(p(r"\("), Hir::char('('));
1451 assert_eq!(p(r"\)"), Hir::char(')'));
1452 assert_eq!(p(r"\^"), Hir::char('^'));
1453 assert_eq!(p(r"\$"), Hir::char('$'));
1454 assert_eq!(p(r"\["), Hir::char('['));
1455 assert_eq!(p(r"\]"), Hir::char(']'));
1456 }
1457
1458 #[test]
1459 fn ok_special_escapes() {
1460 assert_eq!(p(r"\a"), Hir::char('\x07'));
1461 assert_eq!(p(r"\f"), Hir::char('\x0C'));
1462 assert_eq!(p(r"\t"), Hir::char('\t'));
1463 assert_eq!(p(r"\n"), Hir::char('\n'));
1464 assert_eq!(p(r"\r"), Hir::char('\r'));
1465 assert_eq!(p(r"\v"), Hir::char('\x0B'));
1466 assert_eq!(p(r"\A"), Hir::look(hir::Look::Start));
1467 assert_eq!(p(r"\z"), Hir::look(hir::Look::End));
1468 assert_eq!(p(r"\b"), Hir::look(hir::Look::Word));
1469 assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate));
1470 }
1471
1472 #[test]
1473 fn ok_hex() {
1474 // fixed length
1475 assert_eq!(p(r"\x41"), Hir::char('A'));
1476 assert_eq!(p(r"\u2603"), Hir::char('☃'));
1477 assert_eq!(p(r"\U0001F4A9"), Hir::char('💩'));
1478 // braces
1479 assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩'));
1480 assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩'));
1481 assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩'));
1482 }
1483
1484 #[test]
1485 fn ok_perl() {
1486 assert_eq!(p(r"\d"), posix("digit"));
1487 assert_eq!(p(r"\s"), posix("space"));
1488 assert_eq!(p(r"\w"), posix("word"));
1489
1490 let negated = |name| {
1491 let mut class = hir::Class::new(posix_class(name).unwrap());
1492 class.negate();
1493 Hir::class(class)
1494 };
1495 assert_eq!(p(r"\D"), negated("digit"));
1496 assert_eq!(p(r"\S"), negated("space"));
1497 assert_eq!(p(r"\W"), negated("word"));
1498 }
1499
1500 #[test]
1501 fn ok_flags_and_primitives() {
1502 assert_eq!(p(r"a"), Hir::char('a'));
1503 assert_eq!(p(r"(?i:a)"), singles(['A', 'a']));
1504
1505 assert_eq!(p(r"^"), Hir::look(hir::Look::Start));
1506 assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF));
1507 assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF));
1508
1509 assert_eq!(p(r"$"), Hir::look(hir::Look::End));
1510 assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF));
1511 assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF));
1512
1513 assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')]));
1514 assert_eq!(
1515 p(r"(?R:.)"),
1516 class([
1517 ('\x00', '\x09'),
1518 ('\x0B', '\x0C'),
1519 ('\x0E', '\u{10FFFF}'),
1520 ])
1521 );
1522 assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')]));
1523 assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')]));
1524 }
1525
1526 #[test]
1527 fn ok_alternate() {
1528 assert_eq!(
1529 p(r"a|b"),
1530 Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1531 );
1532 assert_eq!(
1533 p(r"(?:a|b)"),
1534 Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1535 );
1536
1537 assert_eq!(
1538 p(r"(a|b)"),
1539 cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')]))
1540 );
1541 assert_eq!(
1542 p(r"(?<foo>a|b)"),
1543 named_cap(
1544 1,
1545 "foo",
1546 Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1547 )
1548 );
1549
1550 assert_eq!(
1551 p(r"a|b|c"),
1552 Hir::alternation(vec![
1553 Hir::char('a'),
1554 Hir::char('b'),
1555 Hir::char('c')
1556 ])
1557 );
1558
1559 assert_eq!(
1560 p(r"ax|by|cz"),
1561 Hir::alternation(vec![
1562 Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1563 Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1564 Hir::concat(vec![Hir::char('c'), Hir::char('z')]),
1565 ])
1566 );
1567 assert_eq!(
1568 p(r"(ax|(by|(cz)))"),
1569 cap(
1570 1,
1571 Hir::alternation(vec![
1572 Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1573 cap(
1574 2,
1575 Hir::alternation(vec![
1576 Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1577 cap(
1578 3,
1579 Hir::concat(vec![
1580 Hir::char('c'),
1581 Hir::char('z')
1582 ])
1583 ),
1584 ])
1585 ),
1586 ])
1587 )
1588 );
1589
1590 assert_eq!(
1591 p(r"|"),
1592 Hir::alternation(vec![Hir::empty(), Hir::empty()])
1593 );
1594 assert_eq!(
1595 p(r"||"),
1596 Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()])
1597 );
1598
1599 assert_eq!(
1600 p(r"a|"),
1601 Hir::alternation(vec![Hir::char('a'), Hir::empty()])
1602 );
1603 assert_eq!(
1604 p(r"|a"),
1605 Hir::alternation(vec![Hir::empty(), Hir::char('a')])
1606 );
1607
1608 assert_eq!(
1609 p(r"(|)"),
1610 cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()]))
1611 );
1612 assert_eq!(
1613 p(r"(a|)"),
1614 cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()]))
1615 );
1616 assert_eq!(
1617 p(r"(|a)"),
1618 cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')]))
1619 );
1620 }
1621
1622 #[test]
1623 fn ok_flag_group() {
1624 assert_eq!(
1625 p("a(?i:b)"),
1626 Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1627 );
1628 }
1629
1630 #[test]
1631 fn ok_flag_directive() {
1632 assert_eq!(p("(?i)a"), singles(['A', 'a']));
1633 assert_eq!(p("a(?i)"), Hir::char('a'));
1634 assert_eq!(
1635 p("a(?i)b"),
1636 Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1637 );
1638 assert_eq!(
1639 p("a(?i)a(?-i)a"),
1640 Hir::concat(vec![
1641 Hir::char('a'),
1642 singles(['A', 'a']),
1643 Hir::char('a'),
1644 ])
1645 );
1646 assert_eq!(
1647 p("a(?:(?i)a)a"),
1648 Hir::concat(vec![
1649 Hir::char('a'),
1650 singles(['A', 'a']),
1651 Hir::char('a'),
1652 ])
1653 );
1654 assert_eq!(
1655 p("a((?i)a)a"),
1656 Hir::concat(vec![
1657 Hir::char('a'),
1658 cap(1, singles(['A', 'a'])),
1659 Hir::char('a'),
1660 ])
1661 );
1662 }
1663
1664 #[test]
1665 fn ok_uncounted_repetition() {
1666 assert_eq!(
1667 p(r"a?"),
1668 Hir::repetition(hir::Repetition {
1669 min: 0,
1670 max: Some(1),
1671 greedy: true,
1672 sub: Box::new(Hir::char('a')),
1673 }),
1674 );
1675 assert_eq!(
1676 p(r"a*"),
1677 Hir::repetition(hir::Repetition {
1678 min: 0,
1679 max: None,
1680 greedy: true,
1681 sub: Box::new(Hir::char('a')),
1682 }),
1683 );
1684 assert_eq!(
1685 p(r"a+"),
1686 Hir::repetition(hir::Repetition {
1687 min: 1,
1688 max: None,
1689 greedy: true,
1690 sub: Box::new(Hir::char('a')),
1691 }),
1692 );
1693
1694 assert_eq!(
1695 p(r"a??"),
1696 Hir::repetition(hir::Repetition {
1697 min: 0,
1698 max: Some(1),
1699 greedy: false,
1700 sub: Box::new(Hir::char('a')),
1701 }),
1702 );
1703 assert_eq!(
1704 p(r"a*?"),
1705 Hir::repetition(hir::Repetition {
1706 min: 0,
1707 max: None,
1708 greedy: false,
1709 sub: Box::new(Hir::char('a')),
1710 }),
1711 );
1712 assert_eq!(
1713 p(r"a+?"),
1714 Hir::repetition(hir::Repetition {
1715 min: 1,
1716 max: None,
1717 greedy: false,
1718 sub: Box::new(Hir::char('a')),
1719 }),
1720 );
1721
1722 assert_eq!(
1723 p(r"a?b"),
1724 Hir::concat(vec![
1725 Hir::repetition(hir::Repetition {
1726 min: 0,
1727 max: Some(1),
1728 greedy: true,
1729 sub: Box::new(Hir::char('a')),
1730 }),
1731 Hir::char('b'),
1732 ]),
1733 );
1734
1735 assert_eq!(
1736 p(r"ab?"),
1737 Hir::concat(vec![
1738 Hir::char('a'),
1739 Hir::repetition(hir::Repetition {
1740 min: 0,
1741 max: Some(1),
1742 greedy: true,
1743 sub: Box::new(Hir::char('b')),
1744 }),
1745 ]),
1746 );
1747
1748 assert_eq!(
1749 p(r"(?:ab)?"),
1750 Hir::repetition(hir::Repetition {
1751 min: 0,
1752 max: Some(1),
1753 greedy: true,
1754 sub: Box::new(Hir::concat(vec![
1755 Hir::char('a'),
1756 Hir::char('b')
1757 ])),
1758 }),
1759 );
1760
1761 assert_eq!(
1762 p(r"(ab)?"),
1763 Hir::repetition(hir::Repetition {
1764 min: 0,
1765 max: Some(1),
1766 greedy: true,
1767 sub: Box::new(cap(
1768 1,
1769 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1770 )),
1771 }),
1772 );
1773
1774 assert_eq!(
1775 p(r"|a?"),
1776 Hir::alternation(vec![
1777 Hir::empty(),
1778 Hir::repetition(hir::Repetition {
1779 min: 0,
1780 max: Some(1),
1781 greedy: true,
1782 sub: Box::new(Hir::char('a')),
1783 })
1784 ]),
1785 );
1786 }
1787
1788 #[test]
1789 fn ok_counted_repetition() {
1790 assert_eq!(
1791 p(r"a{5}"),
1792 Hir::repetition(hir::Repetition {
1793 min: 5,
1794 max: Some(5),
1795 greedy: true,
1796 sub: Box::new(Hir::char('a')),
1797 }),
1798 );
1799 assert_eq!(
1800 p(r"a{5}?"),
1801 Hir::repetition(hir::Repetition {
1802 min: 5,
1803 max: Some(5),
1804 greedy: false,
1805 sub: Box::new(Hir::char('a')),
1806 }),
1807 );
1808
1809 assert_eq!(
1810 p(r"a{5,}"),
1811 Hir::repetition(hir::Repetition {
1812 min: 5,
1813 max: None,
1814 greedy: true,
1815 sub: Box::new(Hir::char('a')),
1816 }),
1817 );
1818
1819 assert_eq!(
1820 p(r"a{5,9}"),
1821 Hir::repetition(hir::Repetition {
1822 min: 5,
1823 max: Some(9),
1824 greedy: true,
1825 sub: Box::new(Hir::char('a')),
1826 }),
1827 );
1828
1829 assert_eq!(
1830 p(r"ab{5}c"),
1831 Hir::concat(vec![
1832 Hir::char('a'),
1833 Hir::repetition(hir::Repetition {
1834 min: 5,
1835 max: Some(5),
1836 greedy: true,
1837 sub: Box::new(Hir::char('b')),
1838 }),
1839 Hir::char('c'),
1840 ]),
1841 );
1842
1843 assert_eq!(
1844 p(r"a{ 5 }"),
1845 Hir::repetition(hir::Repetition {
1846 min: 5,
1847 max: Some(5),
1848 greedy: true,
1849 sub: Box::new(Hir::char('a')),
1850 }),
1851 );
1852 assert_eq!(
1853 p(r"a{ 5 , 9 }"),
1854 Hir::repetition(hir::Repetition {
1855 min: 5,
1856 max: Some(9),
1857 greedy: true,
1858 sub: Box::new(Hir::char('a')),
1859 }),
1860 );
1861 }
1862
1863 #[test]
1864 fn ok_group_unnamed() {
1865 assert_eq!(p("(a)"), cap(1, Hir::char('a')));
1866 assert_eq!(
1867 p("(ab)"),
1868 cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')]))
1869 );
1870 }
1871
1872 #[test]
1873 fn ok_group_named() {
1874 assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1875 assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a')));
1876
1877 assert_eq!(
1878 p("(?P<foo>ab)"),
1879 named_cap(
1880 1,
1881 "foo",
1882 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1883 )
1884 );
1885 assert_eq!(
1886 p("(?<foo>ab)"),
1887 named_cap(
1888 1,
1889 "foo",
1890 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1891 )
1892 );
1893
1894 assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z')));
1895 assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z')));
1896
1897 assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1898 assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
1899
1900 assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1901 assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
1902
1903 assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1904 assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
1905
1906 assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1907 assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
1908
1909 assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1910 assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z')));
1911 }
1912
1913 #[test]
1914 fn ok_class() {
1915 assert_eq!(p(r"[a]"), singles(['a']));
1916 assert_eq!(p(r"[a\]]"), singles(['a', ']']));
1917 assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z']));
1918 assert_eq!(p(r"[ab]"), class([('a', 'b')]));
1919 assert_eq!(p(r"[a-]"), singles(['a', '-']));
1920 assert_eq!(p(r"[-a]"), singles(['a', '-']));
1921 assert_eq!(p(r"[--a]"), singles(['a', '-']));
1922 assert_eq!(p(r"[---a]"), singles(['a', '-']));
1923 assert_eq!(p(r"[[:alnum:]]"), posix("alnum"));
1924 assert_eq!(p(r"[\w]"), posix("word"));
1925 assert_eq!(p(r"[a\wz]"), posix("word"));
1926 assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')]));
1927 assert_eq!(p(r"[^\s\S]"), Hir::fail());
1928 assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')]));
1929 assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')]));
1930 assert_eq!(p(r"[]]"), singles([']']));
1931 assert_eq!(p(r"[]a]"), singles([']', 'a']));
1932 assert_eq!(p(r"[]\[]"), singles(['[', ']']));
1933 assert_eq!(p(r"[\[]"), singles(['[']));
1934
1935 assert_eq!(p(r"(?i)[a]"), singles(['A', 'a']));
1936 assert_eq!(p(r"(?i)[A]"), singles(['A', 'a']));
1937 assert_eq!(p(r"(?i)[k]"), singles(['K', 'k']));
1938 assert_eq!(p(r"(?i)[s]"), singles(['S', 's']));
1939 assert_eq!(p(r"(?i)[β]"), singles(['β']));
1940
1941 assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')]));
1942 assert_eq!(
1943 p(r"[^-a]"),
1944 class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')])
1945 );
1946
1947 assert_eq!(
1948 p(r"[-]a]"),
1949 Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')])
1950 );
1951 }
1952
1953 #[test]
1954 fn ok_verbatim() {
1955 assert_eq!(
1956 p(r"(?x)a{5,9} ?"),
1957 Hir::repetition(hir::Repetition {
1958 min: 5,
1959 max: Some(9),
1960 greedy: false,
1961 sub: Box::new(Hir::char('a')),
1962 })
1963 );
1964 assert_eq!(p(r"(?x)[ a]"), singles(['a']));
1965 assert_eq!(
1966 p(r"(?x)[ ^ a]"),
1967 class([('\x00', '`'), ('b', '\u{10FFFF}')])
1968 );
1969 assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-']));
1970 assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a']));
1971
1972 assert_eq!(
1973 p(r"(?x)a b"),
1974 Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1975 );
1976 assert_eq!(
1977 p(r"(?x)a b(?-x)a b"),
1978 Hir::concat(vec![
1979 Hir::char('a'),
1980 Hir::char('b'),
1981 Hir::char('a'),
1982 Hir::char(' '),
1983 Hir::char('b'),
1984 ])
1985 );
1986 assert_eq!(
1987 p(r"a (?x:a )a "),
1988 Hir::concat(vec![
1989 Hir::char('a'),
1990 Hir::char(' '),
1991 Hir::char('a'),
1992 Hir::char('a'),
1993 Hir::char(' '),
1994 ])
1995 );
1996 assert_eq!(
1997 p(r"(?x)( ?P<foo> a )"),
1998 named_cap(1, "foo", Hir::char('a')),
1999 );
2000 assert_eq!(p(r"(?x)( a )"), cap(1, Hir::char('a')));
2001 assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a'));
2002 assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53'));
2003 assert_eq!(p(r"(?x)\ "), Hir::char(' '));
2004 }
2005
2006 #[test]
2007 fn ok_comments() {
2008 let pat = "(?x)
2009# This is comment 1.
2010foo # This is comment 2.
2011 # This is comment 3.
2012bar
2013# This is comment 4.";
2014 assert_eq!(
2015 p(pat),
2016 Hir::concat(vec![
2017 Hir::char('f'),
2018 Hir::char('o'),
2019 Hir::char('o'),
2020 Hir::char('b'),
2021 Hir::char('a'),
2022 Hir::char('r'),
2023 ])
2024 );
2025 }
2026
2027 #[test]
2028 fn err_standard() {
2029 assert_eq!(
2030 ERR_TOO_MUCH_NESTING,
2031 perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"),
2032 );
2033 // This one is tricky, because the only way it can happen is if the
2034 // number of captures overflows u32. Perhaps we should allow setting a
2035 // lower limit?
2036 // assert_eq!(ERR_TOO_MANY_CAPTURES, perr(""));
2037 assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)"));
2038 assert_eq!(ERR_UNCLOSED_GROUP, perr("("));
2039 assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?"));
2040 assert_eq!(ERR_UNOPENED_GROUP, perr(")"));
2041 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)"));
2042 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)"));
2043 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)"));
2044 assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)"));
2045 assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)"));
2046 assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<"));
2047 assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<"));
2048 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)"));
2049 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)"));
2050 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)"));
2051 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)"));
2052 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)"));
2053 assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)"));
2054 assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo"));
2055 assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo"));
2056 assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)"));
2057 assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)"));
2058 assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)"));
2059 assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)"));
2060 assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)"));
2061 assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)"));
2062 assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is"));
2063 assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)"));
2064 assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}"));
2065 assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{"));
2066 assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A"));
2067 assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}"));
2068 assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}"));
2069 assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA"));
2070 assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ"));
2071 assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA"));
2072 assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ"));
2073 assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800"));
2074 assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF"));
2075 assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x"));
2076 assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\"));
2077 assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0"));
2078 assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1"));
2079 assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8"));
2080 assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
2081 assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
2082 assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
2083 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
2084 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
2085 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
2086 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)"));
2087 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?"));
2088 assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?"));
2089 assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}"));
2090 assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})"));
2091 assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}"));
2092 assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{"));
2093 assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5"));
2094 assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,"));
2095 assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6"));
2096 assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z"));
2097 assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}"));
2098 assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}"));
2099 assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}"));
2100 assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}"));
2101 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a"));
2102 assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]"));
2103 assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]"));
2104 assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]"));
2105 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-"));
2106 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^"));
2107 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]"));
2108 assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]"));
2109 assert_eq!(ERR_CLASS_UNCLOSED, perr(r"["));
2110 assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z"));
2111 assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]"));
2112 assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]"));
2113 assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
2114 assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
2115 assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
2116 assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
2117 assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
2118 assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
2119 assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
2120 assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
2121 }
2122
2123 #[test]
2124 fn err_verbatim() {
2125 // See: https://github.com/rust-lang/regex/issues/792
2126 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]"));
2127 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a "));
2128 assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- "));
2129 assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ "));
2130 }
2131
2132 // This tests a bug fix where the nest limit checker wasn't decrementing
2133 // its depth during post-traversal, which causes long regexes to trip
2134 // the default limit too aggressively.
2135 #[test]
2136 fn regression_454_nest_too_big() {
2137 let pattern = r#"
2138 2(?:
2139 [45]\d{3}|
2140 7(?:
2141 1[0-267]|
2142 2[0-289]|
2143 3[0-29]|
2144 4[01]|
2145 5[1-3]|
2146 6[013]|
2147 7[0178]|
2148 91
2149 )|
2150 8(?:
2151 0[125]|
2152 [139][1-6]|
2153 2[0157-9]|
2154 41|
2155 6[1-35]|
2156 7[1-5]|
2157 8[1-8]|
2158 90
2159 )|
2160 9(?:
2161 0[0-2]|
2162 1[0-4]|
2163 2[568]|
2164 3[3-6]|
2165 5[5-7]|
2166 6[0167]|
2167 7[15]|
2168 8[0146-9]
2169 )
2170 )\d{4}
2171 "#;
2172 p(pattern);
2173 }
2174
2175 // This tests that we treat a trailing `-` in a character class as a
2176 // literal `-` even when whitespace mode is enabled and there is whitespace
2177 // after the trailing `-`.
2178 #[test]
2179 fn regression_455_trailing_dash_ignore_whitespace() {
2180 p("(?x)[ / - ]");
2181 p("(?x)[ a - ]");
2182 p("(?x)[
2183 a
2184 - ]
2185 ");
2186 p("(?x)[
2187 a # wat
2188 - ]
2189 ");
2190
2191 perr("(?x)[ / -");
2192 perr("(?x)[ / - ");
2193 perr(
2194 "(?x)[
2195 / -
2196 ",
2197 );
2198 perr(
2199 "(?x)[
2200 / - # wat
2201 ",
2202 );
2203 }
2204
2205 #[test]
2206 fn regression_capture_indices() {
2207 let got = p(r"(a|ab|c|bcd){4,10}(d*)");
2208 assert_eq!(
2209 got,
2210 Hir::concat(vec![
2211 Hir::repetition(hir::Repetition {
2212 min: 4,
2213 max: Some(10),
2214 greedy: true,
2215 sub: Box::new(cap(
2216 1,
2217 Hir::alternation(vec![
2218 Hir::char('a'),
2219 Hir::concat(vec![Hir::char('a'), Hir::char('b')]),
2220 Hir::char('c'),
2221 Hir::concat(vec![
2222 Hir::char('b'),
2223 Hir::char('c'),
2224 Hir::char('d')
2225 ]),
2226 ])
2227 ))
2228 }),
2229 cap(
2230 2,
2231 Hir::repetition(hir::Repetition {
2232 min: 0,
2233 max: None,
2234 greedy: true,
2235 sub: Box::new(Hir::char('d')),
2236 })
2237 ),
2238 ])
2239 );
2240 }
2241}
2242