1/*!
2This module provides a regular expression parser.
3*/
4
5use std::borrow::Borrow;
6use std::cell::{Cell, RefCell};
7use std::mem;
8use std::result;
9
10use crate::ast::{self, Ast, Position, Span};
11use crate::either::Either;
12
13use crate::is_meta_character;
14
15type Result<T> = result::Result<T, ast::Error>;
16
17/// A primitive is an expression with no sub-expressions. This includes
18/// literals, assertions and non-set character classes. This representation
19/// is used as intermediate state in the parser.
20///
21/// This does not include ASCII character classes, since they can only appear
22/// within a set character class.
23#[derive(Clone, Debug, Eq, PartialEq)]
24enum Primitive {
25 Literal(ast::Literal),
26 Assertion(ast::Assertion),
27 Dot(Span),
28 Perl(ast::ClassPerl),
29 Unicode(ast::ClassUnicode),
30}
31
32impl Primitive {
33 /// Return the span of this primitive.
34 fn span(&self) -> &Span {
35 match *self {
36 Primitive::Literal(ref x) => &x.span,
37 Primitive::Assertion(ref x) => &x.span,
38 Primitive::Dot(ref span) => span,
39 Primitive::Perl(ref x) => &x.span,
40 Primitive::Unicode(ref x) => &x.span,
41 }
42 }
43
44 /// Convert this primitive into a proper AST.
45 fn into_ast(self) -> Ast {
46 match self {
47 Primitive::Literal(lit) => Ast::Literal(lit),
48 Primitive::Assertion(assert) => Ast::Assertion(assert),
49 Primitive::Dot(span) => Ast::Dot(span),
50 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
51 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
52 }
53 }
54
55 /// Convert this primitive into an item in a character class.
56 ///
57 /// If this primitive is not a legal item (i.e., an assertion or a dot),
58 /// then return an error.
59 fn into_class_set_item<P: Borrow<Parser>>(
60 self,
61 p: &ParserI<'_, P>,
62 ) -> Result<ast::ClassSetItem> {
63 use self::Primitive::*;
64 use crate::ast::ClassSetItem;
65
66 match self {
67 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
68 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
69 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
70 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
71 }
72 }
73
74 /// Convert this primitive into a literal in a character class. In
75 /// particular, literals are the only valid items that can appear in
76 /// ranges.
77 ///
78 /// If this primitive is not a legal item (i.e., a class, assertion or a
79 /// dot), then return an error.
80 fn into_class_literal<P: Borrow<Parser>>(
81 self,
82 p: &ParserI<'_, P>,
83 ) -> Result<ast::Literal> {
84 use self::Primitive::*;
85
86 match self {
87 Literal(lit) => Ok(lit),
88 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
89 }
90 }
91}
92
93/// Returns true if the given character is a hexadecimal digit.
94fn is_hex(c: char) -> bool {
95 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
96}
97
98/// Returns true if the given character is a valid in a capture group name.
99///
100/// If `first` is true, then `c` is treated as the first character in the
101/// group name (which must be alphabetic or underscore).
102fn is_capture_char(c: char, first: bool) -> bool {
103 c == '_'
104 || (!first
105 && (('0' <= c && c <= '9') || c == '.' || c == '[' || c == ']'))
106 || ('A' <= c && c <= 'Z')
107 || ('a' <= c && c <= 'z')
108}
109
110/// A builder for a regular expression parser.
111///
112/// This builder permits modifying configuration options for the parser.
113#[derive(Clone, Debug)]
114pub struct ParserBuilder {
115 ignore_whitespace: bool,
116 nest_limit: u32,
117 octal: bool,
118}
119
120impl Default for ParserBuilder {
121 fn default() -> ParserBuilder {
122 ParserBuilder::new()
123 }
124}
125
126impl ParserBuilder {
127 /// Create a new parser builder with a default configuration.
128 pub fn new() -> ParserBuilder {
129 ParserBuilder {
130 ignore_whitespace: false,
131 nest_limit: 250,
132 octal: false,
133 }
134 }
135
136 /// Build a parser from this configuration with the given pattern.
137 pub fn build(&self) -> Parser {
138 Parser {
139 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
140 capture_index: Cell::new(0),
141 nest_limit: self.nest_limit,
142 octal: self.octal,
143 initial_ignore_whitespace: self.ignore_whitespace,
144 ignore_whitespace: Cell::new(self.ignore_whitespace),
145 comments: RefCell::new(vec![]),
146 stack_group: RefCell::new(vec![]),
147 stack_class: RefCell::new(vec![]),
148 capture_names: RefCell::new(vec![]),
149 scratch: RefCell::new(String::new()),
150 }
151 }
152
153 /// Set the nesting limit for this parser.
154 ///
155 /// The nesting limit controls how deep the abstract syntax tree is allowed
156 /// to be. If the AST exceeds the given limit (e.g., with too many nested
157 /// groups), then an error is returned by the parser.
158 ///
159 /// The purpose of this limit is to act as a heuristic to prevent stack
160 /// overflow for consumers that do structural induction on an `Ast` using
161 /// explicit recursion. While this crate never does this (instead using
162 /// constant stack space and moving the call stack to the heap), other
163 /// crates may.
164 ///
165 /// This limit is not checked until the entire Ast is parsed. Therefore,
166 /// if callers want to put a limit on the amount of heap space used, then
167 /// they should impose a limit on the length, in bytes, of the concrete
168 /// pattern string. In particular, this is viable since this parser
169 /// implementation will limit itself to heap space proportional to the
170 /// length of the pattern string.
171 ///
172 /// Note that a nest limit of `0` will return a nest limit error for most
173 /// patterns but not all. For example, a nest limit of `0` permits `a` but
174 /// not `ab`, since `ab` requires a concatenation, which results in a nest
175 /// depth of `1`. In general, a nest limit is not something that manifests
176 /// in an obvious way in the concrete syntax, therefore, it should not be
177 /// used in a granular way.
178 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
179 self.nest_limit = limit;
180 self
181 }
182
183 /// Whether to support octal syntax or not.
184 ///
185 /// Octal syntax is a little-known way of uttering Unicode codepoints in
186 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
187 /// `\141` are all equivalent regular expressions, where the last example
188 /// shows octal syntax.
189 ///
190 /// While supporting octal syntax isn't in and of itself a problem, it does
191 /// make good error messages harder. That is, in PCRE based regex engines,
192 /// syntax like `\0` invokes a backreference, which is explicitly
193 /// unsupported in Rust's regex engine. However, many users expect it to
194 /// be supported. Therefore, when octal support is disabled, the error
195 /// message will explicitly mention that backreferences aren't supported.
196 ///
197 /// Octal syntax is disabled by default.
198 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
199 self.octal = yes;
200 self
201 }
202
203 /// Enable verbose mode in the regular expression.
204 ///
205 /// When enabled, verbose mode permits insignificant whitespace in many
206 /// places in the regular expression, as well as comments. Comments are
207 /// started using `#` and continue until the end of the line.
208 ///
209 /// By default, this is disabled. It may be selectively enabled in the
210 /// regular expression by using the `x` flag regardless of this setting.
211 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
212 self.ignore_whitespace = yes;
213 self
214 }
215}
216
217/// A regular expression parser.
218///
219/// This parses a string representation of a regular expression into an
220/// abstract syntax tree. The size of the tree is proportional to the length
221/// of the regular expression pattern.
222///
223/// A `Parser` can be configured in more detail via a
224/// [`ParserBuilder`](struct.ParserBuilder.html).
225#[derive(Clone, Debug)]
226pub struct Parser {
227 /// The current position of the parser.
228 pos: Cell<Position>,
229 /// The current capture index.
230 capture_index: Cell<u32>,
231 /// The maximum number of open parens/brackets allowed. If the parser
232 /// exceeds this number, then an error is returned.
233 nest_limit: u32,
234 /// Whether to support octal syntax or not. When `false`, the parser will
235 /// return an error helpfully pointing out that backreferences are not
236 /// supported.
237 octal: bool,
238 /// The initial setting for `ignore_whitespace` as provided by
239 /// `ParserBuilder`. It is used when resetting the parser's state.
240 initial_ignore_whitespace: bool,
241 /// Whether whitespace should be ignored. When enabled, comments are
242 /// also permitted.
243 ignore_whitespace: Cell<bool>,
244 /// A list of comments, in order of appearance.
245 comments: RefCell<Vec<ast::Comment>>,
246 /// A stack of grouped sub-expressions, including alternations.
247 stack_group: RefCell<Vec<GroupState>>,
248 /// A stack of nested character classes. This is only non-empty when
249 /// parsing a class.
250 stack_class: RefCell<Vec<ClassState>>,
251 /// A sorted sequence of capture names. This is used to detect duplicate
252 /// capture names and report an error if one is detected.
253 capture_names: RefCell<Vec<ast::CaptureName>>,
254 /// A scratch buffer used in various places. Mostly this is used to
255 /// accumulate relevant characters from parts of a pattern.
256 scratch: RefCell<String>,
257}
258
259/// ParserI is the internal parser implementation.
260///
261/// We use this separate type so that we can carry the provided pattern string
262/// along with us. In particular, a `Parser` internal state is not tied to any
263/// one pattern, but `ParserI` is.
264///
265/// This type also lets us use `ParserI<&Parser>` in production code while
266/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
267/// work against the internal interface of the parser.
268#[derive(Clone, Debug)]
269struct ParserI<'s, P> {
270 /// The parser state/configuration.
271 parser: P,
272 /// The full regular expression provided by the user.
273 pattern: &'s str,
274}
275
276/// GroupState represents a single stack frame while parsing nested groups
277/// and alternations. Each frame records the state up to an opening parenthesis
278/// or a alternating bracket `|`.
279#[derive(Clone, Debug)]
280enum GroupState {
281 /// This state is pushed whenever an opening group is found.
282 Group {
283 /// The concatenation immediately preceding the opening group.
284 concat: ast::Concat,
285 /// The group that has been opened. Its sub-AST is always empty.
286 group: ast::Group,
287 /// Whether this group has the `x` flag enabled or not.
288 ignore_whitespace: bool,
289 },
290 /// This state is pushed whenever a new alternation branch is found. If
291 /// an alternation branch is found and this state is at the top of the
292 /// stack, then this state should be modified to include the new
293 /// alternation.
294 Alternation(ast::Alternation),
295}
296
297/// ClassState represents a single stack frame while parsing character classes.
298/// Each frame records the state up to an intersection, difference, symmetric
299/// difference or nested class.
300///
301/// Note that a parser's character class stack is only non-empty when parsing
302/// a character class. In all other cases, it is empty.
303#[derive(Clone, Debug)]
304enum ClassState {
305 /// This state is pushed whenever an opening bracket is found.
306 Open {
307 /// The union of class items immediately preceding this class.
308 union: ast::ClassSetUnion,
309 /// The class that has been opened. Typically this just corresponds
310 /// to the `[`, but it can also include `[^` since `^` indicates
311 /// negation of the class.
312 set: ast::ClassBracketed,
313 },
314 /// This state is pushed when a operator is seen. When popped, the stored
315 /// set becomes the left hand side of the operator.
316 Op {
317 /// The type of the operation, i.e., &&, -- or ~~.
318 kind: ast::ClassSetBinaryOpKind,
319 /// The left-hand side of the operator.
320 lhs: ast::ClassSet,
321 },
322}
323
324impl Parser {
325 /// Create a new parser with a default configuration.
326 ///
327 /// The parser can be run with either the `parse` or `parse_with_comments`
328 /// methods. The parse methods return an abstract syntax tree.
329 ///
330 /// To set configuration options on the parser, use
331 /// [`ParserBuilder`](struct.ParserBuilder.html).
332 pub fn new() -> Parser {
333 ParserBuilder::new().build()
334 }
335
336 /// Parse the regular expression into an abstract syntax tree.
337 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
338 ParserI::new(self, pattern).parse()
339 }
340
341 /// Parse the regular expression and return an abstract syntax tree with
342 /// all of the comments found in the pattern.
343 pub fn parse_with_comments(
344 &mut self,
345 pattern: &str,
346 ) -> Result<ast::WithComments> {
347 ParserI::new(self, pattern).parse_with_comments()
348 }
349
350 /// Reset the internal state of a parser.
351 ///
352 /// This is called at the beginning of every parse. This prevents the
353 /// parser from running with inconsistent state (say, if a previous
354 /// invocation returned an error and the parser is reused).
355 fn reset(&self) {
356 // These settings should be in line with the construction
357 // in `ParserBuilder::build`.
358 self.pos.set(Position { offset: 0, line: 1, column: 1 });
359 self.ignore_whitespace.set(self.initial_ignore_whitespace);
360 self.comments.borrow_mut().clear();
361 self.stack_group.borrow_mut().clear();
362 self.stack_class.borrow_mut().clear();
363 }
364}
365
366impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
367 /// Build an internal parser from a parser configuration and a pattern.
368 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
369 ParserI { parser, pattern }
370 }
371
372 /// Return a reference to the parser state.
373 fn parser(&self) -> &Parser {
374 self.parser.borrow()
375 }
376
377 /// Return a reference to the pattern being parsed.
378 fn pattern(&self) -> &str {
379 self.pattern.borrow()
380 }
381
382 /// Create a new error with the given span and error type.
383 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
384 ast::Error { kind, pattern: self.pattern().to_string(), span }
385 }
386
387 /// Return the current offset of the parser.
388 ///
389 /// The offset starts at `0` from the beginning of the regular expression
390 /// pattern string.
391 fn offset(&self) -> usize {
392 self.parser().pos.get().offset
393 }
394
395 /// Return the current line number of the parser.
396 ///
397 /// The line number starts at `1`.
398 fn line(&self) -> usize {
399 self.parser().pos.get().line
400 }
401
402 /// Return the current column of the parser.
403 ///
404 /// The column number starts at `1` and is reset whenever a `\n` is seen.
405 fn column(&self) -> usize {
406 self.parser().pos.get().column
407 }
408
409 /// Return the next capturing index. Each subsequent call increments the
410 /// internal index.
411 ///
412 /// The span given should correspond to the location of the opening
413 /// parenthesis.
414 ///
415 /// If the capture limit is exceeded, then an error is returned.
416 fn next_capture_index(&self, span: Span) -> Result<u32> {
417 let current = self.parser().capture_index.get();
418 let i = current.checked_add(1).ok_or_else(|| {
419 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
420 })?;
421 self.parser().capture_index.set(i);
422 Ok(i)
423 }
424
425 /// Adds the given capture name to this parser. If this capture name has
426 /// already been used, then an error is returned.
427 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
428 let mut names = self.parser().capture_names.borrow_mut();
429 match names
430 .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
431 {
432 Err(i) => {
433 names.insert(i, cap.clone());
434 Ok(())
435 }
436 Ok(i) => Err(self.error(
437 cap.span,
438 ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
439 )),
440 }
441 }
442
443 /// Return whether the parser should ignore whitespace or not.
444 fn ignore_whitespace(&self) -> bool {
445 self.parser().ignore_whitespace.get()
446 }
447
448 /// Return the character at the current position of the parser.
449 ///
450 /// This panics if the current position does not point to a valid char.
451 fn char(&self) -> char {
452 self.char_at(self.offset())
453 }
454
455 /// Return the character at the given position.
456 ///
457 /// This panics if the given position does not point to a valid char.
458 fn char_at(&self, i: usize) -> char {
459 self.pattern()[i..]
460 .chars()
461 .next()
462 .unwrap_or_else(|| panic!("expected char at offset {}", i))
463 }
464
465 /// Bump the parser to the next Unicode scalar value.
466 ///
467 /// If the end of the input has been reached, then `false` is returned.
468 fn bump(&self) -> bool {
469 if self.is_eof() {
470 return false;
471 }
472 let Position { mut offset, mut line, mut column } = self.pos();
473 if self.char() == '\n' {
474 line = line.checked_add(1).unwrap();
475 column = 1;
476 } else {
477 column = column.checked_add(1).unwrap();
478 }
479 offset += self.char().len_utf8();
480 self.parser().pos.set(Position { offset, line, column });
481 self.pattern()[self.offset()..].chars().next().is_some()
482 }
483
484 /// If the substring starting at the current position of the parser has
485 /// the given prefix, then bump the parser to the character immediately
486 /// following the prefix and return true. Otherwise, don't bump the parser
487 /// and return false.
488 fn bump_if(&self, prefix: &str) -> bool {
489 if self.pattern()[self.offset()..].starts_with(prefix) {
490 for _ in 0..prefix.chars().count() {
491 self.bump();
492 }
493 true
494 } else {
495 false
496 }
497 }
498
499 /// Returns true if and only if the parser is positioned at a look-around
500 /// prefix. The conditions under which this returns true must always
501 /// correspond to a regular expression that would otherwise be consider
502 /// invalid.
503 ///
504 /// This should only be called immediately after parsing the opening of
505 /// a group or a set of flags.
506 fn is_lookaround_prefix(&self) -> bool {
507 self.bump_if("?=")
508 || self.bump_if("?!")
509 || self.bump_if("?<=")
510 || self.bump_if("?<!")
511 }
512
513 /// Bump the parser, and if the `x` flag is enabled, bump through any
514 /// subsequent spaces. Return true if and only if the parser is not at
515 /// EOF.
516 fn bump_and_bump_space(&self) -> bool {
517 if !self.bump() {
518 return false;
519 }
520 self.bump_space();
521 !self.is_eof()
522 }
523
524 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
525 /// comments), then this will advance the parser through all whitespace
526 /// and comments to the next non-whitespace non-comment byte.
527 ///
528 /// If the `x` flag is disabled, then this is a no-op.
529 ///
530 /// This should be used selectively throughout the parser where
531 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
532 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
533 fn bump_space(&self) {
534 if !self.ignore_whitespace() {
535 return;
536 }
537 while !self.is_eof() {
538 if self.char().is_whitespace() {
539 self.bump();
540 } else if self.char() == '#' {
541 let start = self.pos();
542 let mut comment_text = String::new();
543 self.bump();
544 while !self.is_eof() {
545 let c = self.char();
546 self.bump();
547 if c == '\n' {
548 break;
549 }
550 comment_text.push(c);
551 }
552 let comment = ast::Comment {
553 span: Span::new(start, self.pos()),
554 comment: comment_text,
555 };
556 self.parser().comments.borrow_mut().push(comment);
557 } else {
558 break;
559 }
560 }
561 }
562
563 /// Peek at the next character in the input without advancing the parser.
564 ///
565 /// If the input has been exhausted, then this returns `None`.
566 fn peek(&self) -> Option<char> {
567 if self.is_eof() {
568 return None;
569 }
570 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
571 }
572
573 /// Like peek, but will ignore spaces when the parser is in whitespace
574 /// insensitive mode.
575 fn peek_space(&self) -> Option<char> {
576 if !self.ignore_whitespace() {
577 return self.peek();
578 }
579 if self.is_eof() {
580 return None;
581 }
582 let mut start = self.offset() + self.char().len_utf8();
583 let mut in_comment = false;
584 for (i, c) in self.pattern()[start..].char_indices() {
585 if c.is_whitespace() {
586 continue;
587 } else if !in_comment && c == '#' {
588 in_comment = true;
589 } else if in_comment && c == '\n' {
590 in_comment = false;
591 } else {
592 start += i;
593 break;
594 }
595 }
596 self.pattern()[start..].chars().next()
597 }
598
599 /// Returns true if the next call to `bump` would return false.
600 fn is_eof(&self) -> bool {
601 self.offset() == self.pattern().len()
602 }
603
604 /// Return the current position of the parser, which includes the offset,
605 /// line and column.
606 fn pos(&self) -> Position {
607 self.parser().pos.get()
608 }
609
610 /// Create a span at the current position of the parser. Both the start
611 /// and end of the span are set.
612 fn span(&self) -> Span {
613 Span::splat(self.pos())
614 }
615
616 /// Create a span that covers the current character.
617 fn span_char(&self) -> Span {
618 let mut next = Position {
619 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
620 line: self.line(),
621 column: self.column().checked_add(1).unwrap(),
622 };
623 if self.char() == '\n' {
624 next.line += 1;
625 next.column = 1;
626 }
627 Span::new(self.pos(), next)
628 }
629
630 /// Parse and push a single alternation on to the parser's internal stack.
631 /// If the top of the stack already has an alternation, then add to that
632 /// instead of pushing a new one.
633 ///
634 /// The concatenation given corresponds to a single alternation branch.
635 /// The concatenation returned starts the next branch and is empty.
636 ///
637 /// This assumes the parser is currently positioned at `|` and will advance
638 /// the parser to the character following `|`.
639 #[inline(never)]
640 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
641 assert_eq!(self.char(), '|');
642 concat.span.end = self.pos();
643 self.push_or_add_alternation(concat);
644 self.bump();
645 Ok(ast::Concat { span: self.span(), asts: vec![] })
646 }
647
648 /// Pushes or adds the given branch of an alternation to the parser's
649 /// internal stack of state.
650 fn push_or_add_alternation(&self, concat: ast::Concat) {
651 use self::GroupState::*;
652
653 let mut stack = self.parser().stack_group.borrow_mut();
654 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
655 alts.asts.push(concat.into_ast());
656 return;
657 }
658 stack.push(Alternation(ast::Alternation {
659 span: Span::new(concat.span.start, self.pos()),
660 asts: vec![concat.into_ast()],
661 }));
662 }
663
664 /// Parse and push a group AST (and its parent concatenation) on to the
665 /// parser's internal stack. Return a fresh concatenation corresponding
666 /// to the group's sub-AST.
667 ///
668 /// If a set of flags was found (with no group), then the concatenation
669 /// is returned with that set of flags added.
670 ///
671 /// This assumes that the parser is currently positioned on the opening
672 /// parenthesis. It advances the parser to the character at the start
673 /// of the sub-expression (or adjoining expression).
674 ///
675 /// If there was a problem parsing the start of the group, then an error
676 /// is returned.
677 #[inline(never)]
678 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
679 assert_eq!(self.char(), '(');
680 match self.parse_group()? {
681 Either::Left(set) => {
682 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
683 if let Some(v) = ignore {
684 self.parser().ignore_whitespace.set(v);
685 }
686
687 concat.asts.push(Ast::Flags(set));
688 Ok(concat)
689 }
690 Either::Right(group) => {
691 let old_ignore_whitespace = self.ignore_whitespace();
692 let new_ignore_whitespace = group
693 .flags()
694 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
695 .unwrap_or(old_ignore_whitespace);
696 self.parser().stack_group.borrow_mut().push(
697 GroupState::Group {
698 concat,
699 group,
700 ignore_whitespace: old_ignore_whitespace,
701 },
702 );
703 self.parser().ignore_whitespace.set(new_ignore_whitespace);
704 Ok(ast::Concat { span: self.span(), asts: vec![] })
705 }
706 }
707 }
708
709 /// Pop a group AST from the parser's internal stack and set the group's
710 /// AST to the given concatenation. Return the concatenation containing
711 /// the group.
712 ///
713 /// This assumes that the parser is currently positioned on the closing
714 /// parenthesis and advances the parser to the character following the `)`.
715 ///
716 /// If no such group could be popped, then an unopened group error is
717 /// returned.
718 #[inline(never)]
719 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
720 use self::GroupState::*;
721
722 assert_eq!(self.char(), ')');
723 let mut stack = self.parser().stack_group.borrow_mut();
724 let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
725 .pop()
726 {
727 Some(Group { concat, group, ignore_whitespace }) => {
728 (concat, group, ignore_whitespace, None)
729 }
730 Some(Alternation(alt)) => match stack.pop() {
731 Some(Group { concat, group, ignore_whitespace }) => {
732 (concat, group, ignore_whitespace, Some(alt))
733 }
734 None | Some(Alternation(_)) => {
735 return Err(self.error(
736 self.span_char(),
737 ast::ErrorKind::GroupUnopened,
738 ));
739 }
740 },
741 None => {
742 return Err(self
743 .error(self.span_char(), ast::ErrorKind::GroupUnopened));
744 }
745 };
746 self.parser().ignore_whitespace.set(ignore_whitespace);
747 group_concat.span.end = self.pos();
748 self.bump();
749 group.span.end = self.pos();
750 match alt {
751 Some(mut alt) => {
752 alt.span.end = group_concat.span.end;
753 alt.asts.push(group_concat.into_ast());
754 group.ast = Box::new(alt.into_ast());
755 }
756 None => {
757 group.ast = Box::new(group_concat.into_ast());
758 }
759 }
760 prior_concat.asts.push(Ast::Group(group));
761 Ok(prior_concat)
762 }
763
764 /// Pop the last state from the parser's internal stack, if it exists, and
765 /// add the given concatenation to it. There either must be no state or a
766 /// single alternation item on the stack. Any other scenario produces an
767 /// error.
768 ///
769 /// This assumes that the parser has advanced to the end.
770 #[inline(never)]
771 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
772 concat.span.end = self.pos();
773 let mut stack = self.parser().stack_group.borrow_mut();
774 let ast = match stack.pop() {
775 None => Ok(concat.into_ast()),
776 Some(GroupState::Alternation(mut alt)) => {
777 alt.span.end = self.pos();
778 alt.asts.push(concat.into_ast());
779 Ok(Ast::Alternation(alt))
780 }
781 Some(GroupState::Group { group, .. }) => {
782 return Err(
783 self.error(group.span, ast::ErrorKind::GroupUnclosed)
784 );
785 }
786 };
787 // If we try to pop again, there should be nothing.
788 match stack.pop() {
789 None => ast,
790 Some(GroupState::Alternation(_)) => {
791 // This unreachable is unfortunate. This case can't happen
792 // because the only way we can be here is if there were two
793 // `GroupState::Alternation`s adjacent in the parser's stack,
794 // which we guarantee to never happen because we never push a
795 // `GroupState::Alternation` if one is already at the top of
796 // the stack.
797 unreachable!()
798 }
799 Some(GroupState::Group { group, .. }) => {
800 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
801 }
802 }
803 }
804
805 /// Parse the opening of a character class and push the current class
806 /// parsing context onto the parser's stack. This assumes that the parser
807 /// is positioned at an opening `[`. The given union should correspond to
808 /// the union of set items built up before seeing the `[`.
809 ///
810 /// If there was a problem parsing the opening of the class, then an error
811 /// is returned. Otherwise, a new union of set items for the class is
812 /// returned (which may be populated with either a `]` or a `-`).
813 #[inline(never)]
814 fn push_class_open(
815 &self,
816 parent_union: ast::ClassSetUnion,
817 ) -> Result<ast::ClassSetUnion> {
818 assert_eq!(self.char(), '[');
819
820 let (nested_set, nested_union) = self.parse_set_class_open()?;
821 self.parser()
822 .stack_class
823 .borrow_mut()
824 .push(ClassState::Open { union: parent_union, set: nested_set });
825 Ok(nested_union)
826 }
827
828 /// Parse the end of a character class set and pop the character class
829 /// parser stack. The union given corresponds to the last union built
830 /// before seeing the closing `]`. The union returned corresponds to the
831 /// parent character class set with the nested class added to it.
832 ///
833 /// This assumes that the parser is positioned at a `]` and will advance
834 /// the parser to the byte immediately following the `]`.
835 ///
836 /// If the stack is empty after popping, then this returns the final
837 /// "top-level" character class AST (where a "top-level" character class
838 /// is one that is not nested inside any other character class).
839 ///
840 /// If there is no corresponding opening bracket on the parser's stack,
841 /// then an error is returned.
842 #[inline(never)]
843 fn pop_class(
844 &self,
845 nested_union: ast::ClassSetUnion,
846 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
847 assert_eq!(self.char(), ']');
848
849 let item = ast::ClassSet::Item(nested_union.into_item());
850 let prevset = self.pop_class_op(item);
851 let mut stack = self.parser().stack_class.borrow_mut();
852 match stack.pop() {
853 None => {
854 // We can never observe an empty stack:
855 //
856 // 1) We are guaranteed to start with a non-empty stack since
857 // the character class parser is only initiated when it sees
858 // a `[`.
859 // 2) If we ever observe an empty stack while popping after
860 // seeing a `]`, then we signal the character class parser
861 // to terminate.
862 panic!("unexpected empty character class stack")
863 }
864 Some(ClassState::Op { .. }) => {
865 // This panic is unfortunate, but this case is impossible
866 // since we already popped the Op state if one exists above.
867 // Namely, every push to the class parser stack is guarded by
868 // whether an existing Op is already on the top of the stack.
869 // If it is, the existing Op is modified. That is, the stack
870 // can never have consecutive Op states.
871 panic!("unexpected ClassState::Op")
872 }
873 Some(ClassState::Open { mut union, mut set }) => {
874 self.bump();
875 set.span.end = self.pos();
876 set.kind = prevset;
877 if stack.is_empty() {
878 Ok(Either::Right(ast::Class::Bracketed(set)))
879 } else {
880 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
881 Ok(Either::Left(union))
882 }
883 }
884 }
885 }
886
887 /// Return an "unclosed class" error whose span points to the most
888 /// recently opened class.
889 ///
890 /// This should only be called while parsing a character class.
891 #[inline(never)]
892 fn unclosed_class_error(&self) -> ast::Error {
893 for state in self.parser().stack_class.borrow().iter().rev() {
894 if let ClassState::Open { ref set, .. } = *state {
895 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
896 }
897 }
898 // We are guaranteed to have a non-empty stack with at least
899 // one open bracket, so we should never get here.
900 panic!("no open character class found")
901 }
902
903 /// Push the current set of class items on to the class parser's stack as
904 /// the left hand side of the given operator.
905 ///
906 /// A fresh set union is returned, which should be used to build the right
907 /// hand side of this operator.
908 #[inline(never)]
909 fn push_class_op(
910 &self,
911 next_kind: ast::ClassSetBinaryOpKind,
912 next_union: ast::ClassSetUnion,
913 ) -> ast::ClassSetUnion {
914 let item = ast::ClassSet::Item(next_union.into_item());
915 let new_lhs = self.pop_class_op(item);
916 self.parser()
917 .stack_class
918 .borrow_mut()
919 .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
920 ast::ClassSetUnion { span: self.span(), items: vec![] }
921 }
922
923 /// Pop a character class set from the character class parser stack. If the
924 /// top of the stack is just an item (not an operation), then return the
925 /// given set unchanged. If the top of the stack is an operation, then the
926 /// given set will be used as the rhs of the operation on the top of the
927 /// stack. In that case, the binary operation is returned as a set.
928 #[inline(never)]
929 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
930 let mut stack = self.parser().stack_class.borrow_mut();
931 let (kind, lhs) = match stack.pop() {
932 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
933 Some(state @ ClassState::Open { .. }) => {
934 stack.push(state);
935 return rhs;
936 }
937 None => unreachable!(),
938 };
939 let span = Span::new(lhs.span().start, rhs.span().end);
940 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
941 span,
942 kind,
943 lhs: Box::new(lhs),
944 rhs: Box::new(rhs),
945 })
946 }
947}
948
949impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
950 /// Parse the regular expression into an abstract syntax tree.
951 fn parse(&self) -> Result<Ast> {
952 self.parse_with_comments().map(|astc| astc.ast)
953 }
954
955 /// Parse the regular expression and return an abstract syntax tree with
956 /// all of the comments found in the pattern.
957 fn parse_with_comments(&self) -> Result<ast::WithComments> {
958 assert_eq!(self.offset(), 0, "parser can only be used once");
959 self.parser().reset();
960 let mut concat = ast::Concat { span: self.span(), asts: vec![] };
961 loop {
962 self.bump_space();
963 if self.is_eof() {
964 break;
965 }
966 match self.char() {
967 '(' => concat = self.push_group(concat)?,
968 ')' => concat = self.pop_group(concat)?,
969 '|' => concat = self.push_alternate(concat)?,
970 '[' => {
971 let class = self.parse_set_class()?;
972 concat.asts.push(Ast::Class(class));
973 }
974 '?' => {
975 concat = self.parse_uncounted_repetition(
976 concat,
977 ast::RepetitionKind::ZeroOrOne,
978 )?;
979 }
980 '*' => {
981 concat = self.parse_uncounted_repetition(
982 concat,
983 ast::RepetitionKind::ZeroOrMore,
984 )?;
985 }
986 '+' => {
987 concat = self.parse_uncounted_repetition(
988 concat,
989 ast::RepetitionKind::OneOrMore,
990 )?;
991 }
992 '{' => {
993 concat = self.parse_counted_repetition(concat)?;
994 }
995 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
996 }
997 }
998 let ast = self.pop_group_end(concat)?;
999 NestLimiter::new(self).check(&ast)?;
1000 Ok(ast::WithComments {
1001 ast,
1002 comments: mem::replace(
1003 &mut *self.parser().comments.borrow_mut(),
1004 vec![],
1005 ),
1006 })
1007 }
1008
1009 /// Parses an uncounted repetition operation. An uncounted repetition
1010 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1011 /// The given `kind` should correspond to the operator observed by the
1012 /// caller.
1013 ///
1014 /// This assumes that the parser is currently positioned at the repetition
1015 /// operator and advances the parser to the first character after the
1016 /// operator. (Note that the operator may include a single additional `?`,
1017 /// which makes the operator ungreedy.)
1018 ///
1019 /// The caller should include the concatenation that is being built. The
1020 /// concatenation returned includes the repetition operator applied to the
1021 /// last expression in the given concatenation.
1022 #[inline(never)]
1023 fn parse_uncounted_repetition(
1024 &self,
1025 mut concat: ast::Concat,
1026 kind: ast::RepetitionKind,
1027 ) -> Result<ast::Concat> {
1028 assert!(
1029 self.char() == '?' || self.char() == '*' || self.char() == '+'
1030 );
1031 let op_start = self.pos();
1032 let ast = match concat.asts.pop() {
1033 Some(ast) => ast,
1034 None => {
1035 return Err(
1036 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1037 )
1038 }
1039 };
1040 match ast {
1041 Ast::Empty(_) | Ast::Flags(_) => {
1042 return Err(
1043 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1044 )
1045 }
1046 _ => {}
1047 }
1048 let mut greedy = true;
1049 if self.bump() && self.char() == '?' {
1050 greedy = false;
1051 self.bump();
1052 }
1053 concat.asts.push(Ast::Repetition(ast::Repetition {
1054 span: ast.span().with_end(self.pos()),
1055 op: ast::RepetitionOp {
1056 span: Span::new(op_start, self.pos()),
1057 kind,
1058 },
1059 greedy,
1060 ast: Box::new(ast),
1061 }));
1062 Ok(concat)
1063 }
1064
1065 /// Parses a counted repetition operation. A counted repetition operator
1066 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1067 /// operators.
1068 ///
1069 /// This assumes that the parser is currently positioned at the opening `{`
1070 /// and advances the parser to the first character after the operator.
1071 /// (Note that the operator may include a single additional `?`, which
1072 /// makes the operator ungreedy.)
1073 ///
1074 /// The caller should include the concatenation that is being built. The
1075 /// concatenation returned includes the repetition operator applied to the
1076 /// last expression in the given concatenation.
1077 #[inline(never)]
1078 fn parse_counted_repetition(
1079 &self,
1080 mut concat: ast::Concat,
1081 ) -> Result<ast::Concat> {
1082 assert!(self.char() == '{');
1083 let start = self.pos();
1084 let ast = match concat.asts.pop() {
1085 Some(ast) => ast,
1086 None => {
1087 return Err(
1088 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1089 )
1090 }
1091 };
1092 match ast {
1093 Ast::Empty(_) | Ast::Flags(_) => {
1094 return Err(
1095 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1096 )
1097 }
1098 _ => {}
1099 }
1100 if !self.bump_and_bump_space() {
1101 return Err(self.error(
1102 Span::new(start, self.pos()),
1103 ast::ErrorKind::RepetitionCountUnclosed,
1104 ));
1105 }
1106 let count_start = specialize_err(
1107 self.parse_decimal(),
1108 ast::ErrorKind::DecimalEmpty,
1109 ast::ErrorKind::RepetitionCountDecimalEmpty,
1110 )?;
1111 let mut range = ast::RepetitionRange::Exactly(count_start);
1112 if self.is_eof() {
1113 return Err(self.error(
1114 Span::new(start, self.pos()),
1115 ast::ErrorKind::RepetitionCountUnclosed,
1116 ));
1117 }
1118 if self.char() == ',' {
1119 if !self.bump_and_bump_space() {
1120 return Err(self.error(
1121 Span::new(start, self.pos()),
1122 ast::ErrorKind::RepetitionCountUnclosed,
1123 ));
1124 }
1125 if self.char() != '}' {
1126 let count_end = specialize_err(
1127 self.parse_decimal(),
1128 ast::ErrorKind::DecimalEmpty,
1129 ast::ErrorKind::RepetitionCountDecimalEmpty,
1130 )?;
1131 range = ast::RepetitionRange::Bounded(count_start, count_end);
1132 } else {
1133 range = ast::RepetitionRange::AtLeast(count_start);
1134 }
1135 }
1136 if self.is_eof() || self.char() != '}' {
1137 return Err(self.error(
1138 Span::new(start, self.pos()),
1139 ast::ErrorKind::RepetitionCountUnclosed,
1140 ));
1141 }
1142
1143 let mut greedy = true;
1144 if self.bump_and_bump_space() && self.char() == '?' {
1145 greedy = false;
1146 self.bump();
1147 }
1148
1149 let op_span = Span::new(start, self.pos());
1150 if !range.is_valid() {
1151 return Err(
1152 self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1153 );
1154 }
1155 concat.asts.push(Ast::Repetition(ast::Repetition {
1156 span: ast.span().with_end(self.pos()),
1157 op: ast::RepetitionOp {
1158 span: op_span,
1159 kind: ast::RepetitionKind::Range(range),
1160 },
1161 greedy,
1162 ast: Box::new(ast),
1163 }));
1164 Ok(concat)
1165 }
1166
1167 /// Parse a group (which contains a sub-expression) or a set of flags.
1168 ///
1169 /// If a group was found, then it is returned with an empty AST. If a set
1170 /// of flags is found, then that set is returned.
1171 ///
1172 /// The parser should be positioned at the opening parenthesis.
1173 ///
1174 /// This advances the parser to the character before the start of the
1175 /// sub-expression (in the case of a group) or to the closing parenthesis
1176 /// immediately following the set of flags.
1177 ///
1178 /// # Errors
1179 ///
1180 /// If flags are given and incorrectly specified, then a corresponding
1181 /// error is returned.
1182 ///
1183 /// If a capture name is given and it is incorrectly specified, then a
1184 /// corresponding error is returned.
1185 #[inline(never)]
1186 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1187 assert_eq!(self.char(), '(');
1188 let open_span = self.span_char();
1189 self.bump();
1190 self.bump_space();
1191 if self.is_lookaround_prefix() {
1192 return Err(self.error(
1193 Span::new(open_span.start, self.span().end),
1194 ast::ErrorKind::UnsupportedLookAround,
1195 ));
1196 }
1197 let inner_span = self.span();
1198 if self.bump_if("?P<") {
1199 let capture_index = self.next_capture_index(open_span)?;
1200 let cap = self.parse_capture_name(capture_index)?;
1201 Ok(Either::Right(ast::Group {
1202 span: open_span,
1203 kind: ast::GroupKind::CaptureName(cap),
1204 ast: Box::new(Ast::Empty(self.span())),
1205 }))
1206 } else if self.bump_if("?") {
1207 if self.is_eof() {
1208 return Err(
1209 self.error(open_span, ast::ErrorKind::GroupUnclosed)
1210 );
1211 }
1212 let flags = self.parse_flags()?;
1213 let char_end = self.char();
1214 self.bump();
1215 if char_end == ')' {
1216 // We don't allow empty flags, e.g., `(?)`. We instead
1217 // interpret it as a repetition operator missing its argument.
1218 if flags.items.is_empty() {
1219 return Err(self.error(
1220 inner_span,
1221 ast::ErrorKind::RepetitionMissing,
1222 ));
1223 }
1224 Ok(Either::Left(ast::SetFlags {
1225 span: Span { end: self.pos(), ..open_span },
1226 flags,
1227 }))
1228 } else {
1229 assert_eq!(char_end, ':');
1230 Ok(Either::Right(ast::Group {
1231 span: open_span,
1232 kind: ast::GroupKind::NonCapturing(flags),
1233 ast: Box::new(Ast::Empty(self.span())),
1234 }))
1235 }
1236 } else {
1237 let capture_index = self.next_capture_index(open_span)?;
1238 Ok(Either::Right(ast::Group {
1239 span: open_span,
1240 kind: ast::GroupKind::CaptureIndex(capture_index),
1241 ast: Box::new(Ast::Empty(self.span())),
1242 }))
1243 }
1244 }
1245
1246 /// Parses a capture group name. Assumes that the parser is positioned at
1247 /// the first character in the name following the opening `<` (and may
1248 /// possibly be EOF). This advances the parser to the first character
1249 /// following the closing `>`.
1250 ///
1251 /// The caller must provide the capture index of the group for this name.
1252 #[inline(never)]
1253 fn parse_capture_name(
1254 &self,
1255 capture_index: u32,
1256 ) -> Result<ast::CaptureName> {
1257 if self.is_eof() {
1258 return Err(self
1259 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1260 }
1261 let start = self.pos();
1262 loop {
1263 if self.char() == '>' {
1264 break;
1265 }
1266 if !is_capture_char(self.char(), self.pos() == start) {
1267 return Err(self.error(
1268 self.span_char(),
1269 ast::ErrorKind::GroupNameInvalid,
1270 ));
1271 }
1272 if !self.bump() {
1273 break;
1274 }
1275 }
1276 let end = self.pos();
1277 if self.is_eof() {
1278 return Err(self
1279 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1280 }
1281 assert_eq!(self.char(), '>');
1282 self.bump();
1283 let name = &self.pattern()[start.offset..end.offset];
1284 if name.is_empty() {
1285 return Err(self.error(
1286 Span::new(start, start),
1287 ast::ErrorKind::GroupNameEmpty,
1288 ));
1289 }
1290 let capname = ast::CaptureName {
1291 span: Span::new(start, end),
1292 name: name.to_string(),
1293 index: capture_index,
1294 };
1295 self.add_capture_name(&capname)?;
1296 Ok(capname)
1297 }
1298
1299 /// Parse a sequence of flags starting at the current character.
1300 ///
1301 /// This advances the parser to the character immediately following the
1302 /// flags, which is guaranteed to be either `:` or `)`.
1303 ///
1304 /// # Errors
1305 ///
1306 /// If any flags are duplicated, then an error is returned.
1307 ///
1308 /// If the negation operator is used more than once, then an error is
1309 /// returned.
1310 ///
1311 /// If no flags could be found or if the negation operation is not followed
1312 /// by any flags, then an error is returned.
1313 #[inline(never)]
1314 fn parse_flags(&self) -> Result<ast::Flags> {
1315 let mut flags = ast::Flags { span: self.span(), items: vec![] };
1316 let mut last_was_negation = None;
1317 while self.char() != ':' && self.char() != ')' {
1318 if self.char() == '-' {
1319 last_was_negation = Some(self.span_char());
1320 let item = ast::FlagsItem {
1321 span: self.span_char(),
1322 kind: ast::FlagsItemKind::Negation,
1323 };
1324 if let Some(i) = flags.add_item(item) {
1325 return Err(self.error(
1326 self.span_char(),
1327 ast::ErrorKind::FlagRepeatedNegation {
1328 original: flags.items[i].span,
1329 },
1330 ));
1331 }
1332 } else {
1333 last_was_negation = None;
1334 let item = ast::FlagsItem {
1335 span: self.span_char(),
1336 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1337 };
1338 if let Some(i) = flags.add_item(item) {
1339 return Err(self.error(
1340 self.span_char(),
1341 ast::ErrorKind::FlagDuplicate {
1342 original: flags.items[i].span,
1343 },
1344 ));
1345 }
1346 }
1347 if !self.bump() {
1348 return Err(
1349 self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1350 );
1351 }
1352 }
1353 if let Some(span) = last_was_negation {
1354 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1355 }
1356 flags.span.end = self.pos();
1357 Ok(flags)
1358 }
1359
1360 /// Parse the current character as a flag. Do not advance the parser.
1361 ///
1362 /// # Errors
1363 ///
1364 /// If the flag is not recognized, then an error is returned.
1365 #[inline(never)]
1366 fn parse_flag(&self) -> Result<ast::Flag> {
1367 match self.char() {
1368 'i' => Ok(ast::Flag::CaseInsensitive),
1369 'm' => Ok(ast::Flag::MultiLine),
1370 's' => Ok(ast::Flag::DotMatchesNewLine),
1371 'U' => Ok(ast::Flag::SwapGreed),
1372 'u' => Ok(ast::Flag::Unicode),
1373 'x' => Ok(ast::Flag::IgnoreWhitespace),
1374 _ => {
1375 Err(self
1376 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1377 }
1378 }
1379 }
1380
1381 /// Parse a primitive AST. e.g., A literal, non-set character class or
1382 /// assertion.
1383 ///
1384 /// This assumes that the parser expects a primitive at the current
1385 /// location. i.e., All other non-primitive cases have been handled.
1386 /// For example, if the parser's position is at `|`, then `|` will be
1387 /// treated as a literal (e.g., inside a character class).
1388 ///
1389 /// This advances the parser to the first character immediately following
1390 /// the primitive.
1391 fn parse_primitive(&self) -> Result<Primitive> {
1392 match self.char() {
1393 '\\' => self.parse_escape(),
1394 '.' => {
1395 let ast = Primitive::Dot(self.span_char());
1396 self.bump();
1397 Ok(ast)
1398 }
1399 '^' => {
1400 let ast = Primitive::Assertion(ast::Assertion {
1401 span: self.span_char(),
1402 kind: ast::AssertionKind::StartLine,
1403 });
1404 self.bump();
1405 Ok(ast)
1406 }
1407 '$' => {
1408 let ast = Primitive::Assertion(ast::Assertion {
1409 span: self.span_char(),
1410 kind: ast::AssertionKind::EndLine,
1411 });
1412 self.bump();
1413 Ok(ast)
1414 }
1415 c => {
1416 let ast = Primitive::Literal(ast::Literal {
1417 span: self.span_char(),
1418 kind: ast::LiteralKind::Verbatim,
1419 c,
1420 });
1421 self.bump();
1422 Ok(ast)
1423 }
1424 }
1425 }
1426
1427 /// Parse an escape sequence as a primitive AST.
1428 ///
1429 /// This assumes the parser is positioned at the start of the escape
1430 /// sequence, i.e., `\`. It advances the parser to the first position
1431 /// immediately following the escape sequence.
1432 #[inline(never)]
1433 fn parse_escape(&self) -> Result<Primitive> {
1434 assert_eq!(self.char(), '\\');
1435 let start = self.pos();
1436 if !self.bump() {
1437 return Err(self.error(
1438 Span::new(start, self.pos()),
1439 ast::ErrorKind::EscapeUnexpectedEof,
1440 ));
1441 }
1442 let c = self.char();
1443 // Put some of the more complicated routines into helpers.
1444 match c {
1445 '0'..='7' => {
1446 if !self.parser().octal {
1447 return Err(self.error(
1448 Span::new(start, self.span_char().end),
1449 ast::ErrorKind::UnsupportedBackreference,
1450 ));
1451 }
1452 let mut lit = self.parse_octal();
1453 lit.span.start = start;
1454 return Ok(Primitive::Literal(lit));
1455 }
1456 '8'..='9' if !self.parser().octal => {
1457 return Err(self.error(
1458 Span::new(start, self.span_char().end),
1459 ast::ErrorKind::UnsupportedBackreference,
1460 ));
1461 }
1462 'x' | 'u' | 'U' => {
1463 let mut lit = self.parse_hex()?;
1464 lit.span.start = start;
1465 return Ok(Primitive::Literal(lit));
1466 }
1467 'p' | 'P' => {
1468 let mut cls = self.parse_unicode_class()?;
1469 cls.span.start = start;
1470 return Ok(Primitive::Unicode(cls));
1471 }
1472 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1473 let mut cls = self.parse_perl_class();
1474 cls.span.start = start;
1475 return Ok(Primitive::Perl(cls));
1476 }
1477 _ => {}
1478 }
1479
1480 // Handle all of the one letter sequences inline.
1481 self.bump();
1482 let span = Span::new(start, self.pos());
1483 if is_meta_character(c) {
1484 return Ok(Primitive::Literal(ast::Literal {
1485 span,
1486 kind: ast::LiteralKind::Punctuation,
1487 c,
1488 }));
1489 }
1490 let special = |kind, c| {
1491 Ok(Primitive::Literal(ast::Literal {
1492 span,
1493 kind: ast::LiteralKind::Special(kind),
1494 c,
1495 }))
1496 };
1497 match c {
1498 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1499 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1500 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1501 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1502 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1503 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1504 ' ' if self.ignore_whitespace() => {
1505 special(ast::SpecialLiteralKind::Space, ' ')
1506 }
1507 'A' => Ok(Primitive::Assertion(ast::Assertion {
1508 span,
1509 kind: ast::AssertionKind::StartText,
1510 })),
1511 'z' => Ok(Primitive::Assertion(ast::Assertion {
1512 span,
1513 kind: ast::AssertionKind::EndText,
1514 })),
1515 'b' => Ok(Primitive::Assertion(ast::Assertion {
1516 span,
1517 kind: ast::AssertionKind::WordBoundary,
1518 })),
1519 'B' => Ok(Primitive::Assertion(ast::Assertion {
1520 span,
1521 kind: ast::AssertionKind::NotWordBoundary,
1522 })),
1523 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1524 }
1525 }
1526
1527 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1528 /// long. This expects the parser to be positioned at the first octal
1529 /// digit and advances the parser to the first character immediately
1530 /// following the octal number. This also assumes that parsing octal
1531 /// escapes is enabled.
1532 ///
1533 /// Assuming the preconditions are met, this routine can never fail.
1534 #[inline(never)]
1535 fn parse_octal(&self) -> ast::Literal {
1536 use std::char;
1537 use std::u32;
1538
1539 assert!(self.parser().octal);
1540 assert!('0' <= self.char() && self.char() <= '7');
1541 let start = self.pos();
1542 // Parse up to two more digits.
1543 while self.bump()
1544 && '0' <= self.char()
1545 && self.char() <= '7'
1546 && self.pos().offset - start.offset <= 2
1547 {}
1548 let end = self.pos();
1549 let octal = &self.pattern()[start.offset..end.offset];
1550 // Parsing the octal should never fail since the above guarantees a
1551 // valid number.
1552 let codepoint =
1553 u32::from_str_radix(octal, 8).expect("valid octal number");
1554 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1555 // invalid Unicode scalar values.
1556 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1557 ast::Literal {
1558 span: Span::new(start, end),
1559 kind: ast::LiteralKind::Octal,
1560 c,
1561 }
1562 }
1563
1564 /// Parse a hex representation of a Unicode codepoint. This handles both
1565 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1566 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1567 /// the first character immediately following the hexadecimal literal.
1568 #[inline(never)]
1569 fn parse_hex(&self) -> Result<ast::Literal> {
1570 assert!(
1571 self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1572 );
1573
1574 let hex_kind = match self.char() {
1575 'x' => ast::HexLiteralKind::X,
1576 'u' => ast::HexLiteralKind::UnicodeShort,
1577 _ => ast::HexLiteralKind::UnicodeLong,
1578 };
1579 if !self.bump_and_bump_space() {
1580 return Err(
1581 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1582 );
1583 }
1584 if self.char() == '{' {
1585 self.parse_hex_brace(hex_kind)
1586 } else {
1587 self.parse_hex_digits(hex_kind)
1588 }
1589 }
1590
1591 /// Parse an N-digit hex representation of a Unicode codepoint. This
1592 /// expects the parser to be positioned at the first digit and will advance
1593 /// the parser to the first character immediately following the escape
1594 /// sequence.
1595 ///
1596 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1597 /// or 8 (for `\UNNNNNNNN`).
1598 #[inline(never)]
1599 fn parse_hex_digits(
1600 &self,
1601 kind: ast::HexLiteralKind,
1602 ) -> Result<ast::Literal> {
1603 use std::char;
1604 use std::u32;
1605
1606 let mut scratch = self.parser().scratch.borrow_mut();
1607 scratch.clear();
1608
1609 let start = self.pos();
1610 for i in 0..kind.digits() {
1611 if i > 0 && !self.bump_and_bump_space() {
1612 return Err(self
1613 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1614 }
1615 if !is_hex(self.char()) {
1616 return Err(self.error(
1617 self.span_char(),
1618 ast::ErrorKind::EscapeHexInvalidDigit,
1619 ));
1620 }
1621 scratch.push(self.char());
1622 }
1623 // The final bump just moves the parser past the literal, which may
1624 // be EOF.
1625 self.bump_and_bump_space();
1626 let end = self.pos();
1627 let hex = scratch.as_str();
1628 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1629 None => Err(self.error(
1630 Span::new(start, end),
1631 ast::ErrorKind::EscapeHexInvalid,
1632 )),
1633 Some(c) => Ok(ast::Literal {
1634 span: Span::new(start, end),
1635 kind: ast::LiteralKind::HexFixed(kind),
1636 c,
1637 }),
1638 }
1639 }
1640
1641 /// Parse a hex representation of any Unicode scalar value. This expects
1642 /// the parser to be positioned at the opening brace `{` and will advance
1643 /// the parser to the first character following the closing brace `}`.
1644 #[inline(never)]
1645 fn parse_hex_brace(
1646 &self,
1647 kind: ast::HexLiteralKind,
1648 ) -> Result<ast::Literal> {
1649 use std::char;
1650 use std::u32;
1651
1652 let mut scratch = self.parser().scratch.borrow_mut();
1653 scratch.clear();
1654
1655 let brace_pos = self.pos();
1656 let start = self.span_char().end;
1657 while self.bump_and_bump_space() && self.char() != '}' {
1658 if !is_hex(self.char()) {
1659 return Err(self.error(
1660 self.span_char(),
1661 ast::ErrorKind::EscapeHexInvalidDigit,
1662 ));
1663 }
1664 scratch.push(self.char());
1665 }
1666 if self.is_eof() {
1667 return Err(self.error(
1668 Span::new(brace_pos, self.pos()),
1669 ast::ErrorKind::EscapeUnexpectedEof,
1670 ));
1671 }
1672 let end = self.pos();
1673 let hex = scratch.as_str();
1674 assert_eq!(self.char(), '}');
1675 self.bump_and_bump_space();
1676
1677 if hex.is_empty() {
1678 return Err(self.error(
1679 Span::new(brace_pos, self.pos()),
1680 ast::ErrorKind::EscapeHexEmpty,
1681 ));
1682 }
1683 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1684 None => Err(self.error(
1685 Span::new(start, end),
1686 ast::ErrorKind::EscapeHexInvalid,
1687 )),
1688 Some(c) => Ok(ast::Literal {
1689 span: Span::new(start, self.pos()),
1690 kind: ast::LiteralKind::HexBrace(kind),
1691 c,
1692 }),
1693 }
1694 }
1695
1696 /// Parse a decimal number into a u32 while trimming leading and trailing
1697 /// whitespace.
1698 ///
1699 /// This expects the parser to be positioned at the first position where
1700 /// a decimal digit could occur. This will advance the parser to the byte
1701 /// immediately following the last contiguous decimal digit.
1702 ///
1703 /// If no decimal digit could be found or if there was a problem parsing
1704 /// the complete set of digits into a u32, then an error is returned.
1705 fn parse_decimal(&self) -> Result<u32> {
1706 let mut scratch = self.parser().scratch.borrow_mut();
1707 scratch.clear();
1708
1709 while !self.is_eof() && self.char().is_whitespace() {
1710 self.bump();
1711 }
1712 let start = self.pos();
1713 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1714 scratch.push(self.char());
1715 self.bump_and_bump_space();
1716 }
1717 let span = Span::new(start, self.pos());
1718 while !self.is_eof() && self.char().is_whitespace() {
1719 self.bump_and_bump_space();
1720 }
1721 let digits = scratch.as_str();
1722 if digits.is_empty() {
1723 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1724 }
1725 match u32::from_str_radix(digits, 10).ok() {
1726 Some(n) => Ok(n),
1727 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1728 }
1729 }
1730
1731 /// Parse a standard character class consisting primarily of characters or
1732 /// character ranges, but can also contain nested character classes of
1733 /// any type (sans `.`).
1734 ///
1735 /// This assumes the parser is positioned at the opening `[`. If parsing
1736 /// is successful, then the parser is advanced to the position immediately
1737 /// following the closing `]`.
1738 #[inline(never)]
1739 fn parse_set_class(&self) -> Result<ast::Class> {
1740 assert_eq!(self.char(), '[');
1741
1742 let mut union =
1743 ast::ClassSetUnion { span: self.span(), items: vec![] };
1744 loop {
1745 self.bump_space();
1746 if self.is_eof() {
1747 return Err(self.unclosed_class_error());
1748 }
1749 match self.char() {
1750 '[' => {
1751 // If we've already parsed the opening bracket, then
1752 // attempt to treat this as the beginning of an ASCII
1753 // class. If ASCII class parsing fails, then the parser
1754 // backs up to `[`.
1755 if !self.parser().stack_class.borrow().is_empty() {
1756 if let Some(cls) = self.maybe_parse_ascii_class() {
1757 union.push(ast::ClassSetItem::Ascii(cls));
1758 continue;
1759 }
1760 }
1761 union = self.push_class_open(union)?;
1762 }
1763 ']' => match self.pop_class(union)? {
1764 Either::Left(nested_union) => {
1765 union = nested_union;
1766 }
1767 Either::Right(class) => return Ok(class),
1768 },
1769 '&' if self.peek() == Some('&') => {
1770 assert!(self.bump_if("&&"));
1771 union = self.push_class_op(
1772 ast::ClassSetBinaryOpKind::Intersection,
1773 union,
1774 );
1775 }
1776 '-' if self.peek() == Some('-') => {
1777 assert!(self.bump_if("--"));
1778 union = self.push_class_op(
1779 ast::ClassSetBinaryOpKind::Difference,
1780 union,
1781 );
1782 }
1783 '~' if self.peek() == Some('~') => {
1784 assert!(self.bump_if("~~"));
1785 union = self.push_class_op(
1786 ast::ClassSetBinaryOpKind::SymmetricDifference,
1787 union,
1788 );
1789 }
1790 _ => {
1791 union.push(self.parse_set_class_range()?);
1792 }
1793 }
1794 }
1795 }
1796
1797 /// Parse a single primitive item in a character class set. The item to
1798 /// be parsed can either be one of a simple literal character, a range
1799 /// between two simple literal characters or a "primitive" character
1800 /// class like \w or \p{Greek}.
1801 ///
1802 /// If an invalid escape is found, or if a character class is found where
1803 /// a simple literal is expected (e.g., in a range), then an error is
1804 /// returned.
1805 #[inline(never)]
1806 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1807 let prim1 = self.parse_set_class_item()?;
1808 self.bump_space();
1809 if self.is_eof() {
1810 return Err(self.unclosed_class_error());
1811 }
1812 // If the next char isn't a `-`, then we don't have a range.
1813 // There are two exceptions. If the char after a `-` is a `]`, then
1814 // `-` is interpreted as a literal `-`. Alternatively, if the char
1815 // after a `-` is a `-`, then `--` corresponds to a "difference"
1816 // operation.
1817 if self.char() != '-'
1818 || self.peek_space() == Some(']')
1819 || self.peek_space() == Some('-')
1820 {
1821 return prim1.into_class_set_item(self);
1822 }
1823 // OK, now we're parsing a range, so bump past the `-` and parse the
1824 // second half of the range.
1825 if !self.bump_and_bump_space() {
1826 return Err(self.unclosed_class_error());
1827 }
1828 let prim2 = self.parse_set_class_item()?;
1829 let range = ast::ClassSetRange {
1830 span: Span::new(prim1.span().start, prim2.span().end),
1831 start: prim1.into_class_literal(self)?,
1832 end: prim2.into_class_literal(self)?,
1833 };
1834 if !range.is_valid() {
1835 return Err(
1836 self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1837 );
1838 }
1839 Ok(ast::ClassSetItem::Range(range))
1840 }
1841
1842 /// Parse a single item in a character class as a primitive, where the
1843 /// primitive either consists of a verbatim literal or a single escape
1844 /// sequence.
1845 ///
1846 /// This assumes the parser is positioned at the beginning of a primitive,
1847 /// and advances the parser to the first position after the primitive if
1848 /// successful.
1849 ///
1850 /// Note that it is the caller's responsibility to report an error if an
1851 /// illegal primitive was parsed.
1852 #[inline(never)]
1853 fn parse_set_class_item(&self) -> Result<Primitive> {
1854 if self.char() == '\\' {
1855 self.parse_escape()
1856 } else {
1857 let x = Primitive::Literal(ast::Literal {
1858 span: self.span_char(),
1859 kind: ast::LiteralKind::Verbatim,
1860 c: self.char(),
1861 });
1862 self.bump();
1863 Ok(x)
1864 }
1865 }
1866
1867 /// Parses the opening of a character class set. This includes the opening
1868 /// bracket along with `^` if present to indicate negation. This also
1869 /// starts parsing the opening set of unioned items if applicable, since
1870 /// there are special rules applied to certain characters in the opening
1871 /// of a character class. For example, `[^]]` is the class of all
1872 /// characters not equal to `]`. (`]` would need to be escaped in any other
1873 /// position.) Similarly for `-`.
1874 ///
1875 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1876 /// empty union. This empty union should be replaced with the actual item
1877 /// when it is popped from the parser's stack.
1878 ///
1879 /// This assumes the parser is positioned at the opening `[` and advances
1880 /// the parser to the first non-special byte of the character class.
1881 ///
1882 /// An error is returned if EOF is found.
1883 #[inline(never)]
1884 fn parse_set_class_open(
1885 &self,
1886 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1887 assert_eq!(self.char(), '[');
1888 let start = self.pos();
1889 if !self.bump_and_bump_space() {
1890 return Err(self.error(
1891 Span::new(start, self.pos()),
1892 ast::ErrorKind::ClassUnclosed,
1893 ));
1894 }
1895
1896 let negated = if self.char() != '^' {
1897 false
1898 } else {
1899 if !self.bump_and_bump_space() {
1900 return Err(self.error(
1901 Span::new(start, self.pos()),
1902 ast::ErrorKind::ClassUnclosed,
1903 ));
1904 }
1905 true
1906 };
1907 // Accept any number of `-` as literal `-`.
1908 let mut union =
1909 ast::ClassSetUnion { span: self.span(), items: vec![] };
1910 while self.char() == '-' {
1911 union.push(ast::ClassSetItem::Literal(ast::Literal {
1912 span: self.span_char(),
1913 kind: ast::LiteralKind::Verbatim,
1914 c: '-',
1915 }));
1916 if !self.bump_and_bump_space() {
1917 return Err(self.error(
1918 Span::new(start, start),
1919 ast::ErrorKind::ClassUnclosed,
1920 ));
1921 }
1922 }
1923 // If `]` is the *first* char in a set, then interpret it as a literal
1924 // `]`. That is, an empty class is impossible to write.
1925 if union.items.is_empty() && self.char() == ']' {
1926 union.push(ast::ClassSetItem::Literal(ast::Literal {
1927 span: self.span_char(),
1928 kind: ast::LiteralKind::Verbatim,
1929 c: ']',
1930 }));
1931 if !self.bump_and_bump_space() {
1932 return Err(self.error(
1933 Span::new(start, self.pos()),
1934 ast::ErrorKind::ClassUnclosed,
1935 ));
1936 }
1937 }
1938 let set = ast::ClassBracketed {
1939 span: Span::new(start, self.pos()),
1940 negated,
1941 kind: ast::ClassSet::union(ast::ClassSetUnion {
1942 span: Span::new(union.span.start, union.span.start),
1943 items: vec![],
1944 }),
1945 };
1946 Ok((set, union))
1947 }
1948
1949 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1950 ///
1951 /// This assumes the parser is positioned at the opening `[`.
1952 ///
1953 /// If no valid ASCII character class could be found, then this does not
1954 /// advance the parser and `None` is returned. Otherwise, the parser is
1955 /// advanced to the first byte following the closing `]` and the
1956 /// corresponding ASCII class is returned.
1957 #[inline(never)]
1958 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1959 // ASCII character classes are interesting from a parsing perspective
1960 // because parsing cannot fail with any interesting error. For example,
1961 // in order to use an ASCII character class, it must be enclosed in
1962 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1963 // of it as "ASCII character characters have the syntax `[:NAME:]`
1964 // which can only appear within character brackets." This means that
1965 // things like `[[:lower:]A]` are legal constructs.
1966 //
1967 // However, if one types an incorrect ASCII character class, e.g.,
1968 // `[[:loower:]]`, then we treat that as a normal nested character
1969 // class containing the characters `:elorw`. One might argue that we
1970 // should return an error instead since the repeated colons give away
1971 // the intent to write an ASCII class. But what if the user typed
1972 // `[[:lower]]` instead? How can we tell that was intended to be an
1973 // ASCII class and not just a normal nested class?
1974 //
1975 // Reasonable people can probably disagree over this, but for better
1976 // or worse, we implement semantics that never fails at the expense
1977 // of better failure modes.
1978 assert_eq!(self.char(), '[');
1979 // If parsing fails, then we back up the parser to this starting point.
1980 let start = self.pos();
1981 let mut negated = false;
1982 if !self.bump() || self.char() != ':' {
1983 self.parser().pos.set(start);
1984 return None;
1985 }
1986 if !self.bump() {
1987 self.parser().pos.set(start);
1988 return None;
1989 }
1990 if self.char() == '^' {
1991 negated = true;
1992 if !self.bump() {
1993 self.parser().pos.set(start);
1994 return None;
1995 }
1996 }
1997 let name_start = self.offset();
1998 while self.char() != ':' && self.bump() {}
1999 if self.is_eof() {
2000 self.parser().pos.set(start);
2001 return None;
2002 }
2003 let name = &self.pattern()[name_start..self.offset()];
2004 if !self.bump_if(":]") {
2005 self.parser().pos.set(start);
2006 return None;
2007 }
2008 let kind = match ast::ClassAsciiKind::from_name(name) {
2009 Some(kind) => kind,
2010 None => {
2011 self.parser().pos.set(start);
2012 return None;
2013 }
2014 };
2015 Some(ast::ClassAscii {
2016 span: Span::new(start, self.pos()),
2017 kind,
2018 negated,
2019 })
2020 }
2021
2022 /// Parse a Unicode class in either the single character notation, `\pN`
2023 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2024 /// the parser is positioned at the `p` (or `P` for negation) and will
2025 /// advance the parser to the character immediately following the class.
2026 ///
2027 /// Note that this does not check whether the class name is valid or not.
2028 #[inline(never)]
2029 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2030 assert!(self.char() == 'p' || self.char() == 'P');
2031
2032 let mut scratch = self.parser().scratch.borrow_mut();
2033 scratch.clear();
2034
2035 let negated = self.char() == 'P';
2036 if !self.bump_and_bump_space() {
2037 return Err(
2038 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2039 );
2040 }
2041 let (start, kind) = if self.char() == '{' {
2042 let start = self.span_char().end;
2043 while self.bump_and_bump_space() && self.char() != '}' {
2044 scratch.push(self.char());
2045 }
2046 if self.is_eof() {
2047 return Err(self
2048 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2049 }
2050 assert_eq!(self.char(), '}');
2051 self.bump();
2052
2053 let name = scratch.as_str();
2054 if let Some(i) = name.find("!=") {
2055 (
2056 start,
2057 ast::ClassUnicodeKind::NamedValue {
2058 op: ast::ClassUnicodeOpKind::NotEqual,
2059 name: name[..i].to_string(),
2060 value: name[i + 2..].to_string(),
2061 },
2062 )
2063 } else if let Some(i) = name.find(':') {
2064 (
2065 start,
2066 ast::ClassUnicodeKind::NamedValue {
2067 op: ast::ClassUnicodeOpKind::Colon,
2068 name: name[..i].to_string(),
2069 value: name[i + 1..].to_string(),
2070 },
2071 )
2072 } else if let Some(i) = name.find('=') {
2073 (
2074 start,
2075 ast::ClassUnicodeKind::NamedValue {
2076 op: ast::ClassUnicodeOpKind::Equal,
2077 name: name[..i].to_string(),
2078 value: name[i + 1..].to_string(),
2079 },
2080 )
2081 } else {
2082 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2083 }
2084 } else {
2085 let start = self.pos();
2086 let c = self.char();
2087 if c == '\\' {
2088 return Err(self.error(
2089 self.span_char(),
2090 ast::ErrorKind::UnicodeClassInvalid,
2091 ));
2092 }
2093 self.bump_and_bump_space();
2094 let kind = ast::ClassUnicodeKind::OneLetter(c);
2095 (start, kind)
2096 };
2097 Ok(ast::ClassUnicode {
2098 span: Span::new(start, self.pos()),
2099 negated,
2100 kind,
2101 })
2102 }
2103
2104 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2105 /// parser is currently at a valid character class name and will be
2106 /// advanced to the character immediately following the class.
2107 #[inline(never)]
2108 fn parse_perl_class(&self) -> ast::ClassPerl {
2109 let c = self.char();
2110 let span = self.span_char();
2111 self.bump();
2112 let (negated, kind) = match c {
2113 'd' => (false, ast::ClassPerlKind::Digit),
2114 'D' => (true, ast::ClassPerlKind::Digit),
2115 's' => (false, ast::ClassPerlKind::Space),
2116 'S' => (true, ast::ClassPerlKind::Space),
2117 'w' => (false, ast::ClassPerlKind::Word),
2118 'W' => (true, ast::ClassPerlKind::Word),
2119 c => panic!("expected valid Perl class but got '{}'", c),
2120 };
2121 ast::ClassPerl { span, kind, negated }
2122 }
2123}
2124
2125/// A type that traverses a fully parsed Ast and checks whether its depth
2126/// exceeds the specified nesting limit. If it does, then an error is returned.
2127#[derive(Debug)]
2128struct NestLimiter<'p, 's, P> {
2129 /// The parser that is checking the nest limit.
2130 p: &'p ParserI<'s, P>,
2131 /// The current depth while walking an Ast.
2132 depth: u32,
2133}
2134
2135impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
2136 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2137 NestLimiter { p, depth: 0 }
2138 }
2139
2140 #[inline(never)]
2141 fn check(self, ast: &Ast) -> Result<()> {
2142 ast::visit(ast, self)
2143 }
2144
2145 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2146 let new = self.depth.checked_add(1).ok_or_else(|| {
2147 self.p.error(
2148 span.clone(),
2149 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
2150 )
2151 })?;
2152 let limit = self.p.parser().nest_limit;
2153 if new > limit {
2154 return Err(self.p.error(
2155 span.clone(),
2156 ast::ErrorKind::NestLimitExceeded(limit),
2157 ));
2158 }
2159 self.depth = new;
2160 Ok(())
2161 }
2162
2163 fn decrement_depth(&mut self) {
2164 // Assuming the correctness of the visitor, this should never drop
2165 // below 0.
2166 self.depth = self.depth.checked_sub(1).unwrap();
2167 }
2168}
2169
2170impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2171 type Output = ();
2172 type Err = ast::Error;
2173
2174 fn finish(self) -> Result<()> {
2175 Ok(())
2176 }
2177
2178 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2179 let span = match *ast {
2180 Ast::Empty(_)
2181 | Ast::Flags(_)
2182 | Ast::Literal(_)
2183 | Ast::Dot(_)
2184 | Ast::Assertion(_)
2185 | Ast::Class(ast::Class::Unicode(_))
2186 | Ast::Class(ast::Class::Perl(_)) => {
2187 // These are all base cases, so we don't increment depth.
2188 return Ok(());
2189 }
2190 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2191 Ast::Repetition(ref x) => &x.span,
2192 Ast::Group(ref x) => &x.span,
2193 Ast::Alternation(ref x) => &x.span,
2194 Ast::Concat(ref x) => &x.span,
2195 };
2196 self.increment_depth(span)
2197 }
2198
2199 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2200 match *ast {
2201 Ast::Empty(_)
2202 | Ast::Flags(_)
2203 | Ast::Literal(_)
2204 | Ast::Dot(_)
2205 | Ast::Assertion(_)
2206 | Ast::Class(ast::Class::Unicode(_))
2207 | Ast::Class(ast::Class::Perl(_)) => {
2208 // These are all base cases, so we don't decrement depth.
2209 Ok(())
2210 }
2211 Ast::Class(ast::Class::Bracketed(_))
2212 | Ast::Repetition(_)
2213 | Ast::Group(_)
2214 | Ast::Alternation(_)
2215 | Ast::Concat(_) => {
2216 self.decrement_depth();
2217 Ok(())
2218 }
2219 }
2220 }
2221
2222 fn visit_class_set_item_pre(
2223 &mut self,
2224 ast: &ast::ClassSetItem,
2225 ) -> Result<()> {
2226 let span = match *ast {
2227 ast::ClassSetItem::Empty(_)
2228 | ast::ClassSetItem::Literal(_)
2229 | ast::ClassSetItem::Range(_)
2230 | ast::ClassSetItem::Ascii(_)
2231 | ast::ClassSetItem::Unicode(_)
2232 | ast::ClassSetItem::Perl(_) => {
2233 // These are all base cases, so we don't increment depth.
2234 return Ok(());
2235 }
2236 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2237 ast::ClassSetItem::Union(ref x) => &x.span,
2238 };
2239 self.increment_depth(span)
2240 }
2241
2242 fn visit_class_set_item_post(
2243 &mut self,
2244 ast: &ast::ClassSetItem,
2245 ) -> Result<()> {
2246 match *ast {
2247 ast::ClassSetItem::Empty(_)
2248 | ast::ClassSetItem::Literal(_)
2249 | ast::ClassSetItem::Range(_)
2250 | ast::ClassSetItem::Ascii(_)
2251 | ast::ClassSetItem::Unicode(_)
2252 | ast::ClassSetItem::Perl(_) => {
2253 // These are all base cases, so we don't decrement depth.
2254 Ok(())
2255 }
2256 ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2257 self.decrement_depth();
2258 Ok(())
2259 }
2260 }
2261 }
2262
2263 fn visit_class_set_binary_op_pre(
2264 &mut self,
2265 ast: &ast::ClassSetBinaryOp,
2266 ) -> Result<()> {
2267 self.increment_depth(&ast.span)
2268 }
2269
2270 fn visit_class_set_binary_op_post(
2271 &mut self,
2272 _ast: &ast::ClassSetBinaryOp,
2273 ) -> Result<()> {
2274 self.decrement_depth();
2275 Ok(())
2276 }
2277}
2278
2279/// When the result is an error, transforms the ast::ErrorKind from the source
2280/// Result into another one. This function is used to return clearer error
2281/// messages when possible.
2282fn specialize_err<T>(
2283 result: Result<T>,
2284 from: ast::ErrorKind,
2285 to: ast::ErrorKind,
2286) -> Result<T> {
2287 if let Err(e) = result {
2288 if e.kind == from {
2289 Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2290 } else {
2291 Err(e)
2292 }
2293 } else {
2294 result
2295 }
2296}
2297
2298#[cfg(test)]
2299mod tests {
2300 use std::ops::Range;
2301
2302 use super::{Parser, ParserBuilder, ParserI, Primitive};
2303 use crate::ast::{self, Ast, Position, Span};
2304
2305 // Our own assert_eq, which has slightly better formatting (but honestly
2306 // still kind of crappy).
2307 macro_rules! assert_eq {
2308 ($left:expr, $right:expr) => {{
2309 match (&$left, &$right) {
2310 (left_val, right_val) => {
2311 if !(*left_val == *right_val) {
2312 panic!(
2313 "assertion failed: `(left == right)`\n\n\
2314 left: `{:?}`\nright: `{:?}`\n\n",
2315 left_val, right_val
2316 )
2317 }
2318 }
2319 }
2320 }};
2321 }
2322
2323 // We create these errors to compare with real ast::Errors in the tests.
2324 // We define equality between TestError and ast::Error to disregard the
2325 // pattern string in ast::Error, which is annoying to provide in tests.
2326 #[derive(Clone, Debug)]
2327 struct TestError {
2328 span: Span,
2329 kind: ast::ErrorKind,
2330 }
2331
2332 impl PartialEq<ast::Error> for TestError {
2333 fn eq(&self, other: &ast::Error) -> bool {
2334 self.span == other.span && self.kind == other.kind
2335 }
2336 }
2337
2338 impl PartialEq<TestError> for ast::Error {
2339 fn eq(&self, other: &TestError) -> bool {
2340 self.span == other.span && self.kind == other.kind
2341 }
2342 }
2343
2344 fn s(str: &str) -> String {
2345 str.to_string()
2346 }
2347
2348 fn parser(pattern: &str) -> ParserI<'_, Parser> {
2349 ParserI::new(Parser::new(), pattern)
2350 }
2351
2352 fn parser_octal(pattern: &str) -> ParserI<'_, Parser> {
2353 let parser = ParserBuilder::new().octal(true).build();
2354 ParserI::new(parser, pattern)
2355 }
2356
2357 fn parser_nest_limit(
2358 pattern: &str,
2359 nest_limit: u32,
2360 ) -> ParserI<'_, Parser> {
2361 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2362 ParserI::new(p, pattern)
2363 }
2364
2365 fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> {
2366 let p = ParserBuilder::new().ignore_whitespace(true).build();
2367 ParserI::new(p, pattern)
2368 }
2369
2370 /// Short alias for creating a new span.
2371 fn nspan(start: Position, end: Position) -> Span {
2372 Span::new(start, end)
2373 }
2374
2375 /// Short alias for creating a new position.
2376 fn npos(offset: usize, line: usize, column: usize) -> Position {
2377 Position::new(offset, line, column)
2378 }
2379
2380 /// Create a new span from the given offset range. This assumes a single
2381 /// line and sets the columns based on the offsets. i.e., This only works
2382 /// out of the box for ASCII, which is fine for most tests.
2383 fn span(range: Range<usize>) -> Span {
2384 let start = Position::new(range.start, 1, range.start + 1);
2385 let end = Position::new(range.end, 1, range.end + 1);
2386 Span::new(start, end)
2387 }
2388
2389 /// Create a new span for the corresponding byte range in the given string.
2390 fn span_range(subject: &str, range: Range<usize>) -> Span {
2391 let start = Position {
2392 offset: range.start,
2393 line: 1 + subject[..range.start].matches('\n').count(),
2394 column: 1 + subject[..range.start]
2395 .chars()
2396 .rev()
2397 .position(|c| c == '\n')
2398 .unwrap_or(subject[..range.start].chars().count()),
2399 };
2400 let end = Position {
2401 offset: range.end,
2402 line: 1 + subject[..range.end].matches('\n').count(),
2403 column: 1 + subject[..range.end]
2404 .chars()
2405 .rev()
2406 .position(|c| c == '\n')
2407 .unwrap_or(subject[..range.end].chars().count()),
2408 };
2409 Span::new(start, end)
2410 }
2411
2412 /// Create a verbatim literal starting at the given position.
2413 fn lit(c: char, start: usize) -> Ast {
2414 lit_with(c, span(start..start + c.len_utf8()))
2415 }
2416
2417 /// Create a punctuation literal starting at the given position.
2418 fn punct_lit(c: char, span: Span) -> Ast {
2419 Ast::Literal(ast::Literal {
2420 span,
2421 kind: ast::LiteralKind::Punctuation,
2422 c,
2423 })
2424 }
2425
2426 /// Create a verbatim literal with the given span.
2427 fn lit_with(c: char, span: Span) -> Ast {
2428 Ast::Literal(ast::Literal {
2429 span,
2430 kind: ast::LiteralKind::Verbatim,
2431 c,
2432 })
2433 }
2434
2435 /// Create a concatenation with the given range.
2436 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2437 concat_with(span(range), asts)
2438 }
2439
2440 /// Create a concatenation with the given span.
2441 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2442 Ast::Concat(ast::Concat { span, asts })
2443 }
2444
2445 /// Create an alternation with the given span.
2446 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2447 Ast::Alternation(ast::Alternation { span: span(range), asts })
2448 }
2449
2450 /// Create a capturing group with the given span.
2451 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2452 Ast::Group(ast::Group {
2453 span: span(range),
2454 kind: ast::GroupKind::CaptureIndex(index),
2455 ast: Box::new(ast),
2456 })
2457 }
2458
2459 /// Create an ast::SetFlags.
2460 ///
2461 /// The given pattern should be the full pattern string. The range given
2462 /// should correspond to the byte offsets where the flag set occurs.
2463 ///
2464 /// If negated is true, then the set is interpreted as beginning with a
2465 /// negation.
2466 fn flag_set(
2467 pat: &str,
2468 range: Range<usize>,
2469 flag: ast::Flag,
2470 negated: bool,
2471 ) -> Ast {
2472 let mut items = vec![ast::FlagsItem {
2473 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2474 kind: ast::FlagsItemKind::Flag(flag),
2475 }];
2476 if negated {
2477 items.insert(
2478 0,
2479 ast::FlagsItem {
2480 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2481 kind: ast::FlagsItemKind::Negation,
2482 },
2483 );
2484 }
2485 Ast::Flags(ast::SetFlags {
2486 span: span_range(pat, range.clone()),
2487 flags: ast::Flags {
2488 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2489 items,
2490 },
2491 })
2492 }
2493
2494 #[test]
2495 fn parse_nest_limit() {
2496 // A nest limit of 0 still allows some types of regexes.
2497 assert_eq!(
2498 parser_nest_limit("", 0).parse(),
2499 Ok(Ast::Empty(span(0..0)))
2500 );
2501 assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
2502
2503 // Test repetition operations, which require one level of nesting.
2504 assert_eq!(
2505 parser_nest_limit("a+", 0).parse().unwrap_err(),
2506 TestError {
2507 span: span(0..2),
2508 kind: ast::ErrorKind::NestLimitExceeded(0),
2509 }
2510 );
2511 assert_eq!(
2512 parser_nest_limit("a+", 1).parse(),
2513 Ok(Ast::Repetition(ast::Repetition {
2514 span: span(0..2),
2515 op: ast::RepetitionOp {
2516 span: span(1..2),
2517 kind: ast::RepetitionKind::OneOrMore,
2518 },
2519 greedy: true,
2520 ast: Box::new(lit('a', 0)),
2521 }))
2522 );
2523 assert_eq!(
2524 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2525 TestError {
2526 span: span(0..3),
2527 kind: ast::ErrorKind::NestLimitExceeded(1),
2528 }
2529 );
2530 assert_eq!(
2531 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2532 TestError {
2533 span: span(0..2),
2534 kind: ast::ErrorKind::NestLimitExceeded(1),
2535 }
2536 );
2537 assert_eq!(
2538 parser_nest_limit("a+*", 2).parse(),
2539 Ok(Ast::Repetition(ast::Repetition {
2540 span: span(0..3),
2541 op: ast::RepetitionOp {
2542 span: span(2..3),
2543 kind: ast::RepetitionKind::ZeroOrMore,
2544 },
2545 greedy: true,
2546 ast: Box::new(Ast::Repetition(ast::Repetition {
2547 span: span(0..2),
2548 op: ast::RepetitionOp {
2549 span: span(1..2),
2550 kind: ast::RepetitionKind::OneOrMore,
2551 },
2552 greedy: true,
2553 ast: Box::new(lit('a', 0)),
2554 })),
2555 }))
2556 );
2557
2558 // Test concatenations. A concatenation requires one level of nesting.
2559 assert_eq!(
2560 parser_nest_limit("ab", 0).parse().unwrap_err(),
2561 TestError {
2562 span: span(0..2),
2563 kind: ast::ErrorKind::NestLimitExceeded(0),
2564 }
2565 );
2566 assert_eq!(
2567 parser_nest_limit("ab", 1).parse(),
2568 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
2569 );
2570 assert_eq!(
2571 parser_nest_limit("abc", 1).parse(),
2572 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
2573 );
2574
2575 // Test alternations. An alternation requires one level of nesting.
2576 assert_eq!(
2577 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2578 TestError {
2579 span: span(0..3),
2580 kind: ast::ErrorKind::NestLimitExceeded(0),
2581 }
2582 );
2583 assert_eq!(
2584 parser_nest_limit("a|b", 1).parse(),
2585 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
2586 );
2587 assert_eq!(
2588 parser_nest_limit("a|b|c", 1).parse(),
2589 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
2590 );
2591
2592 // Test character classes. Classes form their own mini-recursive
2593 // syntax!
2594 assert_eq!(
2595 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2596 TestError {
2597 span: span(0..3),
2598 kind: ast::ErrorKind::NestLimitExceeded(0),
2599 }
2600 );
2601 assert_eq!(
2602 parser_nest_limit("[a]", 1).parse(),
2603 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
2604 span: span(0..3),
2605 negated: false,
2606 kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
2607 ast::Literal {
2608 span: span(1..2),
2609 kind: ast::LiteralKind::Verbatim,
2610 c: 'a',
2611 }
2612 )),
2613 })))
2614 );
2615 assert_eq!(
2616 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2617 TestError {
2618 span: span(1..3),
2619 kind: ast::ErrorKind::NestLimitExceeded(1),
2620 }
2621 );
2622 assert_eq!(
2623 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2624 TestError {
2625 span: span(3..7),
2626 kind: ast::ErrorKind::NestLimitExceeded(2),
2627 }
2628 );
2629 assert_eq!(
2630 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2631 TestError {
2632 span: span(4..6),
2633 kind: ast::ErrorKind::NestLimitExceeded(3),
2634 }
2635 );
2636 assert_eq!(
2637 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2638 TestError {
2639 span: span(1..5),
2640 kind: ast::ErrorKind::NestLimitExceeded(1),
2641 }
2642 );
2643 assert_eq!(
2644 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2645 TestError {
2646 span: span(4..6),
2647 kind: ast::ErrorKind::NestLimitExceeded(2),
2648 }
2649 );
2650 }
2651
2652 #[test]
2653 fn parse_comments() {
2654 let pat = "(?x)
2655# This is comment 1.
2656foo # This is comment 2.
2657 # This is comment 3.
2658bar
2659# This is comment 4.";
2660 let astc = parser(pat).parse_with_comments().unwrap();
2661 assert_eq!(
2662 astc.ast,
2663 concat_with(
2664 span_range(pat, 0..pat.len()),
2665 vec![
2666 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2667 lit_with('f', span_range(pat, 26..27)),
2668 lit_with('o', span_range(pat, 27..28)),
2669 lit_with('o', span_range(pat, 28..29)),
2670 lit_with('b', span_range(pat, 74..75)),
2671 lit_with('a', span_range(pat, 75..76)),
2672 lit_with('r', span_range(pat, 76..77)),
2673 ]
2674 )
2675 );
2676 assert_eq!(
2677 astc.comments,
2678 vec![
2679 ast::Comment {
2680 span: span_range(pat, 5..26),
2681 comment: s(" This is comment 1."),
2682 },
2683 ast::Comment {
2684 span: span_range(pat, 30..51),
2685 comment: s(" This is comment 2."),
2686 },
2687 ast::Comment {
2688 span: span_range(pat, 53..74),
2689 comment: s(" This is comment 3."),
2690 },
2691 ast::Comment {
2692 span: span_range(pat, 78..98),
2693 comment: s(" This is comment 4."),
2694 },
2695 ]
2696 );
2697 }
2698
2699 #[test]
2700 fn parse_holistic() {
2701 assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
2702 assert_eq!(
2703 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2704 Ok(concat(
2705 0..36,
2706 vec![
2707 punct_lit('\\', span(0..2)),
2708 punct_lit('.', span(2..4)),
2709 punct_lit('+', span(4..6)),
2710 punct_lit('*', span(6..8)),
2711 punct_lit('?', span(8..10)),
2712 punct_lit('(', span(10..12)),
2713 punct_lit(')', span(12..14)),
2714 punct_lit('|', span(14..16)),
2715 punct_lit('[', span(16..18)),
2716 punct_lit(']', span(18..20)),
2717 punct_lit('{', span(20..22)),
2718 punct_lit('}', span(22..24)),
2719 punct_lit('^', span(24..26)),
2720 punct_lit('$', span(26..28)),
2721 punct_lit('#', span(28..30)),
2722 punct_lit('&', span(30..32)),
2723 punct_lit('-', span(32..34)),
2724 punct_lit('~', span(34..36)),
2725 ]
2726 ))
2727 );
2728 }
2729
2730 #[test]
2731 fn parse_ignore_whitespace() {
2732 // Test that basic whitespace insensitivity works.
2733 let pat = "(?x)a b";
2734 assert_eq!(
2735 parser(pat).parse(),
2736 Ok(concat_with(
2737 nspan(npos(0, 1, 1), npos(7, 1, 8)),
2738 vec![
2739 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2740 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2741 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2742 ]
2743 ))
2744 );
2745
2746 // Test that we can toggle whitespace insensitivity.
2747 let pat = "(?x)a b(?-x)a b";
2748 assert_eq!(
2749 parser(pat).parse(),
2750 Ok(concat_with(
2751 nspan(npos(0, 1, 1), npos(15, 1, 16)),
2752 vec![
2753 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2754 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2755 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2756 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2757 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2758 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2759 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2760 ]
2761 ))
2762 );
2763
2764 // Test that nesting whitespace insensitive flags works.
2765 let pat = "a (?x:a )a ";
2766 assert_eq!(
2767 parser(pat).parse(),
2768 Ok(concat_with(
2769 span_range(pat, 0..11),
2770 vec![
2771 lit_with('a', span_range(pat, 0..1)),
2772 lit_with(' ', span_range(pat, 1..2)),
2773 Ast::Group(ast::Group {
2774 span: span_range(pat, 2..9),
2775 kind: ast::GroupKind::NonCapturing(ast::Flags {
2776 span: span_range(pat, 4..5),
2777 items: vec![ast::FlagsItem {
2778 span: span_range(pat, 4..5),
2779 kind: ast::FlagsItemKind::Flag(
2780 ast::Flag::IgnoreWhitespace
2781 ),
2782 },],
2783 }),
2784 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2785 }),
2786 lit_with('a', span_range(pat, 9..10)),
2787 lit_with(' ', span_range(pat, 10..11)),
2788 ]
2789 ))
2790 );
2791
2792 // Test that whitespace after an opening paren is insignificant.
2793 let pat = "(?x)( ?P<foo> a )";
2794 assert_eq!(
2795 parser(pat).parse(),
2796 Ok(concat_with(
2797 span_range(pat, 0..pat.len()),
2798 vec![
2799 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2800 Ast::Group(ast::Group {
2801 span: span_range(pat, 4..pat.len()),
2802 kind: ast::GroupKind::CaptureName(ast::CaptureName {
2803 span: span_range(pat, 9..12),
2804 name: s("foo"),
2805 index: 1,
2806 }),
2807 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2808 }),
2809 ]
2810 ))
2811 );
2812 let pat = "(?x)( a )";
2813 assert_eq!(
2814 parser(pat).parse(),
2815 Ok(concat_with(
2816 span_range(pat, 0..pat.len()),
2817 vec![
2818 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2819 Ast::Group(ast::Group {
2820 span: span_range(pat, 4..pat.len()),
2821 kind: ast::GroupKind::CaptureIndex(1),
2822 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2823 }),
2824 ]
2825 ))
2826 );
2827 let pat = "(?x)( ?: a )";
2828 assert_eq!(
2829 parser(pat).parse(),
2830 Ok(concat_with(
2831 span_range(pat, 0..pat.len()),
2832 vec![
2833 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2834 Ast::Group(ast::Group {
2835 span: span_range(pat, 4..pat.len()),
2836 kind: ast::GroupKind::NonCapturing(ast::Flags {
2837 span: span_range(pat, 8..8),
2838 items: vec![],
2839 }),
2840 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2841 }),
2842 ]
2843 ))
2844 );
2845 let pat = r"(?x)\x { 53 }";
2846 assert_eq!(
2847 parser(pat).parse(),
2848 Ok(concat_with(
2849 span_range(pat, 0..pat.len()),
2850 vec![
2851 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2852 Ast::Literal(ast::Literal {
2853 span: span(4..13),
2854 kind: ast::LiteralKind::HexBrace(
2855 ast::HexLiteralKind::X
2856 ),
2857 c: 'S',
2858 }),
2859 ]
2860 ))
2861 );
2862
2863 // Test that whitespace after an escape is OK.
2864 let pat = r"(?x)\ ";
2865 assert_eq!(
2866 parser(pat).parse(),
2867 Ok(concat_with(
2868 span_range(pat, 0..pat.len()),
2869 vec![
2870 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2871 Ast::Literal(ast::Literal {
2872 span: span_range(pat, 4..6),
2873 kind: ast::LiteralKind::Special(
2874 ast::SpecialLiteralKind::Space
2875 ),
2876 c: ' ',
2877 }),
2878 ]
2879 ))
2880 );
2881 // ... but only when `x` mode is enabled.
2882 let pat = r"\ ";
2883 assert_eq!(
2884 parser(pat).parse().unwrap_err(),
2885 TestError {
2886 span: span_range(pat, 0..2),
2887 kind: ast::ErrorKind::EscapeUnrecognized,
2888 }
2889 );
2890 }
2891
2892 #[test]
2893 fn parse_newlines() {
2894 let pat = ".\n.";
2895 assert_eq!(
2896 parser(pat).parse(),
2897 Ok(concat_with(
2898 span_range(pat, 0..3),
2899 vec![
2900 Ast::Dot(span_range(pat, 0..1)),
2901 lit_with('\n', span_range(pat, 1..2)),
2902 Ast::Dot(span_range(pat, 2..3)),
2903 ]
2904 ))
2905 );
2906
2907 let pat = "foobar\nbaz\nquux\n";
2908 assert_eq!(
2909 parser(pat).parse(),
2910 Ok(concat_with(
2911 span_range(pat, 0..pat.len()),
2912 vec![
2913 lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
2914 lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
2915 lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
2916 lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
2917 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2918 lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
2919 lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
2920 lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
2921 lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
2922 lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
2923 lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
2924 lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
2925 lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
2926 lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
2927 lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
2928 lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
2929 ]
2930 ))
2931 );
2932 }
2933
2934 #[test]
2935 fn parse_uncounted_repetition() {
2936 assert_eq!(
2937 parser(r"a*").parse(),
2938 Ok(Ast::Repetition(ast::Repetition {
2939 span: span(0..2),
2940 op: ast::RepetitionOp {
2941 span: span(1..2),
2942 kind: ast::RepetitionKind::ZeroOrMore,
2943 },
2944 greedy: true,
2945 ast: Box::new(lit('a', 0)),
2946 }))
2947 );
2948 assert_eq!(
2949 parser(r"a+").parse(),
2950 Ok(Ast::Repetition(ast::Repetition {
2951 span: span(0..2),
2952 op: ast::RepetitionOp {
2953 span: span(1..2),
2954 kind: ast::RepetitionKind::OneOrMore,
2955 },
2956 greedy: true,
2957 ast: Box::new(lit('a', 0)),
2958 }))
2959 );
2960
2961 assert_eq!(
2962 parser(r"a?").parse(),
2963 Ok(Ast::Repetition(ast::Repetition {
2964 span: span(0..2),
2965 op: ast::RepetitionOp {
2966 span: span(1..2),
2967 kind: ast::RepetitionKind::ZeroOrOne,
2968 },
2969 greedy: true,
2970 ast: Box::new(lit('a', 0)),
2971 }))
2972 );
2973 assert_eq!(
2974 parser(r"a??").parse(),
2975 Ok(Ast::Repetition(ast::Repetition {
2976 span: span(0..3),
2977 op: ast::RepetitionOp {
2978 span: span(1..3),
2979 kind: ast::RepetitionKind::ZeroOrOne,
2980 },
2981 greedy: false,
2982 ast: Box::new(lit('a', 0)),
2983 }))
2984 );
2985 assert_eq!(
2986 parser(r"a?").parse(),
2987 Ok(Ast::Repetition(ast::Repetition {
2988 span: span(0..2),
2989 op: ast::RepetitionOp {
2990 span: span(1..2),
2991 kind: ast::RepetitionKind::ZeroOrOne,
2992 },
2993 greedy: true,
2994 ast: Box::new(lit('a', 0)),
2995 }))
2996 );
2997 assert_eq!(
2998 parser(r"a?b").parse(),
2999 Ok(concat(
3000 0..3,
3001 vec![
3002 Ast::Repetition(ast::Repetition {
3003 span: span(0..2),
3004 op: ast::RepetitionOp {
3005 span: span(1..2),
3006 kind: ast::RepetitionKind::ZeroOrOne,
3007 },
3008 greedy: true,
3009 ast: Box::new(lit('a', 0)),
3010 }),
3011 lit('b', 2),
3012 ]
3013 ))
3014 );
3015 assert_eq!(
3016 parser(r"a??b").parse(),
3017 Ok(concat(
3018 0..4,
3019 vec![
3020 Ast::Repetition(ast::Repetition {
3021 span: span(0..3),
3022 op: ast::RepetitionOp {
3023 span: span(1..3),
3024 kind: ast::RepetitionKind::ZeroOrOne,
3025 },
3026 greedy: false,
3027 ast: Box::new(lit('a', 0)),
3028 }),
3029 lit('b', 3),
3030 ]
3031 ))
3032 );
3033 assert_eq!(
3034 parser(r"ab?").parse(),
3035 Ok(concat(
3036 0..3,
3037 vec![
3038 lit('a', 0),
3039 Ast::Repetition(ast::Repetition {
3040 span: span(1..3),
3041 op: ast::RepetitionOp {
3042 span: span(2..3),
3043 kind: ast::RepetitionKind::ZeroOrOne,
3044 },
3045 greedy: true,
3046 ast: Box::new(lit('b', 1)),
3047 }),
3048 ]
3049 ))
3050 );
3051 assert_eq!(
3052 parser(r"(ab)?").parse(),
3053 Ok(Ast::Repetition(ast::Repetition {
3054 span: span(0..5),
3055 op: ast::RepetitionOp {
3056 span: span(4..5),
3057 kind: ast::RepetitionKind::ZeroOrOne,
3058 },
3059 greedy: true,
3060 ast: Box::new(group(
3061 0..4,
3062 1,
3063 concat(1..3, vec![lit('a', 1), lit('b', 2),])
3064 )),
3065 }))
3066 );
3067 assert_eq!(
3068 parser(r"|a?").parse(),
3069 Ok(alt(
3070 0..3,
3071 vec![
3072 Ast::Empty(span(0..0)),
3073 Ast::Repetition(ast::Repetition {
3074 span: span(1..3),
3075 op: ast::RepetitionOp {
3076 span: span(2..3),
3077 kind: ast::RepetitionKind::ZeroOrOne,
3078 },
3079 greedy: true,
3080 ast: Box::new(lit('a', 1)),
3081 }),
3082 ]
3083 ))
3084 );
3085
3086 assert_eq!(
3087 parser(r"*").parse().unwrap_err(),
3088 TestError {
3089 span: span(0..0),
3090 kind: ast::ErrorKind::RepetitionMissing,
3091 }
3092 );
3093 assert_eq!(
3094 parser(r"(?i)*").parse().unwrap_err(),
3095 TestError {
3096 span: span(4..4),
3097 kind: ast::ErrorKind::RepetitionMissing,
3098 }
3099 );
3100 assert_eq!(
3101 parser(r"(*)").parse().unwrap_err(),
3102 TestError {
3103 span: span(1..1),
3104 kind: ast::ErrorKind::RepetitionMissing,
3105 }
3106 );
3107 assert_eq!(
3108 parser(r"(?:?)").parse().unwrap_err(),
3109 TestError {
3110 span: span(3..3),
3111 kind: ast::ErrorKind::RepetitionMissing,
3112 }
3113 );
3114 assert_eq!(
3115 parser(r"+").parse().unwrap_err(),
3116 TestError {
3117 span: span(0..0),
3118 kind: ast::ErrorKind::RepetitionMissing,
3119 }
3120 );
3121 assert_eq!(
3122 parser(r"?").parse().unwrap_err(),
3123 TestError {
3124 span: span(0..0),
3125 kind: ast::ErrorKind::RepetitionMissing,
3126 }
3127 );
3128 assert_eq!(
3129 parser(r"(?)").parse().unwrap_err(),
3130 TestError {
3131 span: span(1..1),
3132 kind: ast::ErrorKind::RepetitionMissing,
3133 }
3134 );
3135 assert_eq!(
3136 parser(r"|*").parse().unwrap_err(),
3137 TestError {
3138 span: span(1..1),
3139 kind: ast::ErrorKind::RepetitionMissing,
3140 }
3141 );
3142 assert_eq!(
3143 parser(r"|+").parse().unwrap_err(),
3144 TestError {
3145 span: span(1..1),
3146 kind: ast::ErrorKind::RepetitionMissing,
3147 }
3148 );
3149 assert_eq!(
3150 parser(r"|?").parse().unwrap_err(),
3151 TestError {
3152 span: span(1..1),
3153 kind: ast::ErrorKind::RepetitionMissing,
3154 }
3155 );
3156 }
3157
3158 #[test]
3159 fn parse_counted_repetition() {
3160 assert_eq!(
3161 parser(r"a{5}").parse(),
3162 Ok(Ast::Repetition(ast::Repetition {
3163 span: span(0..4),
3164 op: ast::RepetitionOp {
3165 span: span(1..4),
3166 kind: ast::RepetitionKind::Range(
3167 ast::RepetitionRange::Exactly(5)
3168 ),
3169 },
3170 greedy: true,
3171 ast: Box::new(lit('a', 0)),
3172 }))
3173 );
3174 assert_eq!(
3175 parser(r"a{5,}").parse(),
3176 Ok(Ast::Repetition(ast::Repetition {
3177 span: span(0..5),
3178 op: ast::RepetitionOp {
3179 span: span(1..5),
3180 kind: ast::RepetitionKind::Range(
3181 ast::RepetitionRange::AtLeast(5)
3182 ),
3183 },
3184 greedy: true,
3185 ast: Box::new(lit('a', 0)),
3186 }))
3187 );
3188 assert_eq!(
3189 parser(r"a{5,9}").parse(),
3190 Ok(Ast::Repetition(ast::Repetition {
3191 span: span(0..6),
3192 op: ast::RepetitionOp {
3193 span: span(1..6),
3194 kind: ast::RepetitionKind::Range(
3195 ast::RepetitionRange::Bounded(5, 9)
3196 ),
3197 },
3198 greedy: true,
3199 ast: Box::new(lit('a', 0)),
3200 }))
3201 );
3202 assert_eq!(
3203 parser(r"a{5}?").parse(),
3204 Ok(Ast::Repetition(ast::Repetition {
3205 span: span(0..5),
3206 op: ast::RepetitionOp {
3207 span: span(1..5),
3208 kind: ast::RepetitionKind::Range(
3209 ast::RepetitionRange::Exactly(5)
3210 ),
3211 },
3212 greedy: false,
3213 ast: Box::new(lit('a', 0)),
3214 }))
3215 );
3216 assert_eq!(
3217 parser(r"ab{5}").parse(),
3218 Ok(concat(
3219 0..5,
3220 vec![
3221 lit('a', 0),
3222 Ast::Repetition(ast::Repetition {
3223 span: span(1..5),
3224 op: ast::RepetitionOp {
3225 span: span(2..5),
3226 kind: ast::RepetitionKind::Range(
3227 ast::RepetitionRange::Exactly(5)
3228 ),
3229 },
3230 greedy: true,
3231 ast: Box::new(lit('b', 1)),
3232 }),
3233 ]
3234 ))
3235 );
3236 assert_eq!(
3237 parser(r"ab{5}c").parse(),
3238 Ok(concat(
3239 0..6,
3240 vec![
3241 lit('a', 0),
3242 Ast::Repetition(ast::Repetition {
3243 span: span(1..5),
3244 op: ast::RepetitionOp {
3245 span: span(2..5),
3246 kind: ast::RepetitionKind::Range(
3247 ast::RepetitionRange::Exactly(5)
3248 ),
3249 },
3250 greedy: true,
3251 ast: Box::new(lit('b', 1)),
3252 }),
3253 lit('c', 5),
3254 ]
3255 ))
3256 );
3257
3258 assert_eq!(
3259 parser(r"a{ 5 }").parse(),
3260 Ok(Ast::Repetition(ast::Repetition {
3261 span: span(0..6),
3262 op: ast::RepetitionOp {
3263 span: span(1..6),
3264 kind: ast::RepetitionKind::Range(
3265 ast::RepetitionRange::Exactly(5)
3266 ),
3267 },
3268 greedy: true,
3269 ast: Box::new(lit('a', 0)),
3270 }))
3271 );
3272 assert_eq!(
3273 parser(r"a{ 5 , 9 }").parse(),
3274 Ok(Ast::Repetition(ast::Repetition {
3275 span: span(0..10),
3276 op: ast::RepetitionOp {
3277 span: span(1..10),
3278 kind: ast::RepetitionKind::Range(
3279 ast::RepetitionRange::Bounded(5, 9)
3280 ),
3281 },
3282 greedy: true,
3283 ast: Box::new(lit('a', 0)),
3284 }))
3285 );
3286 assert_eq!(
3287 parser_ignore_whitespace(r"a{5,9} ?").parse(),
3288 Ok(Ast::Repetition(ast::Repetition {
3289 span: span(0..8),
3290 op: ast::RepetitionOp {
3291 span: span(1..8),
3292 kind: ast::RepetitionKind::Range(
3293 ast::RepetitionRange::Bounded(5, 9)
3294 ),
3295 },
3296 greedy: false,
3297 ast: Box::new(lit('a', 0)),
3298 }))
3299 );
3300
3301 assert_eq!(
3302 parser(r"(?i){0}").parse().unwrap_err(),
3303 TestError {
3304 span: span(4..4),
3305 kind: ast::ErrorKind::RepetitionMissing,
3306 }
3307 );
3308 assert_eq!(
3309 parser(r"(?m){1,1}").parse().unwrap_err(),
3310 TestError {
3311 span: span(4..4),
3312 kind: ast::ErrorKind::RepetitionMissing,
3313 }
3314 );
3315 assert_eq!(
3316 parser(r"a{]}").parse().unwrap_err(),
3317 TestError {
3318 span: span(2..2),
3319 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3320 }
3321 );
3322 assert_eq!(
3323 parser(r"a{1,]}").parse().unwrap_err(),
3324 TestError {
3325 span: span(4..4),
3326 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3327 }
3328 );
3329 assert_eq!(
3330 parser(r"a{").parse().unwrap_err(),
3331 TestError {
3332 span: span(1..2),
3333 kind: ast::ErrorKind::RepetitionCountUnclosed,
3334 }
3335 );
3336 assert_eq!(
3337 parser(r"a{}").parse().unwrap_err(),
3338 TestError {
3339 span: span(2..2),
3340 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3341 }
3342 );
3343 assert_eq!(
3344 parser(r"a{a").parse().unwrap_err(),
3345 TestError {
3346 span: span(2..2),
3347 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3348 }
3349 );
3350 assert_eq!(
3351 parser(r"a{9999999999}").parse().unwrap_err(),
3352 TestError {
3353 span: span(2..12),
3354 kind: ast::ErrorKind::DecimalInvalid,
3355 }
3356 );
3357 assert_eq!(
3358 parser(r"a{9").parse().unwrap_err(),
3359 TestError {
3360 span: span(1..3),
3361 kind: ast::ErrorKind::RepetitionCountUnclosed,
3362 }
3363 );
3364 assert_eq!(
3365 parser(r"a{9,a").parse().unwrap_err(),
3366 TestError {
3367 span: span(4..4),
3368 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3369 }
3370 );
3371 assert_eq!(
3372 parser(r"a{9,9999999999}").parse().unwrap_err(),
3373 TestError {
3374 span: span(4..14),
3375 kind: ast::ErrorKind::DecimalInvalid,
3376 }
3377 );
3378 assert_eq!(
3379 parser(r"a{9,").parse().unwrap_err(),
3380 TestError {
3381 span: span(1..4),
3382 kind: ast::ErrorKind::RepetitionCountUnclosed,
3383 }
3384 );
3385 assert_eq!(
3386 parser(r"a{9,11").parse().unwrap_err(),
3387 TestError {
3388 span: span(1..6),
3389 kind: ast::ErrorKind::RepetitionCountUnclosed,
3390 }
3391 );
3392 assert_eq!(
3393 parser(r"a{2,1}").parse().unwrap_err(),
3394 TestError {
3395 span: span(1..6),
3396 kind: ast::ErrorKind::RepetitionCountInvalid,
3397 }
3398 );
3399 assert_eq!(
3400 parser(r"{5}").parse().unwrap_err(),
3401 TestError {
3402 span: span(0..0),
3403 kind: ast::ErrorKind::RepetitionMissing,
3404 }
3405 );
3406 assert_eq!(
3407 parser(r"|{5}").parse().unwrap_err(),
3408 TestError {
3409 span: span(1..1),
3410 kind: ast::ErrorKind::RepetitionMissing,
3411 }
3412 );
3413 }
3414
3415 #[test]
3416 fn parse_alternate() {
3417 assert_eq!(
3418 parser(r"a|b").parse(),
3419 Ok(Ast::Alternation(ast::Alternation {
3420 span: span(0..3),
3421 asts: vec![lit('a', 0), lit('b', 2)],
3422 }))
3423 );
3424 assert_eq!(
3425 parser(r"(a|b)").parse(),
3426 Ok(group(
3427 0..5,
3428 1,
3429 Ast::Alternation(ast::Alternation {
3430 span: span(1..4),
3431 asts: vec![lit('a', 1), lit('b', 3)],
3432 })
3433 ))
3434 );
3435
3436 assert_eq!(
3437 parser(r"a|b|c").parse(),
3438 Ok(Ast::Alternation(ast::Alternation {
3439 span: span(0..5),
3440 asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
3441 }))
3442 );
3443 assert_eq!(
3444 parser(r"ax|by|cz").parse(),
3445 Ok(Ast::Alternation(ast::Alternation {
3446 span: span(0..8),
3447 asts: vec![
3448 concat(0..2, vec![lit('a', 0), lit('x', 1)]),
3449 concat(3..5, vec![lit('b', 3), lit('y', 4)]),
3450 concat(6..8, vec![lit('c', 6), lit('z', 7)]),
3451 ],
3452 }))
3453 );
3454 assert_eq!(
3455 parser(r"(ax|by|cz)").parse(),
3456 Ok(group(
3457 0..10,
3458 1,
3459 Ast::Alternation(ast::Alternation {
3460 span: span(1..9),
3461 asts: vec![
3462 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3463 concat(4..6, vec![lit('b', 4), lit('y', 5)]),
3464 concat(7..9, vec![lit('c', 7), lit('z', 8)]),
3465 ],
3466 })
3467 ))
3468 );
3469 assert_eq!(
3470 parser(r"(ax|(by|(cz)))").parse(),
3471 Ok(group(
3472 0..14,
3473 1,
3474 alt(
3475 1..13,
3476 vec![
3477 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3478 group(
3479 4..13,
3480 2,
3481 alt(
3482 5..12,
3483 vec![
3484 concat(
3485 5..7,
3486 vec![lit('b', 5), lit('y', 6)]
3487 ),
3488 group(
3489 8..12,
3490 3,
3491 concat(
3492 9..11,
3493 vec![lit('c', 9), lit('z', 10),]
3494 )
3495 ),
3496 ]
3497 )
3498 ),
3499 ]
3500 )
3501 ))
3502 );
3503
3504 assert_eq!(
3505 parser(r"|").parse(),
3506 Ok(alt(
3507 0..1,
3508 vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),]
3509 ))
3510 );
3511 assert_eq!(
3512 parser(r"||").parse(),
3513 Ok(alt(
3514 0..2,
3515 vec![
3516 Ast::Empty(span(0..0)),
3517 Ast::Empty(span(1..1)),
3518 Ast::Empty(span(2..2)),
3519 ]
3520 ))
3521 );
3522 assert_eq!(
3523 parser(r"a|").parse(),
3524 Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),]))
3525 );
3526 assert_eq!(
3527 parser(r"|a").parse(),
3528 Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),]))
3529 );
3530
3531 assert_eq!(
3532 parser(r"(|)").parse(),
3533 Ok(group(
3534 0..3,
3535 1,
3536 alt(
3537 1..2,
3538 vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),]
3539 )
3540 ))
3541 );
3542 assert_eq!(
3543 parser(r"(a|)").parse(),
3544 Ok(group(
3545 0..4,
3546 1,
3547 alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),])
3548 ))
3549 );
3550 assert_eq!(
3551 parser(r"(|a)").parse(),
3552 Ok(group(
3553 0..4,
3554 1,
3555 alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),])
3556 ))
3557 );
3558
3559 assert_eq!(
3560 parser(r"a|b)").parse().unwrap_err(),
3561 TestError {
3562 span: span(3..4),
3563 kind: ast::ErrorKind::GroupUnopened,
3564 }
3565 );
3566 assert_eq!(
3567 parser(r"(a|b").parse().unwrap_err(),
3568 TestError {
3569 span: span(0..1),
3570 kind: ast::ErrorKind::GroupUnclosed,
3571 }
3572 );
3573 }
3574
3575 #[test]
3576 fn parse_unsupported_lookaround() {
3577 assert_eq!(
3578 parser(r"(?=a)").parse().unwrap_err(),
3579 TestError {
3580 span: span(0..3),
3581 kind: ast::ErrorKind::UnsupportedLookAround,
3582 }
3583 );
3584 assert_eq!(
3585 parser(r"(?!a)").parse().unwrap_err(),
3586 TestError {
3587 span: span(0..3),
3588 kind: ast::ErrorKind::UnsupportedLookAround,
3589 }
3590 );
3591 assert_eq!(
3592 parser(r"(?<=a)").parse().unwrap_err(),
3593 TestError {
3594 span: span(0..4),
3595 kind: ast::ErrorKind::UnsupportedLookAround,
3596 }
3597 );
3598 assert_eq!(
3599 parser(r"(?<!a)").parse().unwrap_err(),
3600 TestError {
3601 span: span(0..4),
3602 kind: ast::ErrorKind::UnsupportedLookAround,
3603 }
3604 );
3605 }
3606
3607 #[test]
3608 fn parse_group() {
3609 assert_eq!(
3610 parser("(?i)").parse(),
3611 Ok(Ast::Flags(ast::SetFlags {
3612 span: span(0..4),
3613 flags: ast::Flags {
3614 span: span(2..3),
3615 items: vec![ast::FlagsItem {
3616 span: span(2..3),
3617 kind: ast::FlagsItemKind::Flag(
3618 ast::Flag::CaseInsensitive
3619 ),
3620 }],
3621 },
3622 }))
3623 );
3624 assert_eq!(
3625 parser("(?iU)").parse(),
3626 Ok(Ast::Flags(ast::SetFlags {
3627 span: span(0..5),
3628 flags: ast::Flags {
3629 span: span(2..4),
3630 items: vec![
3631 ast::FlagsItem {
3632 span: span(2..3),
3633 kind: ast::FlagsItemKind::Flag(
3634 ast::Flag::CaseInsensitive
3635 ),
3636 },
3637 ast::FlagsItem {
3638 span: span(3..4),
3639 kind: ast::FlagsItemKind::Flag(
3640 ast::Flag::SwapGreed
3641 ),
3642 },
3643 ],
3644 },
3645 }))
3646 );
3647 assert_eq!(
3648 parser("(?i-U)").parse(),
3649 Ok(Ast::Flags(ast::SetFlags {
3650 span: span(0..6),
3651 flags: ast::Flags {
3652 span: span(2..5),
3653 items: vec![
3654 ast::FlagsItem {
3655 span: span(2..3),
3656 kind: ast::FlagsItemKind::Flag(
3657 ast::Flag::CaseInsensitive
3658 ),
3659 },
3660 ast::FlagsItem {
3661 span: span(3..4),
3662 kind: ast::FlagsItemKind::Negation,
3663 },
3664 ast::FlagsItem {
3665 span: span(4..5),
3666 kind: ast::FlagsItemKind::Flag(
3667 ast::Flag::SwapGreed
3668 ),
3669 },
3670 ],
3671 },
3672 }))
3673 );
3674
3675 assert_eq!(
3676 parser("()").parse(),
3677 Ok(Ast::Group(ast::Group {
3678 span: span(0..2),
3679 kind: ast::GroupKind::CaptureIndex(1),
3680 ast: Box::new(Ast::Empty(span(1..1))),
3681 }))
3682 );
3683 assert_eq!(
3684 parser("(a)").parse(),
3685 Ok(Ast::Group(ast::Group {
3686 span: span(0..3),
3687 kind: ast::GroupKind::CaptureIndex(1),
3688 ast: Box::new(lit('a', 1)),
3689 }))
3690 );
3691 assert_eq!(
3692 parser("(())").parse(),
3693 Ok(Ast::Group(ast::Group {
3694 span: span(0..4),
3695 kind: ast::GroupKind::CaptureIndex(1),
3696 ast: Box::new(Ast::Group(ast::Group {
3697 span: span(1..3),
3698 kind: ast::GroupKind::CaptureIndex(2),
3699 ast: Box::new(Ast::Empty(span(2..2))),
3700 })),
3701 }))
3702 );
3703
3704 assert_eq!(
3705 parser("(?:a)").parse(),
3706 Ok(Ast::Group(ast::Group {
3707 span: span(0..5),
3708 kind: ast::GroupKind::NonCapturing(ast::Flags {
3709 span: span(2..2),
3710 items: vec![],
3711 }),
3712 ast: Box::new(lit('a', 3)),
3713 }))
3714 );
3715
3716 assert_eq!(
3717 parser("(?i:a)").parse(),
3718 Ok(Ast::Group(ast::Group {
3719 span: span(0..6),
3720 kind: ast::GroupKind::NonCapturing(ast::Flags {
3721 span: span(2..3),
3722 items: vec![ast::FlagsItem {
3723 span: span(2..3),
3724 kind: ast::FlagsItemKind::Flag(
3725 ast::Flag::CaseInsensitive
3726 ),
3727 },],
3728 }),
3729 ast: Box::new(lit('a', 4)),
3730 }))
3731 );
3732 assert_eq!(
3733 parser("(?i-U:a)").parse(),
3734 Ok(Ast::Group(ast::Group {
3735 span: span(0..8),
3736 kind: ast::GroupKind::NonCapturing(ast::Flags {
3737 span: span(2..5),
3738 items: vec![
3739 ast::FlagsItem {
3740 span: span(2..3),
3741 kind: ast::FlagsItemKind::Flag(
3742 ast::Flag::CaseInsensitive
3743 ),
3744 },
3745 ast::FlagsItem {
3746 span: span(3..4),
3747 kind: ast::FlagsItemKind::Negation,
3748 },
3749 ast::FlagsItem {
3750 span: span(4..5),
3751 kind: ast::FlagsItemKind::Flag(
3752 ast::Flag::SwapGreed
3753 ),
3754 },
3755 ],
3756 }),
3757 ast: Box::new(lit('a', 6)),
3758 }))
3759 );
3760
3761 assert_eq!(
3762 parser("(").parse().unwrap_err(),
3763 TestError {
3764 span: span(0..1),
3765 kind: ast::ErrorKind::GroupUnclosed,
3766 }
3767 );
3768 assert_eq!(
3769 parser("(?").parse().unwrap_err(),
3770 TestError {
3771 span: span(0..1),
3772 kind: ast::ErrorKind::GroupUnclosed,
3773 }
3774 );
3775 assert_eq!(
3776 parser("(?P").parse().unwrap_err(),
3777 TestError {
3778 span: span(2..3),
3779 kind: ast::ErrorKind::FlagUnrecognized,
3780 }
3781 );
3782 assert_eq!(
3783 parser("(?P<").parse().unwrap_err(),
3784 TestError {
3785 span: span(4..4),
3786 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3787 }
3788 );
3789 assert_eq!(
3790 parser("(a").parse().unwrap_err(),
3791 TestError {
3792 span: span(0..1),
3793 kind: ast::ErrorKind::GroupUnclosed,
3794 }
3795 );
3796 assert_eq!(
3797 parser("(()").parse().unwrap_err(),
3798 TestError {
3799 span: span(0..1),
3800 kind: ast::ErrorKind::GroupUnclosed,
3801 }
3802 );
3803 assert_eq!(
3804 parser(")").parse().unwrap_err(),
3805 TestError {
3806 span: span(0..1),
3807 kind: ast::ErrorKind::GroupUnopened,
3808 }
3809 );
3810 assert_eq!(
3811 parser("a)").parse().unwrap_err(),
3812 TestError {
3813 span: span(1..2),
3814 kind: ast::ErrorKind::GroupUnopened,
3815 }
3816 );
3817 }
3818
3819 #[test]
3820 fn parse_capture_name() {
3821 assert_eq!(
3822 parser("(?P<a>z)").parse(),
3823 Ok(Ast::Group(ast::Group {
3824 span: span(0..8),
3825 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3826 span: span(4..5),
3827 name: s("a"),
3828 index: 1,
3829 }),
3830 ast: Box::new(lit('z', 6)),
3831 }))
3832 );
3833 assert_eq!(
3834 parser("(?P<abc>z)").parse(),
3835 Ok(Ast::Group(ast::Group {
3836 span: span(0..10),
3837 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3838 span: span(4..7),
3839 name: s("abc"),
3840 index: 1,
3841 }),
3842 ast: Box::new(lit('z', 8)),
3843 }))
3844 );
3845
3846 assert_eq!(
3847 parser("(?P<a_1>z)").parse(),
3848 Ok(Ast::Group(ast::Group {
3849 span: span(0..10),
3850 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3851 span: span(4..7),
3852 name: s("a_1"),
3853 index: 1,
3854 }),
3855 ast: Box::new(lit('z', 8)),
3856 }))
3857 );
3858
3859 assert_eq!(
3860 parser("(?P<a.1>z)").parse(),
3861 Ok(Ast::Group(ast::Group {
3862 span: span(0..10),
3863 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3864 span: span(4..7),
3865 name: s("a.1"),
3866 index: 1,
3867 }),
3868 ast: Box::new(lit('z', 8)),
3869 }))
3870 );
3871
3872 assert_eq!(
3873 parser("(?P<a[1]>z)").parse(),
3874 Ok(Ast::Group(ast::Group {
3875 span: span(0..11),
3876 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3877 span: span(4..8),
3878 name: s("a[1]"),
3879 index: 1,
3880 }),
3881 ast: Box::new(lit('z', 9)),
3882 }))
3883 );
3884
3885 assert_eq!(
3886 parser("(?P<").parse().unwrap_err(),
3887 TestError {
3888 span: span(4..4),
3889 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3890 }
3891 );
3892 assert_eq!(
3893 parser("(?P<>z)").parse().unwrap_err(),
3894 TestError {
3895 span: span(4..4),
3896 kind: ast::ErrorKind::GroupNameEmpty,
3897 }
3898 );
3899 assert_eq!(
3900 parser("(?P<a").parse().unwrap_err(),
3901 TestError {
3902 span: span(5..5),
3903 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3904 }
3905 );
3906 assert_eq!(
3907 parser("(?P<ab").parse().unwrap_err(),
3908 TestError {
3909 span: span(6..6),
3910 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3911 }
3912 );
3913 assert_eq!(
3914 parser("(?P<0a").parse().unwrap_err(),
3915 TestError {
3916 span: span(4..5),
3917 kind: ast::ErrorKind::GroupNameInvalid,
3918 }
3919 );
3920 assert_eq!(
3921 parser("(?P<~").parse().unwrap_err(),
3922 TestError {
3923 span: span(4..5),
3924 kind: ast::ErrorKind::GroupNameInvalid,
3925 }
3926 );
3927 assert_eq!(
3928 parser("(?P<abc~").parse().unwrap_err(),
3929 TestError {
3930 span: span(7..8),
3931 kind: ast::ErrorKind::GroupNameInvalid,
3932 }
3933 );
3934 assert_eq!(
3935 parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(),
3936 TestError {
3937 span: span(12..13),
3938 kind: ast::ErrorKind::GroupNameDuplicate {
3939 original: span(4..5),
3940 },
3941 }
3942 );
3943 }
3944
3945 #[test]
3946 fn parse_flags() {
3947 assert_eq!(
3948 parser("i:").parse_flags(),
3949 Ok(ast::Flags {
3950 span: span(0..1),
3951 items: vec![ast::FlagsItem {
3952 span: span(0..1),
3953 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3954 }],
3955 })
3956 );
3957 assert_eq!(
3958 parser("i)").parse_flags(),
3959 Ok(ast::Flags {
3960 span: span(0..1),
3961 items: vec![ast::FlagsItem {
3962 span: span(0..1),
3963 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3964 }],
3965 })
3966 );
3967
3968 assert_eq!(
3969 parser("isU:").parse_flags(),
3970 Ok(ast::Flags {
3971 span: span(0..3),
3972 items: vec![
3973 ast::FlagsItem {
3974 span: span(0..1),
3975 kind: ast::FlagsItemKind::Flag(
3976 ast::Flag::CaseInsensitive
3977 ),
3978 },
3979 ast::FlagsItem {
3980 span: span(1..2),
3981 kind: ast::FlagsItemKind::Flag(
3982 ast::Flag::DotMatchesNewLine
3983 ),
3984 },
3985 ast::FlagsItem {
3986 span: span(2..3),
3987 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3988 },
3989 ],
3990 })
3991 );
3992
3993 assert_eq!(
3994 parser("-isU:").parse_flags(),
3995 Ok(ast::Flags {
3996 span: span(0..4),
3997 items: vec![
3998 ast::FlagsItem {
3999 span: span(0..1),
4000 kind: ast::FlagsItemKind::Negation,
4001 },
4002 ast::FlagsItem {
4003 span: span(1..2),
4004 kind: ast::FlagsItemKind::Flag(
4005 ast::Flag::CaseInsensitive
4006 ),
4007 },
4008 ast::FlagsItem {
4009 span: span(2..3),
4010 kind: ast::FlagsItemKind::Flag(
4011 ast::Flag::DotMatchesNewLine
4012 ),
4013 },
4014 ast::FlagsItem {
4015 span: span(3..4),
4016 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4017 },
4018 ],
4019 })
4020 );
4021 assert_eq!(
4022 parser("i-sU:").parse_flags(),
4023 Ok(ast::Flags {
4024 span: span(0..4),
4025 items: vec![
4026 ast::FlagsItem {
4027 span: span(0..1),
4028 kind: ast::FlagsItemKind::Flag(
4029 ast::Flag::CaseInsensitive
4030 ),
4031 },
4032 ast::FlagsItem {
4033 span: span(1..2),
4034 kind: ast::FlagsItemKind::Negation,
4035 },
4036 ast::FlagsItem {
4037 span: span(2..3),
4038 kind: ast::FlagsItemKind::Flag(
4039 ast::Flag::DotMatchesNewLine
4040 ),
4041 },
4042 ast::FlagsItem {
4043 span: span(3..4),
4044 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4045 },
4046 ],
4047 })
4048 );
4049
4050 assert_eq!(
4051 parser("isU").parse_flags().unwrap_err(),
4052 TestError {
4053 span: span(3..3),
4054 kind: ast::ErrorKind::FlagUnexpectedEof,
4055 }
4056 );
4057 assert_eq!(
4058 parser("isUa:").parse_flags().unwrap_err(),
4059 TestError {
4060 span: span(3..4),
4061 kind: ast::ErrorKind::FlagUnrecognized,
4062 }
4063 );
4064 assert_eq!(
4065 parser("isUi:").parse_flags().unwrap_err(),
4066 TestError {
4067 span: span(3..4),
4068 kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) },
4069 }
4070 );
4071 assert_eq!(
4072 parser("i-sU-i:").parse_flags().unwrap_err(),
4073 TestError {
4074 span: span(4..5),
4075 kind: ast::ErrorKind::FlagRepeatedNegation {
4076 original: span(1..2),
4077 },
4078 }
4079 );
4080 assert_eq!(
4081 parser("-)").parse_flags().unwrap_err(),
4082 TestError {
4083 span: span(0..1),
4084 kind: ast::ErrorKind::FlagDanglingNegation,
4085 }
4086 );
4087 assert_eq!(
4088 parser("i-)").parse_flags().unwrap_err(),
4089 TestError {
4090 span: span(1..2),
4091 kind: ast::ErrorKind::FlagDanglingNegation,
4092 }
4093 );
4094 assert_eq!(
4095 parser("iU-)").parse_flags().unwrap_err(),
4096 TestError {
4097 span: span(2..3),
4098 kind: ast::ErrorKind::FlagDanglingNegation,
4099 }
4100 );
4101 }
4102
4103 #[test]
4104 fn parse_flag() {
4105 assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
4106 assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine));
4107 assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
4108 assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
4109 assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
4110 assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
4111
4112 assert_eq!(
4113 parser("a").parse_flag().unwrap_err(),
4114 TestError {
4115 span: span(0..1),
4116 kind: ast::ErrorKind::FlagUnrecognized,
4117 }
4118 );
4119 assert_eq!(
4120 parser("☃").parse_flag().unwrap_err(),
4121 TestError {
4122 span: span_range("☃", 0..3),
4123 kind: ast::ErrorKind::FlagUnrecognized,
4124 }
4125 );
4126 }
4127
4128 #[test]
4129 fn parse_primitive_non_escape() {
4130 assert_eq!(
4131 parser(r".").parse_primitive(),
4132 Ok(Primitive::Dot(span(0..1)))
4133 );
4134 assert_eq!(
4135 parser(r"^").parse_primitive(),
4136 Ok(Primitive::Assertion(ast::Assertion {
4137 span: span(0..1),
4138 kind: ast::AssertionKind::StartLine,
4139 }))
4140 );
4141 assert_eq!(
4142 parser(r"$").parse_primitive(),
4143 Ok(Primitive::Assertion(ast::Assertion {
4144 span: span(0..1),
4145 kind: ast::AssertionKind::EndLine,
4146 }))
4147 );
4148
4149 assert_eq!(
4150 parser(r"a").parse_primitive(),
4151 Ok(Primitive::Literal(ast::Literal {
4152 span: span(0..1),
4153 kind: ast::LiteralKind::Verbatim,
4154 c: 'a',
4155 }))
4156 );
4157 assert_eq!(
4158 parser(r"|").parse_primitive(),
4159 Ok(Primitive::Literal(ast::Literal {
4160 span: span(0..1),
4161 kind: ast::LiteralKind::Verbatim,
4162 c: '|',
4163 }))
4164 );
4165 assert_eq!(
4166 parser(r"☃").parse_primitive(),
4167 Ok(Primitive::Literal(ast::Literal {
4168 span: span_range("☃", 0..3),
4169 kind: ast::LiteralKind::Verbatim,
4170 c: '☃',
4171 }))
4172 );
4173 }
4174
4175 #[test]
4176 fn parse_escape() {
4177 assert_eq!(
4178 parser(r"\|").parse_primitive(),
4179 Ok(Primitive::Literal(ast::Literal {
4180 span: span(0..2),
4181 kind: ast::LiteralKind::Punctuation,
4182 c: '|',
4183 }))
4184 );
4185 let specials = &[
4186 (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
4187 (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
4188 (r"\t", '\t', ast::SpecialLiteralKind::Tab),
4189 (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
4190 (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
4191 (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
4192 ];
4193 for &(pat, c, ref kind) in specials {
4194 assert_eq!(
4195 parser(pat).parse_primitive(),
4196 Ok(Primitive::Literal(ast::Literal {
4197 span: span(0..2),
4198 kind: ast::LiteralKind::Special(kind.clone()),
4199 c,
4200 }))
4201 );
4202 }
4203 assert_eq!(
4204 parser(r"\A").parse_primitive(),
4205 Ok(Primitive::Assertion(ast::Assertion {
4206 span: span(0..2),
4207 kind: ast::AssertionKind::StartText,
4208 }))
4209 );
4210 assert_eq!(
4211 parser(r"\z").parse_primitive(),
4212 Ok(Primitive::Assertion(ast::Assertion {
4213 span: span(0..2),
4214 kind: ast::AssertionKind::EndText,
4215 }))
4216 );
4217 assert_eq!(
4218 parser(r"\b").parse_primitive(),
4219 Ok(Primitive::Assertion(ast::Assertion {
4220 span: span(0..2),
4221 kind: ast::AssertionKind::WordBoundary,
4222 }))
4223 );
4224 assert_eq!(
4225 parser(r"\B").parse_primitive(),
4226 Ok(Primitive::Assertion(ast::Assertion {
4227 span: span(0..2),
4228 kind: ast::AssertionKind::NotWordBoundary,
4229 }))
4230 );
4231
4232 assert_eq!(
4233 parser(r"\").parse_escape().unwrap_err(),
4234 TestError {
4235 span: span(0..1),
4236 kind: ast::ErrorKind::EscapeUnexpectedEof,
4237 }
4238 );
4239 assert_eq!(
4240 parser(r"\y").parse_escape().unwrap_err(),
4241 TestError {
4242 span: span(0..2),
4243 kind: ast::ErrorKind::EscapeUnrecognized,
4244 }
4245 );
4246 }
4247
4248 #[test]
4249 fn parse_unsupported_backreference() {
4250 assert_eq!(
4251 parser(r"\0").parse_escape().unwrap_err(),
4252 TestError {
4253 span: span(0..2),
4254 kind: ast::ErrorKind::UnsupportedBackreference,
4255 }
4256 );
4257 assert_eq!(
4258 parser(r"\9").parse_escape().unwrap_err(),
4259 TestError {
4260 span: span(0..2),
4261 kind: ast::ErrorKind::UnsupportedBackreference,
4262 }
4263 );
4264 }
4265
4266 #[test]
4267 fn parse_octal() {
4268 for i in 0..511 {
4269 let pat = format!(r"\{:o}", i);
4270 assert_eq!(
4271 parser_octal(&pat).parse_escape(),
4272 Ok(Primitive::Literal(ast::Literal {
4273 span: span(0..pat.len()),
4274 kind: ast::LiteralKind::Octal,
4275 c: ::std::char::from_u32(i).unwrap(),
4276 }))
4277 );
4278 }
4279 assert_eq!(
4280 parser_octal(r"\778").parse_escape(),
4281 Ok(Primitive::Literal(ast::Literal {
4282 span: span(0..3),
4283 kind: ast::LiteralKind::Octal,
4284 c: '?',
4285 }))
4286 );
4287 assert_eq!(
4288 parser_octal(r"\7777").parse_escape(),
4289 Ok(Primitive::Literal(ast::Literal {
4290 span: span(0..4),
4291 kind: ast::LiteralKind::Octal,
4292 c: '\u{01FF}',
4293 }))
4294 );
4295 assert_eq!(
4296 parser_octal(r"\778").parse(),
4297 Ok(Ast::Concat(ast::Concat {
4298 span: span(0..4),
4299 asts: vec![
4300 Ast::Literal(ast::Literal {
4301 span: span(0..3),
4302 kind: ast::LiteralKind::Octal,
4303 c: '?',
4304 }),
4305 Ast::Literal(ast::Literal {
4306 span: span(3..4),
4307 kind: ast::LiteralKind::Verbatim,
4308 c: '8',
4309 }),
4310 ],
4311 }))
4312 );
4313 assert_eq!(
4314 parser_octal(r"\7777").parse(),
4315 Ok(Ast::Concat(ast::Concat {
4316 span: span(0..5),
4317 asts: vec![
4318 Ast::Literal(ast::Literal {
4319 span: span(0..4),
4320 kind: ast::LiteralKind::Octal,
4321 c: '\u{01FF}',
4322 }),
4323 Ast::Literal(ast::Literal {
4324 span: span(4..5),
4325 kind: ast::LiteralKind::Verbatim,
4326 c: '7',
4327 }),
4328 ],
4329 }))
4330 );
4331
4332 assert_eq!(
4333 parser_octal(r"\8").parse_escape().unwrap_err(),
4334 TestError {
4335 span: span(0..2),
4336 kind: ast::ErrorKind::EscapeUnrecognized,
4337 }
4338 );
4339 }
4340
4341 #[test]
4342 fn parse_hex_two() {
4343 for i in 0..256 {
4344 let pat = format!(r"\x{:02x}", i);
4345 assert_eq!(
4346 parser(&pat).parse_escape(),
4347 Ok(Primitive::Literal(ast::Literal {
4348 span: span(0..pat.len()),
4349 kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
4350 c: ::std::char::from_u32(i).unwrap(),
4351 }))
4352 );
4353 }
4354
4355 assert_eq!(
4356 parser(r"\xF").parse_escape().unwrap_err(),
4357 TestError {
4358 span: span(3..3),
4359 kind: ast::ErrorKind::EscapeUnexpectedEof,
4360 }
4361 );
4362 assert_eq!(
4363 parser(r"\xG").parse_escape().unwrap_err(),
4364 TestError {
4365 span: span(2..3),
4366 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4367 }
4368 );
4369 assert_eq!(
4370 parser(r"\xFG").parse_escape().unwrap_err(),
4371 TestError {
4372 span: span(3..4),
4373 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4374 }
4375 );
4376 }
4377
4378 #[test]
4379 fn parse_hex_four() {
4380 for i in 0..65536 {
4381 let c = match ::std::char::from_u32(i) {
4382 None => continue,
4383 Some(c) => c,
4384 };
4385 let pat = format!(r"\u{:04x}", i);
4386 assert_eq!(
4387 parser(&pat).parse_escape(),
4388 Ok(Primitive::Literal(ast::Literal {
4389 span: span(0..pat.len()),
4390 kind: ast::LiteralKind::HexFixed(
4391 ast::HexLiteralKind::UnicodeShort
4392 ),
4393 c,
4394 }))
4395 );
4396 }
4397
4398 assert_eq!(
4399 parser(r"\uF").parse_escape().unwrap_err(),
4400 TestError {
4401 span: span(3..3),
4402 kind: ast::ErrorKind::EscapeUnexpectedEof,
4403 }
4404 );
4405 assert_eq!(
4406 parser(r"\uG").parse_escape().unwrap_err(),
4407 TestError {
4408 span: span(2..3),
4409 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4410 }
4411 );
4412 assert_eq!(
4413 parser(r"\uFG").parse_escape().unwrap_err(),
4414 TestError {
4415 span: span(3..4),
4416 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4417 }
4418 );
4419 assert_eq!(
4420 parser(r"\uFFG").parse_escape().unwrap_err(),
4421 TestError {
4422 span: span(4..5),
4423 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4424 }
4425 );
4426 assert_eq!(
4427 parser(r"\uFFFG").parse_escape().unwrap_err(),
4428 TestError {
4429 span: span(5..6),
4430 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4431 }
4432 );
4433 assert_eq!(
4434 parser(r"\uD800").parse_escape().unwrap_err(),
4435 TestError {
4436 span: span(2..6),
4437 kind: ast::ErrorKind::EscapeHexInvalid,
4438 }
4439 );
4440 }
4441
4442 #[test]
4443 fn parse_hex_eight() {
4444 for i in 0..65536 {
4445 let c = match ::std::char::from_u32(i) {
4446 None => continue,
4447 Some(c) => c,
4448 };
4449 let pat = format!(r"\U{:08x}", i);
4450 assert_eq!(
4451 parser(&pat).parse_escape(),
4452 Ok(Primitive::Literal(ast::Literal {
4453 span: span(0..pat.len()),
4454 kind: ast::LiteralKind::HexFixed(
4455 ast::HexLiteralKind::UnicodeLong
4456 ),
4457 c,
4458 }))
4459 );
4460 }
4461
4462 assert_eq!(
4463 parser(r"\UF").parse_escape().unwrap_err(),
4464 TestError {
4465 span: span(3..3),
4466 kind: ast::ErrorKind::EscapeUnexpectedEof,
4467 }
4468 );
4469 assert_eq!(
4470 parser(r"\UG").parse_escape().unwrap_err(),
4471 TestError {
4472 span: span(2..3),
4473 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4474 }
4475 );
4476 assert_eq!(
4477 parser(r"\UFG").parse_escape().unwrap_err(),
4478 TestError {
4479 span: span(3..4),
4480 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4481 }
4482 );
4483 assert_eq!(
4484 parser(r"\UFFG").parse_escape().unwrap_err(),
4485 TestError {
4486 span: span(4..5),
4487 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4488 }
4489 );
4490 assert_eq!(
4491 parser(r"\UFFFG").parse_escape().unwrap_err(),
4492 TestError {
4493 span: span(5..6),
4494 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4495 }
4496 );
4497 assert_eq!(
4498 parser(r"\UFFFFG").parse_escape().unwrap_err(),
4499 TestError {
4500 span: span(6..7),
4501 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4502 }
4503 );
4504 assert_eq!(
4505 parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4506 TestError {
4507 span: span(7..8),
4508 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4509 }
4510 );
4511 assert_eq!(
4512 parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4513 TestError {
4514 span: span(8..9),
4515 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4516 }
4517 );
4518 assert_eq!(
4519 parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4520 TestError {
4521 span: span(9..10),
4522 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4523 }
4524 );
4525 }
4526
4527 #[test]
4528 fn parse_hex_brace() {
4529 assert_eq!(
4530 parser(r"\u{26c4}").parse_escape(),
4531 Ok(Primitive::Literal(ast::Literal {
4532 span: span(0..8),
4533 kind: ast::LiteralKind::HexBrace(
4534 ast::HexLiteralKind::UnicodeShort
4535 ),
4536 c: '⛄',
4537 }))
4538 );
4539 assert_eq!(
4540 parser(r"\U{26c4}").parse_escape(),
4541 Ok(Primitive::Literal(ast::Literal {
4542 span: span(0..8),
4543 kind: ast::LiteralKind::HexBrace(
4544 ast::HexLiteralKind::UnicodeLong
4545 ),
4546 c: '⛄',
4547 }))
4548 );
4549 assert_eq!(
4550 parser(r"\x{26c4}").parse_escape(),
4551 Ok(Primitive::Literal(ast::Literal {
4552 span: span(0..8),
4553 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4554 c: '⛄',
4555 }))
4556 );
4557 assert_eq!(
4558 parser(r"\x{26C4}").parse_escape(),
4559 Ok(Primitive::Literal(ast::Literal {
4560 span: span(0..8),
4561 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4562 c: '⛄',
4563 }))
4564 );
4565 assert_eq!(
4566 parser(r"\x{10fFfF}").parse_escape(),
4567 Ok(Primitive::Literal(ast::Literal {
4568 span: span(0..10),
4569 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4570 c: '\u{10FFFF}',
4571 }))
4572 );
4573
4574 assert_eq!(
4575 parser(r"\x").parse_escape().unwrap_err(),
4576 TestError {
4577 span: span(2..2),
4578 kind: ast::ErrorKind::EscapeUnexpectedEof,
4579 }
4580 );
4581 assert_eq!(
4582 parser(r"\x{").parse_escape().unwrap_err(),
4583 TestError {
4584 span: span(2..3),
4585 kind: ast::ErrorKind::EscapeUnexpectedEof,
4586 }
4587 );
4588 assert_eq!(
4589 parser(r"\x{FF").parse_escape().unwrap_err(),
4590 TestError {
4591 span: span(2..5),
4592 kind: ast::ErrorKind::EscapeUnexpectedEof,
4593 }
4594 );
4595 assert_eq!(
4596 parser(r"\x{}").parse_escape().unwrap_err(),
4597 TestError {
4598 span: span(2..4),
4599 kind: ast::ErrorKind::EscapeHexEmpty,
4600 }
4601 );
4602 assert_eq!(
4603 parser(r"\x{FGF}").parse_escape().unwrap_err(),
4604 TestError {
4605 span: span(4..5),
4606 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4607 }
4608 );
4609 assert_eq!(
4610 parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
4611 TestError {
4612 span: span(3..9),
4613 kind: ast::ErrorKind::EscapeHexInvalid,
4614 }
4615 );
4616 assert_eq!(
4617 parser(r"\x{D800}").parse_escape().unwrap_err(),
4618 TestError {
4619 span: span(3..7),
4620 kind: ast::ErrorKind::EscapeHexInvalid,
4621 }
4622 );
4623 assert_eq!(
4624 parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
4625 TestError {
4626 span: span(3..12),
4627 kind: ast::ErrorKind::EscapeHexInvalid,
4628 }
4629 );
4630 }
4631
4632 #[test]
4633 fn parse_decimal() {
4634 assert_eq!(parser("123").parse_decimal(), Ok(123));
4635 assert_eq!(parser("0").parse_decimal(), Ok(0));
4636 assert_eq!(parser("01").parse_decimal(), Ok(1));
4637
4638 assert_eq!(
4639 parser("-1").parse_decimal().unwrap_err(),
4640 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4641 );
4642 assert_eq!(
4643 parser("").parse_decimal().unwrap_err(),
4644 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4645 );
4646 assert_eq!(
4647 parser("9999999999").parse_decimal().unwrap_err(),
4648 TestError {
4649 span: span(0..10),
4650 kind: ast::ErrorKind::DecimalInvalid,
4651 }
4652 );
4653 }
4654
4655 #[test]
4656 fn parse_set_class() {
4657 fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
4658 ast::ClassSet::union(ast::ClassSetUnion { span, items })
4659 }
4660
4661 fn intersection(
4662 span: Span,
4663 lhs: ast::ClassSet,
4664 rhs: ast::ClassSet,
4665 ) -> ast::ClassSet {
4666 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4667 span,
4668 kind: ast::ClassSetBinaryOpKind::Intersection,
4669 lhs: Box::new(lhs),
4670 rhs: Box::new(rhs),
4671 })
4672 }
4673
4674 fn difference(
4675 span: Span,
4676 lhs: ast::ClassSet,
4677 rhs: ast::ClassSet,
4678 ) -> ast::ClassSet {
4679 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4680 span,
4681 kind: ast::ClassSetBinaryOpKind::Difference,
4682 lhs: Box::new(lhs),
4683 rhs: Box::new(rhs),
4684 })
4685 }
4686
4687 fn symdifference(
4688 span: Span,
4689 lhs: ast::ClassSet,
4690 rhs: ast::ClassSet,
4691 ) -> ast::ClassSet {
4692 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4693 span,
4694 kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
4695 lhs: Box::new(lhs),
4696 rhs: Box::new(rhs),
4697 })
4698 }
4699
4700 fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
4701 ast::ClassSet::Item(item)
4702 }
4703
4704 fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
4705 ast::ClassSetItem::Ascii(cls)
4706 }
4707
4708 fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
4709 ast::ClassSetItem::Unicode(cls)
4710 }
4711
4712 fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
4713 ast::ClassSetItem::Perl(cls)
4714 }
4715
4716 fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
4717 ast::ClassSetItem::Bracketed(Box::new(cls))
4718 }
4719
4720 fn lit(span: Span, c: char) -> ast::ClassSetItem {
4721 ast::ClassSetItem::Literal(ast::Literal {
4722 span,
4723 kind: ast::LiteralKind::Verbatim,
4724 c,
4725 })
4726 }
4727
4728 fn empty(span: Span) -> ast::ClassSetItem {
4729 ast::ClassSetItem::Empty(span)
4730 }
4731
4732 fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
4733 let pos1 = Position {
4734 offset: span.start.offset + start.len_utf8(),
4735 column: span.start.column + 1,
4736 ..span.start
4737 };
4738 let pos2 = Position {
4739 offset: span.end.offset - end.len_utf8(),
4740 column: span.end.column - 1,
4741 ..span.end
4742 };
4743 ast::ClassSetItem::Range(ast::ClassSetRange {
4744 span,
4745 start: ast::Literal {
4746 span: Span { end: pos1, ..span },
4747 kind: ast::LiteralKind::Verbatim,
4748 c: start,
4749 },
4750 end: ast::Literal {
4751 span: Span { start: pos2, ..span },
4752 kind: ast::LiteralKind::Verbatim,
4753 c: end,
4754 },
4755 })
4756 }
4757
4758 fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
4759 ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated }
4760 }
4761
4762 fn lower(span: Span, negated: bool) -> ast::ClassAscii {
4763 ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated }
4764 }
4765
4766 assert_eq!(
4767 parser("[[:alnum:]]").parse(),
4768 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4769 span: span(0..11),
4770 negated: false,
4771 kind: itemset(item_ascii(alnum(span(1..10), false))),
4772 })))
4773 );
4774 assert_eq!(
4775 parser("[[[:alnum:]]]").parse(),
4776 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4777 span: span(0..13),
4778 negated: false,
4779 kind: itemset(item_bracket(ast::ClassBracketed {
4780 span: span(1..12),
4781 negated: false,
4782 kind: itemset(item_ascii(alnum(span(2..11), false))),
4783 })),
4784 })))
4785 );
4786 assert_eq!(
4787 parser("[[:alnum:]&&[:lower:]]").parse(),
4788 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4789 span: span(0..22),
4790 negated: false,
4791 kind: intersection(
4792 span(1..21),
4793 itemset(item_ascii(alnum(span(1..10), false))),
4794 itemset(item_ascii(lower(span(12..21), false))),
4795 ),
4796 })))
4797 );
4798 assert_eq!(
4799 parser("[[:alnum:]--[:lower:]]").parse(),
4800 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4801 span: span(0..22),
4802 negated: false,
4803 kind: difference(
4804 span(1..21),
4805 itemset(item_ascii(alnum(span(1..10), false))),
4806 itemset(item_ascii(lower(span(12..21), false))),
4807 ),
4808 })))
4809 );
4810 assert_eq!(
4811 parser("[[:alnum:]~~[:lower:]]").parse(),
4812 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4813 span: span(0..22),
4814 negated: false,
4815 kind: symdifference(
4816 span(1..21),
4817 itemset(item_ascii(alnum(span(1..10), false))),
4818 itemset(item_ascii(lower(span(12..21), false))),
4819 ),
4820 })))
4821 );
4822
4823 assert_eq!(
4824 parser("[a]").parse(),
4825 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4826 span: span(0..3),
4827 negated: false,
4828 kind: itemset(lit(span(1..2), 'a')),
4829 })))
4830 );
4831 assert_eq!(
4832 parser(r"[a\]]").parse(),
4833 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4834 span: span(0..5),
4835 negated: false,
4836 kind: union(
4837 span(1..4),
4838 vec![
4839 lit(span(1..2), 'a'),
4840 ast::ClassSetItem::Literal(ast::Literal {
4841 span: span(2..4),
4842 kind: ast::LiteralKind::Punctuation,
4843 c: ']',
4844 }),
4845 ]
4846 ),
4847 })))
4848 );
4849 assert_eq!(
4850 parser(r"[a\-z]").parse(),
4851 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4852 span: span(0..6),
4853 negated: false,
4854 kind: union(
4855 span(1..5),
4856 vec![
4857 lit(span(1..2), 'a'),
4858 ast::ClassSetItem::Literal(ast::Literal {
4859 span: span(2..4),
4860 kind: ast::LiteralKind::Punctuation,
4861 c: '-',
4862 }),
4863 lit(span(4..5), 'z'),
4864 ]
4865 ),
4866 })))
4867 );
4868 assert_eq!(
4869 parser("[ab]").parse(),
4870 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4871 span: span(0..4),
4872 negated: false,
4873 kind: union(
4874 span(1..3),
4875 vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
4876 ),
4877 })))
4878 );
4879 assert_eq!(
4880 parser("[a-]").parse(),
4881 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4882 span: span(0..4),
4883 negated: false,
4884 kind: union(
4885 span(1..3),
4886 vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
4887 ),
4888 })))
4889 );
4890 assert_eq!(
4891 parser("[-a]").parse(),
4892 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4893 span: span(0..4),
4894 negated: false,
4895 kind: union(
4896 span(1..3),
4897 vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
4898 ),
4899 })))
4900 );
4901 assert_eq!(
4902 parser(r"[\pL]").parse(),
4903 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4904 span: span(0..5),
4905 negated: false,
4906 kind: itemset(item_unicode(ast::ClassUnicode {
4907 span: span(1..4),
4908 negated: false,
4909 kind: ast::ClassUnicodeKind::OneLetter('L'),
4910 })),
4911 })))
4912 );
4913 assert_eq!(
4914 parser(r"[\w]").parse(),
4915 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4916 span: span(0..4),
4917 negated: false,
4918 kind: itemset(item_perl(ast::ClassPerl {
4919 span: span(1..3),
4920 kind: ast::ClassPerlKind::Word,
4921 negated: false,
4922 })),
4923 })))
4924 );
4925 assert_eq!(
4926 parser(r"[a\wz]").parse(),
4927 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4928 span: span(0..6),
4929 negated: false,
4930 kind: union(
4931 span(1..5),
4932 vec![
4933 lit(span(1..2), 'a'),
4934 item_perl(ast::ClassPerl {
4935 span: span(2..4),
4936 kind: ast::ClassPerlKind::Word,
4937 negated: false,
4938 }),
4939 lit(span(4..5), 'z'),
4940 ]
4941 ),
4942 })))
4943 );
4944
4945 assert_eq!(
4946 parser("[a-z]").parse(),
4947 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4948 span: span(0..5),
4949 negated: false,
4950 kind: itemset(range(span(1..4), 'a', 'z')),
4951 })))
4952 );
4953 assert_eq!(
4954 parser("[a-cx-z]").parse(),
4955 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4956 span: span(0..8),
4957 negated: false,
4958 kind: union(
4959 span(1..7),
4960 vec![
4961 range(span(1..4), 'a', 'c'),
4962 range(span(4..7), 'x', 'z'),
4963 ]
4964 ),
4965 })))
4966 );
4967 assert_eq!(
4968 parser(r"[\w&&a-cx-z]").parse(),
4969 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4970 span: span(0..12),
4971 negated: false,
4972 kind: intersection(
4973 span(1..11),
4974 itemset(item_perl(ast::ClassPerl {
4975 span: span(1..3),
4976 kind: ast::ClassPerlKind::Word,
4977 negated: false,
4978 })),
4979 union(
4980 span(5..11),
4981 vec![
4982 range(span(5..8), 'a', 'c'),
4983 range(span(8..11), 'x', 'z'),
4984 ]
4985 ),
4986 ),
4987 })))
4988 );
4989 assert_eq!(
4990 parser(r"[a-cx-z&&\w]").parse(),
4991 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4992 span: span(0..12),
4993 negated: false,
4994 kind: intersection(
4995 span(1..11),
4996 union(
4997 span(1..7),
4998 vec![
4999 range(span(1..4), 'a', 'c'),
5000 range(span(4..7), 'x', 'z'),
5001 ]
5002 ),
5003 itemset(item_perl(ast::ClassPerl {
5004 span: span(9..11),
5005 kind: ast::ClassPerlKind::Word,
5006 negated: false,
5007 })),
5008 ),
5009 })))
5010 );
5011 assert_eq!(
5012 parser(r"[a--b--c]").parse(),
5013 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5014 span: span(0..9),
5015 negated: false,
5016 kind: difference(
5017 span(1..8),
5018 difference(
5019 span(1..5),
5020 itemset(lit(span(1..2), 'a')),
5021 itemset(lit(span(4..5), 'b')),
5022 ),
5023 itemset(lit(span(7..8), 'c')),
5024 ),
5025 })))
5026 );
5027 assert_eq!(
5028 parser(r"[a~~b~~c]").parse(),
5029 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5030 span: span(0..9),
5031 negated: false,
5032 kind: symdifference(
5033 span(1..8),
5034 symdifference(
5035 span(1..5),
5036 itemset(lit(span(1..2), 'a')),
5037 itemset(lit(span(4..5), 'b')),
5038 ),
5039 itemset(lit(span(7..8), 'c')),
5040 ),
5041 })))
5042 );
5043 assert_eq!(
5044 parser(r"[\^&&^]").parse(),
5045 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5046 span: span(0..7),
5047 negated: false,
5048 kind: intersection(
5049 span(1..6),
5050 itemset(ast::ClassSetItem::Literal(ast::Literal {
5051 span: span(1..3),
5052 kind: ast::LiteralKind::Punctuation,
5053 c: '^',
5054 })),
5055 itemset(lit(span(5..6), '^')),
5056 ),
5057 })))
5058 );
5059 assert_eq!(
5060 parser(r"[\&&&&]").parse(),
5061 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5062 span: span(0..7),
5063 negated: false,
5064 kind: intersection(
5065 span(1..6),
5066 itemset(ast::ClassSetItem::Literal(ast::Literal {
5067 span: span(1..3),
5068 kind: ast::LiteralKind::Punctuation,
5069 c: '&',
5070 })),
5071 itemset(lit(span(5..6), '&')),
5072 ),
5073 })))
5074 );
5075 assert_eq!(
5076 parser(r"[&&&&]").parse(),
5077 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5078 span: span(0..6),
5079 negated: false,
5080 kind: intersection(
5081 span(1..5),
5082 intersection(
5083 span(1..3),
5084 itemset(empty(span(1..1))),
5085 itemset(empty(span(3..3))),
5086 ),
5087 itemset(empty(span(5..5))),
5088 ),
5089 })))
5090 );
5091
5092 let pat = "[☃-⛄]";
5093 assert_eq!(
5094 parser(pat).parse(),
5095 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5096 span: span_range(pat, 0..9),
5097 negated: false,
5098 kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
5099 span: span_range(pat, 1..8),
5100 start: ast::Literal {
5101 span: span_range(pat, 1..4),
5102 kind: ast::LiteralKind::Verbatim,
5103 c: '☃',
5104 },
5105 end: ast::Literal {
5106 span: span_range(pat, 5..8),
5107 kind: ast::LiteralKind::Verbatim,
5108 c: '⛄',
5109 },
5110 })),
5111 })))
5112 );
5113
5114 assert_eq!(
5115 parser(r"[]]").parse(),
5116 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5117 span: span(0..3),
5118 negated: false,
5119 kind: itemset(lit(span(1..2), ']')),
5120 })))
5121 );
5122 assert_eq!(
5123 parser(r"[]\[]").parse(),
5124 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5125 span: span(0..5),
5126 negated: false,
5127 kind: union(
5128 span(1..4),
5129 vec![
5130 lit(span(1..2), ']'),
5131 ast::ClassSetItem::Literal(ast::Literal {
5132 span: span(2..4),
5133 kind: ast::LiteralKind::Punctuation,
5134 c: '[',
5135 }),
5136 ]
5137 ),
5138 })))
5139 );
5140 assert_eq!(
5141 parser(r"[\[]]").parse(),
5142 Ok(concat(
5143 0..5,
5144 vec![
5145 Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5146 span: span(0..4),
5147 negated: false,
5148 kind: itemset(ast::ClassSetItem::Literal(
5149 ast::Literal {
5150 span: span(1..3),
5151 kind: ast::LiteralKind::Punctuation,
5152 c: '[',
5153 }
5154 )),
5155 })),
5156 Ast::Literal(ast::Literal {
5157 span: span(4..5),
5158 kind: ast::LiteralKind::Verbatim,
5159 c: ']',
5160 }),
5161 ]
5162 ))
5163 );
5164
5165 assert_eq!(
5166 parser("[").parse().unwrap_err(),
5167 TestError {
5168 span: span(0..1),
5169 kind: ast::ErrorKind::ClassUnclosed,
5170 }
5171 );
5172 assert_eq!(
5173 parser("[[").parse().unwrap_err(),
5174 TestError {
5175 span: span(1..2),
5176 kind: ast::ErrorKind::ClassUnclosed,
5177 }
5178 );
5179 assert_eq!(
5180 parser("[[-]").parse().unwrap_err(),
5181 TestError {
5182 span: span(0..1),
5183 kind: ast::ErrorKind::ClassUnclosed,
5184 }
5185 );
5186 assert_eq!(
5187 parser("[[[:alnum:]").parse().unwrap_err(),
5188 TestError {
5189 span: span(1..2),
5190 kind: ast::ErrorKind::ClassUnclosed,
5191 }
5192 );
5193 assert_eq!(
5194 parser(r"[\b]").parse().unwrap_err(),
5195 TestError {
5196 span: span(1..3),
5197 kind: ast::ErrorKind::ClassEscapeInvalid,
5198 }
5199 );
5200 assert_eq!(
5201 parser(r"[\w-a]").parse().unwrap_err(),
5202 TestError {
5203 span: span(1..3),
5204 kind: ast::ErrorKind::ClassRangeLiteral,
5205 }
5206 );
5207 assert_eq!(
5208 parser(r"[a-\w]").parse().unwrap_err(),
5209 TestError {
5210 span: span(3..5),
5211 kind: ast::ErrorKind::ClassRangeLiteral,
5212 }
5213 );
5214 assert_eq!(
5215 parser(r"[z-a]").parse().unwrap_err(),
5216 TestError {
5217 span: span(1..4),
5218 kind: ast::ErrorKind::ClassRangeInvalid,
5219 }
5220 );
5221
5222 assert_eq!(
5223 parser_ignore_whitespace("[a ").parse().unwrap_err(),
5224 TestError {
5225 span: span(0..1),
5226 kind: ast::ErrorKind::ClassUnclosed,
5227 }
5228 );
5229 assert_eq!(
5230 parser_ignore_whitespace("[a- ").parse().unwrap_err(),
5231 TestError {
5232 span: span(0..1),
5233 kind: ast::ErrorKind::ClassUnclosed,
5234 }
5235 );
5236 }
5237
5238 #[test]
5239 fn parse_set_class_open() {
5240 assert_eq!(parser("[a]").parse_set_class_open(), {
5241 let set = ast::ClassBracketed {
5242 span: span(0..1),
5243 negated: false,
5244 kind: ast::ClassSet::union(ast::ClassSetUnion {
5245 span: span(1..1),
5246 items: vec![],
5247 }),
5248 };
5249 let union = ast::ClassSetUnion { span: span(1..1), items: vec![] };
5250 Ok((set, union))
5251 });
5252 assert_eq!(
5253 parser_ignore_whitespace("[ a]").parse_set_class_open(),
5254 {
5255 let set = ast::ClassBracketed {
5256 span: span(0..4),
5257 negated: false,
5258 kind: ast::ClassSet::union(ast::ClassSetUnion {
5259 span: span(4..4),
5260 items: vec![],
5261 }),
5262 };
5263 let union =
5264 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5265 Ok((set, union))
5266 }
5267 );
5268 assert_eq!(parser("[^a]").parse_set_class_open(), {
5269 let set = ast::ClassBracketed {
5270 span: span(0..2),
5271 negated: true,
5272 kind: ast::ClassSet::union(ast::ClassSetUnion {
5273 span: span(2..2),
5274 items: vec![],
5275 }),
5276 };
5277 let union = ast::ClassSetUnion { span: span(2..2), items: vec![] };
5278 Ok((set, union))
5279 });
5280 assert_eq!(
5281 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
5282 {
5283 let set = ast::ClassBracketed {
5284 span: span(0..4),
5285 negated: true,
5286 kind: ast::ClassSet::union(ast::ClassSetUnion {
5287 span: span(4..4),
5288 items: vec![],
5289 }),
5290 };
5291 let union =
5292 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5293 Ok((set, union))
5294 }
5295 );
5296 assert_eq!(parser("[-a]").parse_set_class_open(), {
5297 let set = ast::ClassBracketed {
5298 span: span(0..2),
5299 negated: false,
5300 kind: ast::ClassSet::union(ast::ClassSetUnion {
5301 span: span(1..1),
5302 items: vec![],
5303 }),
5304 };
5305 let union = ast::ClassSetUnion {
5306 span: span(1..2),
5307 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5308 span: span(1..2),
5309 kind: ast::LiteralKind::Verbatim,
5310 c: '-',
5311 })],
5312 };
5313 Ok((set, union))
5314 });
5315 assert_eq!(
5316 parser_ignore_whitespace("[ - a]").parse_set_class_open(),
5317 {
5318 let set = ast::ClassBracketed {
5319 span: span(0..4),
5320 negated: false,
5321 kind: ast::ClassSet::union(ast::ClassSetUnion {
5322 span: span(2..2),
5323 items: vec![],
5324 }),
5325 };
5326 let union = ast::ClassSetUnion {
5327 span: span(2..3),
5328 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5329 span: span(2..3),
5330 kind: ast::LiteralKind::Verbatim,
5331 c: '-',
5332 })],
5333 };
5334 Ok((set, union))
5335 }
5336 );
5337 assert_eq!(parser("[^-a]").parse_set_class_open(), {
5338 let set = ast::ClassBracketed {
5339 span: span(0..3),
5340 negated: true,
5341 kind: ast::ClassSet::union(ast::ClassSetUnion {
5342 span: span(2..2),
5343 items: vec![],
5344 }),
5345 };
5346 let union = ast::ClassSetUnion {
5347 span: span(2..3),
5348 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5349 span: span(2..3),
5350 kind: ast::LiteralKind::Verbatim,
5351 c: '-',
5352 })],
5353 };
5354 Ok((set, union))
5355 });
5356 assert_eq!(parser("[--a]").parse_set_class_open(), {
5357 let set = ast::ClassBracketed {
5358 span: span(0..3),
5359 negated: false,
5360 kind: ast::ClassSet::union(ast::ClassSetUnion {
5361 span: span(1..1),
5362 items: vec![],
5363 }),
5364 };
5365 let union = ast::ClassSetUnion {
5366 span: span(1..3),
5367 items: vec![
5368 ast::ClassSetItem::Literal(ast::Literal {
5369 span: span(1..2),
5370 kind: ast::LiteralKind::Verbatim,
5371 c: '-',
5372 }),
5373 ast::ClassSetItem::Literal(ast::Literal {
5374 span: span(2..3),
5375 kind: ast::LiteralKind::Verbatim,
5376 c: '-',
5377 }),
5378 ],
5379 };
5380 Ok((set, union))
5381 });
5382 assert_eq!(parser("[]a]").parse_set_class_open(), {
5383 let set = ast::ClassBracketed {
5384 span: span(0..2),
5385 negated: false,
5386 kind: ast::ClassSet::union(ast::ClassSetUnion {
5387 span: span(1..1),
5388 items: vec![],
5389 }),
5390 };
5391 let union = ast::ClassSetUnion {
5392 span: span(1..2),
5393 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5394 span: span(1..2),
5395 kind: ast::LiteralKind::Verbatim,
5396 c: ']',
5397 })],
5398 };
5399 Ok((set, union))
5400 });
5401 assert_eq!(
5402 parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
5403 {
5404 let set = ast::ClassBracketed {
5405 span: span(0..4),
5406 negated: false,
5407 kind: ast::ClassSet::union(ast::ClassSetUnion {
5408 span: span(2..2),
5409 items: vec![],
5410 }),
5411 };
5412 let union = ast::ClassSetUnion {
5413 span: span(2..3),
5414 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5415 span: span(2..3),
5416 kind: ast::LiteralKind::Verbatim,
5417 c: ']',
5418 })],
5419 };
5420 Ok((set, union))
5421 }
5422 );
5423 assert_eq!(parser("[^]a]").parse_set_class_open(), {
5424 let set = ast::ClassBracketed {
5425 span: span(0..3),
5426 negated: true,
5427 kind: ast::ClassSet::union(ast::ClassSetUnion {
5428 span: span(2..2),
5429 items: vec![],
5430 }),
5431 };
5432 let union = ast::ClassSetUnion {
5433 span: span(2..3),
5434 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5435 span: span(2..3),
5436 kind: ast::LiteralKind::Verbatim,
5437 c: ']',
5438 })],
5439 };
5440 Ok((set, union))
5441 });
5442 assert_eq!(parser("[-]a]").parse_set_class_open(), {
5443 let set = ast::ClassBracketed {
5444 span: span(0..2),
5445 negated: false,
5446 kind: ast::ClassSet::union(ast::ClassSetUnion {
5447 span: span(1..1),
5448 items: vec![],
5449 }),
5450 };
5451 let union = ast::ClassSetUnion {
5452 span: span(1..2),
5453 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5454 span: span(1..2),
5455 kind: ast::LiteralKind::Verbatim,
5456 c: '-',
5457 })],
5458 };
5459 Ok((set, union))
5460 });
5461
5462 assert_eq!(
5463 parser("[").parse_set_class_open().unwrap_err(),
5464 TestError {
5465 span: span(0..1),
5466 kind: ast::ErrorKind::ClassUnclosed,
5467 }
5468 );
5469 assert_eq!(
5470 parser_ignore_whitespace("[ ")
5471 .parse_set_class_open()
5472 .unwrap_err(),
5473 TestError {
5474 span: span(0..5),
5475 kind: ast::ErrorKind::ClassUnclosed,
5476 }
5477 );
5478 assert_eq!(
5479 parser("[^").parse_set_class_open().unwrap_err(),
5480 TestError {
5481 span: span(0..2),
5482 kind: ast::ErrorKind::ClassUnclosed,
5483 }
5484 );
5485 assert_eq!(
5486 parser("[]").parse_set_class_open().unwrap_err(),
5487 TestError {
5488 span: span(0..2),
5489 kind: ast::ErrorKind::ClassUnclosed,
5490 }
5491 );
5492 assert_eq!(
5493 parser("[-").parse_set_class_open().unwrap_err(),
5494 TestError {
5495 span: span(0..0),
5496 kind: ast::ErrorKind::ClassUnclosed,
5497 }
5498 );
5499 assert_eq!(
5500 parser("[--").parse_set_class_open().unwrap_err(),
5501 TestError {
5502 span: span(0..0),
5503 kind: ast::ErrorKind::ClassUnclosed,
5504 }
5505 );
5506
5507 // See: https://github.com/rust-lang/regex/issues/792
5508 assert_eq!(
5509 parser("(?x)[-#]").parse_with_comments().unwrap_err(),
5510 TestError {
5511 span: span(4..4),
5512 kind: ast::ErrorKind::ClassUnclosed,
5513 }
5514 );
5515 }
5516
5517 #[test]
5518 fn maybe_parse_ascii_class() {
5519 assert_eq!(
5520 parser(r"[:alnum:]").maybe_parse_ascii_class(),
5521 Some(ast::ClassAscii {
5522 span: span(0..9),
5523 kind: ast::ClassAsciiKind::Alnum,
5524 negated: false,
5525 })
5526 );
5527 assert_eq!(
5528 parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5529 Some(ast::ClassAscii {
5530 span: span(0..9),
5531 kind: ast::ClassAsciiKind::Alnum,
5532 negated: false,
5533 })
5534 );
5535 assert_eq!(
5536 parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5537 Some(ast::ClassAscii {
5538 span: span(0..10),
5539 kind: ast::ClassAsciiKind::Alnum,
5540 negated: true,
5541 })
5542 );
5543
5544 let p = parser(r"[:");
5545 assert_eq!(p.maybe_parse_ascii_class(), None);
5546 assert_eq!(p.offset(), 0);
5547
5548 let p = parser(r"[:^");
5549 assert_eq!(p.maybe_parse_ascii_class(), None);
5550 assert_eq!(p.offset(), 0);
5551
5552 let p = parser(r"[^:alnum:]");
5553 assert_eq!(p.maybe_parse_ascii_class(), None);
5554 assert_eq!(p.offset(), 0);
5555
5556 let p = parser(r"[:alnnum:]");
5557 assert_eq!(p.maybe_parse_ascii_class(), None);
5558 assert_eq!(p.offset(), 0);
5559
5560 let p = parser(r"[:alnum]");
5561 assert_eq!(p.maybe_parse_ascii_class(), None);
5562 assert_eq!(p.offset(), 0);
5563
5564 let p = parser(r"[:alnum:");
5565 assert_eq!(p.maybe_parse_ascii_class(), None);
5566 assert_eq!(p.offset(), 0);
5567 }
5568
5569 #[test]
5570 fn parse_unicode_class() {
5571 assert_eq!(
5572 parser(r"\pN").parse_escape(),
5573 Ok(Primitive::Unicode(ast::ClassUnicode {
5574 span: span(0..3),
5575 negated: false,
5576 kind: ast::ClassUnicodeKind::OneLetter('N'),
5577 }))
5578 );
5579 assert_eq!(
5580 parser(r"\PN").parse_escape(),
5581 Ok(Primitive::Unicode(ast::ClassUnicode {
5582 span: span(0..3),
5583 negated: true,
5584 kind: ast::ClassUnicodeKind::OneLetter('N'),
5585 }))
5586 );
5587 assert_eq!(
5588 parser(r"\p{N}").parse_escape(),
5589 Ok(Primitive::Unicode(ast::ClassUnicode {
5590 span: span(0..5),
5591 negated: false,
5592 kind: ast::ClassUnicodeKind::Named(s("N")),
5593 }))
5594 );
5595 assert_eq!(
5596 parser(r"\P{N}").parse_escape(),
5597 Ok(Primitive::Unicode(ast::ClassUnicode {
5598 span: span(0..5),
5599 negated: true,
5600 kind: ast::ClassUnicodeKind::Named(s("N")),
5601 }))
5602 );
5603 assert_eq!(
5604 parser(r"\p{Greek}").parse_escape(),
5605 Ok(Primitive::Unicode(ast::ClassUnicode {
5606 span: span(0..9),
5607 negated: false,
5608 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5609 }))
5610 );
5611
5612 assert_eq!(
5613 parser(r"\p{scx:Katakana}").parse_escape(),
5614 Ok(Primitive::Unicode(ast::ClassUnicode {
5615 span: span(0..16),
5616 negated: false,
5617 kind: ast::ClassUnicodeKind::NamedValue {
5618 op: ast::ClassUnicodeOpKind::Colon,
5619 name: s("scx"),
5620 value: s("Katakana"),
5621 },
5622 }))
5623 );
5624 assert_eq!(
5625 parser(r"\p{scx=Katakana}").parse_escape(),
5626 Ok(Primitive::Unicode(ast::ClassUnicode {
5627 span: span(0..16),
5628 negated: false,
5629 kind: ast::ClassUnicodeKind::NamedValue {
5630 op: ast::ClassUnicodeOpKind::Equal,
5631 name: s("scx"),
5632 value: s("Katakana"),
5633 },
5634 }))
5635 );
5636 assert_eq!(
5637 parser(r"\p{scx!=Katakana}").parse_escape(),
5638 Ok(Primitive::Unicode(ast::ClassUnicode {
5639 span: span(0..17),
5640 negated: false,
5641 kind: ast::ClassUnicodeKind::NamedValue {
5642 op: ast::ClassUnicodeOpKind::NotEqual,
5643 name: s("scx"),
5644 value: s("Katakana"),
5645 },
5646 }))
5647 );
5648
5649 assert_eq!(
5650 parser(r"\p{:}").parse_escape(),
5651 Ok(Primitive::Unicode(ast::ClassUnicode {
5652 span: span(0..5),
5653 negated: false,
5654 kind: ast::ClassUnicodeKind::NamedValue {
5655 op: ast::ClassUnicodeOpKind::Colon,
5656 name: s(""),
5657 value: s(""),
5658 },
5659 }))
5660 );
5661 assert_eq!(
5662 parser(r"\p{=}").parse_escape(),
5663 Ok(Primitive::Unicode(ast::ClassUnicode {
5664 span: span(0..5),
5665 negated: false,
5666 kind: ast::ClassUnicodeKind::NamedValue {
5667 op: ast::ClassUnicodeOpKind::Equal,
5668 name: s(""),
5669 value: s(""),
5670 },
5671 }))
5672 );
5673 assert_eq!(
5674 parser(r"\p{!=}").parse_escape(),
5675 Ok(Primitive::Unicode(ast::ClassUnicode {
5676 span: span(0..6),
5677 negated: false,
5678 kind: ast::ClassUnicodeKind::NamedValue {
5679 op: ast::ClassUnicodeOpKind::NotEqual,
5680 name: s(""),
5681 value: s(""),
5682 },
5683 }))
5684 );
5685
5686 assert_eq!(
5687 parser(r"\p").parse_escape().unwrap_err(),
5688 TestError {
5689 span: span(2..2),
5690 kind: ast::ErrorKind::EscapeUnexpectedEof,
5691 }
5692 );
5693 assert_eq!(
5694 parser(r"\p{").parse_escape().unwrap_err(),
5695 TestError {
5696 span: span(3..3),
5697 kind: ast::ErrorKind::EscapeUnexpectedEof,
5698 }
5699 );
5700 assert_eq!(
5701 parser(r"\p{N").parse_escape().unwrap_err(),
5702 TestError {
5703 span: span(4..4),
5704 kind: ast::ErrorKind::EscapeUnexpectedEof,
5705 }
5706 );
5707 assert_eq!(
5708 parser(r"\p{Greek").parse_escape().unwrap_err(),
5709 TestError {
5710 span: span(8..8),
5711 kind: ast::ErrorKind::EscapeUnexpectedEof,
5712 }
5713 );
5714
5715 assert_eq!(
5716 parser(r"\pNz").parse(),
5717 Ok(Ast::Concat(ast::Concat {
5718 span: span(0..4),
5719 asts: vec![
5720 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5721 span: span(0..3),
5722 negated: false,
5723 kind: ast::ClassUnicodeKind::OneLetter('N'),
5724 })),
5725 Ast::Literal(ast::Literal {
5726 span: span(3..4),
5727 kind: ast::LiteralKind::Verbatim,
5728 c: 'z',
5729 }),
5730 ],
5731 }))
5732 );
5733 assert_eq!(
5734 parser(r"\p{Greek}z").parse(),
5735 Ok(Ast::Concat(ast::Concat {
5736 span: span(0..10),
5737 asts: vec![
5738 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5739 span: span(0..9),
5740 negated: false,
5741 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5742 })),
5743 Ast::Literal(ast::Literal {
5744 span: span(9..10),
5745 kind: ast::LiteralKind::Verbatim,
5746 c: 'z',
5747 }),
5748 ],
5749 }))
5750 );
5751 assert_eq!(
5752 parser(r"\p\{").parse().unwrap_err(),
5753 TestError {
5754 span: span(2..3),
5755 kind: ast::ErrorKind::UnicodeClassInvalid,
5756 }
5757 );
5758 assert_eq!(
5759 parser(r"\P\{").parse().unwrap_err(),
5760 TestError {
5761 span: span(2..3),
5762 kind: ast::ErrorKind::UnicodeClassInvalid,
5763 }
5764 );
5765 }
5766
5767 #[test]
5768 fn parse_perl_class() {
5769 assert_eq!(
5770 parser(r"\d").parse_escape(),
5771 Ok(Primitive::Perl(ast::ClassPerl {
5772 span: span(0..2),
5773 kind: ast::ClassPerlKind::Digit,
5774 negated: false,
5775 }))
5776 );
5777 assert_eq!(
5778 parser(r"\D").parse_escape(),
5779 Ok(Primitive::Perl(ast::ClassPerl {
5780 span: span(0..2),
5781 kind: ast::ClassPerlKind::Digit,
5782 negated: true,
5783 }))
5784 );
5785 assert_eq!(
5786 parser(r"\s").parse_escape(),
5787 Ok(Primitive::Perl(ast::ClassPerl {
5788 span: span(0..2),
5789 kind: ast::ClassPerlKind::Space,
5790 negated: false,
5791 }))
5792 );
5793 assert_eq!(
5794 parser(r"\S").parse_escape(),
5795 Ok(Primitive::Perl(ast::ClassPerl {
5796 span: span(0..2),
5797 kind: ast::ClassPerlKind::Space,
5798 negated: true,
5799 }))
5800 );
5801 assert_eq!(
5802 parser(r"\w").parse_escape(),
5803 Ok(Primitive::Perl(ast::ClassPerl {
5804 span: span(0..2),
5805 kind: ast::ClassPerlKind::Word,
5806 negated: false,
5807 }))
5808 );
5809 assert_eq!(
5810 parser(r"\W").parse_escape(),
5811 Ok(Primitive::Perl(ast::ClassPerl {
5812 span: span(0..2),
5813 kind: ast::ClassPerlKind::Word,
5814 negated: true,
5815 }))
5816 );
5817
5818 assert_eq!(
5819 parser(r"\d").parse(),
5820 Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
5821 span: span(0..2),
5822 kind: ast::ClassPerlKind::Digit,
5823 negated: false,
5824 })))
5825 );
5826 assert_eq!(
5827 parser(r"\dz").parse(),
5828 Ok(Ast::Concat(ast::Concat {
5829 span: span(0..3),
5830 asts: vec![
5831 Ast::Class(ast::Class::Perl(ast::ClassPerl {
5832 span: span(0..2),
5833 kind: ast::ClassPerlKind::Digit,
5834 negated: false,
5835 })),
5836 Ast::Literal(ast::Literal {
5837 span: span(2..3),
5838 kind: ast::LiteralKind::Verbatim,
5839 c: 'z',
5840 }),
5841 ],
5842 }))
5843 );
5844 }
5845
5846 // This tests a bug fix where the nest limit checker wasn't decrementing
5847 // its depth during post-traversal, which causes long regexes to trip
5848 // the default limit too aggressively.
5849 #[test]
5850 fn regression_454_nest_too_big() {
5851 let pattern = r#"
5852 2(?:
5853 [45]\d{3}|
5854 7(?:
5855 1[0-267]|
5856 2[0-289]|
5857 3[0-29]|
5858 4[01]|
5859 5[1-3]|
5860 6[013]|
5861 7[0178]|
5862 91
5863 )|
5864 8(?:
5865 0[125]|
5866 [139][1-6]|
5867 2[0157-9]|
5868 41|
5869 6[1-35]|
5870 7[1-5]|
5871 8[1-8]|
5872 90
5873 )|
5874 9(?:
5875 0[0-2]|
5876 1[0-4]|
5877 2[568]|
5878 3[3-6]|
5879 5[5-7]|
5880 6[0167]|
5881 7[15]|
5882 8[0146-9]
5883 )
5884 )\d{4}
5885 "#;
5886 assert!(parser_nest_limit(pattern, 50).parse().is_ok());
5887 }
5888
5889 // This tests that we treat a trailing `-` in a character class as a
5890 // literal `-` even when whitespace mode is enabled and there is whitespace
5891 // after the trailing `-`.
5892 #[test]
5893 fn regression_455_trailing_dash_ignore_whitespace() {
5894 assert!(parser("(?x)[ / - ]").parse().is_ok());
5895 assert!(parser("(?x)[ a - ]").parse().is_ok());
5896 assert!(parser(
5897 "(?x)[
5898 a
5899 - ]
5900 "
5901 )
5902 .parse()
5903 .is_ok());
5904 assert!(parser(
5905 "(?x)[
5906 a # wat
5907 - ]
5908 "
5909 )
5910 .parse()
5911 .is_ok());
5912
5913 assert!(parser("(?x)[ / -").parse().is_err());
5914 assert!(parser("(?x)[ / - ").parse().is_err());
5915 assert!(parser(
5916 "(?x)[
5917 / -
5918 "
5919 )
5920 .parse()
5921 .is_err());
5922 assert!(parser(
5923 "(?x)[
5924 / - # wat
5925 "
5926 )
5927 .parse()
5928 .is_err());
5929 }
5930}
5931