1/*!
2This module provides a regular expression parser.
3*/
4
5use core::{
6 borrow::Borrow,
7 cell::{Cell, RefCell},
8 mem,
9};
10
11use alloc::{
12 boxed::Box,
13 string::{String, ToString},
14 vec,
15 vec::Vec,
16};
17
18use crate::{
19 ast::{self, Ast, Position, Span},
20 either::Either,
21 is_escapeable_character, is_meta_character,
22};
23
24type Result<T> = core::result::Result<T, ast::Error>;
25
26/// A primitive is an expression with no sub-expressions. This includes
27/// literals, assertions and non-set character classes. This representation
28/// is used as intermediate state in the parser.
29///
30/// This does not include ASCII character classes, since they can only appear
31/// within a set character class.
32#[derive(Clone, Debug, Eq, PartialEq)]
33enum Primitive {
34 Literal(ast::Literal),
35 Assertion(ast::Assertion),
36 Dot(Span),
37 Perl(ast::ClassPerl),
38 Unicode(ast::ClassUnicode),
39}
40
41impl Primitive {
42 /// Return the span of this primitive.
43 fn span(&self) -> &Span {
44 match *self {
45 Primitive::Literal(ref x) => &x.span,
46 Primitive::Assertion(ref x) => &x.span,
47 Primitive::Dot(ref span) => span,
48 Primitive::Perl(ref x) => &x.span,
49 Primitive::Unicode(ref x) => &x.span,
50 }
51 }
52
53 /// Convert this primitive into a proper AST.
54 fn into_ast(self) -> Ast {
55 match self {
56 Primitive::Literal(lit) => Ast::Literal(lit),
57 Primitive::Assertion(assert) => Ast::Assertion(assert),
58 Primitive::Dot(span) => Ast::Dot(span),
59 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
60 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
61 }
62 }
63
64 /// Convert this primitive into an item in a character class.
65 ///
66 /// If this primitive is not a legal item (i.e., an assertion or a dot),
67 /// then return an error.
68 fn into_class_set_item<P: Borrow<Parser>>(
69 self,
70 p: &ParserI<'_, P>,
71 ) -> Result<ast::ClassSetItem> {
72 use self::Primitive::*;
73 use crate::ast::ClassSetItem;
74
75 match self {
76 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
77 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
78 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
79 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
80 }
81 }
82
83 /// Convert this primitive into a literal in a character class. In
84 /// particular, literals are the only valid items that can appear in
85 /// ranges.
86 ///
87 /// If this primitive is not a legal item (i.e., a class, assertion or a
88 /// dot), then return an error.
89 fn into_class_literal<P: Borrow<Parser>>(
90 self,
91 p: &ParserI<'_, P>,
92 ) -> Result<ast::Literal> {
93 use self::Primitive::*;
94
95 match self {
96 Literal(lit) => Ok(lit),
97 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
98 }
99 }
100}
101
/// Reports whether `c` is an ASCII hexadecimal digit (`0-9`, `a-f` or
/// `A-F`).
fn is_hex(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='f' | 'A'..='F')
}
106
/// Reports whether `c` is permitted in a capture group name.
///
/// When `first` is true, `c` is judged as the leading character of the
/// name, which must be an underscore or alphabetic. In any later position,
/// `.`, `[`, `]` and alphanumeric characters are accepted as well.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        // Punctuation that is only legal after the first character.
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
118
/// Collects configuration options and produces a [`Parser`] from them.
///
/// Each option has a setter method on this type; the parser itself is
/// created with `build`.
#[derive(Debug, Clone)]
pub struct ParserBuilder {
    /// Initial state of verbose (`x`) mode for parsers that are built.
    ignore_whitespace: bool,
    /// The nesting limit handed to parsers that are built.
    nest_limit: u32,
    /// Whether parsers that are built accept octal syntax.
    octal: bool,
}
128
129impl Default for ParserBuilder {
130 fn default() -> ParserBuilder {
131 ParserBuilder::new()
132 }
133}
134
135impl ParserBuilder {
136 /// Create a new parser builder with a default configuration.
137 pub fn new() -> ParserBuilder {
138 ParserBuilder {
139 ignore_whitespace: false,
140 nest_limit: 250,
141 octal: false,
142 }
143 }
144
145 /// Build a parser from this configuration with the given pattern.
146 pub fn build(&self) -> Parser {
147 Parser {
148 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
149 capture_index: Cell::new(0),
150 nest_limit: self.nest_limit,
151 octal: self.octal,
152 initial_ignore_whitespace: self.ignore_whitespace,
153 ignore_whitespace: Cell::new(self.ignore_whitespace),
154 comments: RefCell::new(vec![]),
155 stack_group: RefCell::new(vec![]),
156 stack_class: RefCell::new(vec![]),
157 capture_names: RefCell::new(vec![]),
158 scratch: RefCell::new(String::new()),
159 }
160 }
161
162 /// Set the nesting limit for this parser.
163 ///
164 /// The nesting limit controls how deep the abstract syntax tree is allowed
165 /// to be. If the AST exceeds the given limit (e.g., with too many nested
166 /// groups), then an error is returned by the parser.
167 ///
168 /// The purpose of this limit is to act as a heuristic to prevent stack
169 /// overflow for consumers that do structural induction on an `Ast` using
170 /// explicit recursion. While this crate never does this (instead using
171 /// constant stack space and moving the call stack to the heap), other
172 /// crates may.
173 ///
174 /// This limit is not checked until the entire AST is parsed. Therefore,
175 /// if callers want to put a limit on the amount of heap space used, then
176 /// they should impose a limit on the length, in bytes, of the concrete
177 /// pattern string. In particular, this is viable since this parser
178 /// implementation will limit itself to heap space proportional to the
179 /// length of the pattern string.
180 ///
181 /// Note that a nest limit of `0` will return a nest limit error for most
182 /// patterns but not all. For example, a nest limit of `0` permits `a` but
183 /// not `ab`, since `ab` requires a concatenation, which results in a nest
184 /// depth of `1`. In general, a nest limit is not something that manifests
185 /// in an obvious way in the concrete syntax, therefore, it should not be
186 /// used in a granular way.
187 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
188 self.nest_limit = limit;
189 self
190 }
191
192 /// Whether to support octal syntax or not.
193 ///
194 /// Octal syntax is a little-known way of uttering Unicode codepoints in
195 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
196 /// `\141` are all equivalent regular expressions, where the last example
197 /// shows octal syntax.
198 ///
199 /// While supporting octal syntax isn't in and of itself a problem, it does
200 /// make good error messages harder. That is, in PCRE based regex engines,
201 /// syntax like `\0` invokes a backreference, which is explicitly
202 /// unsupported in Rust's regex engine. However, many users expect it to
203 /// be supported. Therefore, when octal support is disabled, the error
204 /// message will explicitly mention that backreferences aren't supported.
205 ///
206 /// Octal syntax is disabled by default.
207 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
208 self.octal = yes;
209 self
210 }
211
212 /// Enable verbose mode in the regular expression.
213 ///
214 /// When enabled, verbose mode permits insignificant whitespace in many
215 /// places in the regular expression, as well as comments. Comments are
216 /// started using `#` and continue until the end of the line.
217 ///
218 /// By default, this is disabled. It may be selectively enabled in the
219 /// regular expression by using the `x` flag regardless of this setting.
220 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
221 self.ignore_whitespace = yes;
222 self
223 }
224}
225
226/// A regular expression parser.
227///
228/// This parses a string representation of a regular expression into an
229/// abstract syntax tree. The size of the tree is proportional to the length
230/// of the regular expression pattern.
231///
232/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
233#[derive(Clone, Debug)]
234pub struct Parser {
235 /// The current position of the parser.
236 pos: Cell<Position>,
237 /// The current capture index.
238 capture_index: Cell<u32>,
239 /// The maximum number of open parens/brackets allowed. If the parser
240 /// exceeds this number, then an error is returned.
241 nest_limit: u32,
242 /// Whether to support octal syntax or not. When `false`, the parser will
243 /// return an error helpfully pointing out that backreferences are not
244 /// supported.
245 octal: bool,
246 /// The initial setting for `ignore_whitespace` as provided by
247 /// `ParserBuilder`. It is used when resetting the parser's state.
248 initial_ignore_whitespace: bool,
249 /// Whether whitespace should be ignored. When enabled, comments are
250 /// also permitted.
251 ignore_whitespace: Cell<bool>,
252 /// A list of comments, in order of appearance.
253 comments: RefCell<Vec<ast::Comment>>,
254 /// A stack of grouped sub-expressions, including alternations.
255 stack_group: RefCell<Vec<GroupState>>,
256 /// A stack of nested character classes. This is only non-empty when
257 /// parsing a class.
258 stack_class: RefCell<Vec<ClassState>>,
259 /// A sorted sequence of capture names. This is used to detect duplicate
260 /// capture names and report an error if one is detected.
261 capture_names: RefCell<Vec<ast::CaptureName>>,
262 /// A scratch buffer used in various places. Mostly this is used to
263 /// accumulate relevant characters from parts of a pattern.
264 scratch: RefCell<String>,
265}
266
/// The internal parser implementation.
///
/// This type exists so that the pattern string can travel together with
/// the parser: the internal state of a `Parser` is not tied to any single
/// pattern, whereas a `ParserI` is.
///
/// Being generic over `P` also means production code can use
/// `ParserI<&Parser>` while tests, which sometimes exercise the internal
/// interface directly, keep the convenience of `ParserI<Parser>`.
#[derive(Debug, Clone)]
struct ParserI<'s, P> {
    /// The parser state and configuration.
    parser: P,
    /// The complete regular expression supplied by the user.
    pattern: &'s str,
}
283
284/// GroupState represents a single stack frame while parsing nested groups
285/// and alternations. Each frame records the state up to an opening parenthesis
286/// or a alternating bracket `|`.
287#[derive(Clone, Debug)]
288enum GroupState {
289 /// This state is pushed whenever an opening group is found.
290 Group {
291 /// The concatenation immediately preceding the opening group.
292 concat: ast::Concat,
293 /// The group that has been opened. Its sub-AST is always empty.
294 group: ast::Group,
295 /// Whether this group has the `x` flag enabled or not.
296 ignore_whitespace: bool,
297 },
298 /// This state is pushed whenever a new alternation branch is found. If
299 /// an alternation branch is found and this state is at the top of the
300 /// stack, then this state should be modified to include the new
301 /// alternation.
302 Alternation(ast::Alternation),
303}
304
305/// ClassState represents a single stack frame while parsing character classes.
306/// Each frame records the state up to an intersection, difference, symmetric
307/// difference or nested class.
308///
309/// Note that a parser's character class stack is only non-empty when parsing
310/// a character class. In all other cases, it is empty.
311#[derive(Clone, Debug)]
312enum ClassState {
313 /// This state is pushed whenever an opening bracket is found.
314 Open {
315 /// The union of class items immediately preceding this class.
316 union: ast::ClassSetUnion,
317 /// The class that has been opened. Typically this just corresponds
318 /// to the `[`, but it can also include `[^` since `^` indicates
319 /// negation of the class.
320 set: ast::ClassBracketed,
321 },
322 /// This state is pushed when a operator is seen. When popped, the stored
323 /// set becomes the left hand side of the operator.
324 Op {
325 /// The type of the operation, i.e., &&, -- or ~~.
326 kind: ast::ClassSetBinaryOpKind,
327 /// The left-hand side of the operator.
328 lhs: ast::ClassSet,
329 },
330}
331
332impl Parser {
333 /// Create a new parser with a default configuration.
334 ///
335 /// The parser can be run with either the `parse` or `parse_with_comments`
336 /// methods. The parse methods return an abstract syntax tree.
337 ///
338 /// To set configuration options on the parser, use [`ParserBuilder`].
339 pub fn new() -> Parser {
340 ParserBuilder::new().build()
341 }
342
343 /// Parse the regular expression into an abstract syntax tree.
344 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
345 ParserI::new(self, pattern).parse()
346 }
347
348 /// Parse the regular expression and return an abstract syntax tree with
349 /// all of the comments found in the pattern.
350 pub fn parse_with_comments(
351 &mut self,
352 pattern: &str,
353 ) -> Result<ast::WithComments> {
354 ParserI::new(self, pattern).parse_with_comments()
355 }
356
357 /// Reset the internal state of a parser.
358 ///
359 /// This is called at the beginning of every parse. This prevents the
360 /// parser from running with inconsistent state (say, if a previous
361 /// invocation returned an error and the parser is reused).
362 fn reset(&self) {
363 // These settings should be in line with the construction
364 // in `ParserBuilder::build`.
365 self.pos.set(Position { offset: 0, line: 1, column: 1 });
366 self.ignore_whitespace.set(self.initial_ignore_whitespace);
367 self.comments.borrow_mut().clear();
368 self.stack_group.borrow_mut().clear();
369 self.stack_class.borrow_mut().clear();
370 }
371}
372
373impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
374 /// Build an internal parser from a parser configuration and a pattern.
375 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
376 ParserI { parser, pattern }
377 }
378
379 /// Return a reference to the parser state.
380 fn parser(&self) -> &Parser {
381 self.parser.borrow()
382 }
383
384 /// Return a reference to the pattern being parsed.
385 fn pattern(&self) -> &str {
386 self.pattern.borrow()
387 }
388
389 /// Create a new error with the given span and error type.
390 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
391 ast::Error { kind, pattern: self.pattern().to_string(), span }
392 }
393
394 /// Return the current offset of the parser.
395 ///
396 /// The offset starts at `0` from the beginning of the regular expression
397 /// pattern string.
398 fn offset(&self) -> usize {
399 self.parser().pos.get().offset
400 }
401
402 /// Return the current line number of the parser.
403 ///
404 /// The line number starts at `1`.
405 fn line(&self) -> usize {
406 self.parser().pos.get().line
407 }
408
409 /// Return the current column of the parser.
410 ///
411 /// The column number starts at `1` and is reset whenever a `\n` is seen.
412 fn column(&self) -> usize {
413 self.parser().pos.get().column
414 }
415
416 /// Return the next capturing index. Each subsequent call increments the
417 /// internal index.
418 ///
419 /// The span given should correspond to the location of the opening
420 /// parenthesis.
421 ///
422 /// If the capture limit is exceeded, then an error is returned.
423 fn next_capture_index(&self, span: Span) -> Result<u32> {
424 let current = self.parser().capture_index.get();
425 let i = current.checked_add(1).ok_or_else(|| {
426 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
427 })?;
428 self.parser().capture_index.set(i);
429 Ok(i)
430 }
431
432 /// Adds the given capture name to this parser. If this capture name has
433 /// already been used, then an error is returned.
434 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
435 let mut names = self.parser().capture_names.borrow_mut();
436 match names
437 .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
438 {
439 Err(i) => {
440 names.insert(i, cap.clone());
441 Ok(())
442 }
443 Ok(i) => Err(self.error(
444 cap.span,
445 ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
446 )),
447 }
448 }
449
450 /// Return whether the parser should ignore whitespace or not.
451 fn ignore_whitespace(&self) -> bool {
452 self.parser().ignore_whitespace.get()
453 }
454
455 /// Return the character at the current position of the parser.
456 ///
457 /// This panics if the current position does not point to a valid char.
458 fn char(&self) -> char {
459 self.char_at(self.offset())
460 }
461
462 /// Return the character at the given position.
463 ///
464 /// This panics if the given position does not point to a valid char.
465 fn char_at(&self, i: usize) -> char {
466 self.pattern()[i..]
467 .chars()
468 .next()
469 .unwrap_or_else(|| panic!("expected char at offset {}", i))
470 }
471
472 /// Bump the parser to the next Unicode scalar value.
473 ///
474 /// If the end of the input has been reached, then `false` is returned.
475 fn bump(&self) -> bool {
476 if self.is_eof() {
477 return false;
478 }
479 let Position { mut offset, mut line, mut column } = self.pos();
480 if self.char() == '\n' {
481 line = line.checked_add(1).unwrap();
482 column = 1;
483 } else {
484 column = column.checked_add(1).unwrap();
485 }
486 offset += self.char().len_utf8();
487 self.parser().pos.set(Position { offset, line, column });
488 self.pattern()[self.offset()..].chars().next().is_some()
489 }
490
491 /// If the substring starting at the current position of the parser has
492 /// the given prefix, then bump the parser to the character immediately
493 /// following the prefix and return true. Otherwise, don't bump the parser
494 /// and return false.
495 fn bump_if(&self, prefix: &str) -> bool {
496 if self.pattern()[self.offset()..].starts_with(prefix) {
497 for _ in 0..prefix.chars().count() {
498 self.bump();
499 }
500 true
501 } else {
502 false
503 }
504 }
505
506 /// Returns true if and only if the parser is positioned at a look-around
507 /// prefix. The conditions under which this returns true must always
508 /// correspond to a regular expression that would otherwise be consider
509 /// invalid.
510 ///
511 /// This should only be called immediately after parsing the opening of
512 /// a group or a set of flags.
513 fn is_lookaround_prefix(&self) -> bool {
514 self.bump_if("?=")
515 || self.bump_if("?!")
516 || self.bump_if("?<=")
517 || self.bump_if("?<!")
518 }
519
520 /// Bump the parser, and if the `x` flag is enabled, bump through any
521 /// subsequent spaces. Return true if and only if the parser is not at
522 /// EOF.
523 fn bump_and_bump_space(&self) -> bool {
524 if !self.bump() {
525 return false;
526 }
527 self.bump_space();
528 !self.is_eof()
529 }
530
531 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
532 /// comments), then this will advance the parser through all whitespace
533 /// and comments to the next non-whitespace non-comment byte.
534 ///
535 /// If the `x` flag is disabled, then this is a no-op.
536 ///
537 /// This should be used selectively throughout the parser where
538 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
539 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
540 fn bump_space(&self) {
541 if !self.ignore_whitespace() {
542 return;
543 }
544 while !self.is_eof() {
545 if self.char().is_whitespace() {
546 self.bump();
547 } else if self.char() == '#' {
548 let start = self.pos();
549 let mut comment_text = String::new();
550 self.bump();
551 while !self.is_eof() {
552 let c = self.char();
553 self.bump();
554 if c == '\n' {
555 break;
556 }
557 comment_text.push(c);
558 }
559 let comment = ast::Comment {
560 span: Span::new(start, self.pos()),
561 comment: comment_text,
562 };
563 self.parser().comments.borrow_mut().push(comment);
564 } else {
565 break;
566 }
567 }
568 }
569
570 /// Peek at the next character in the input without advancing the parser.
571 ///
572 /// If the input has been exhausted, then this returns `None`.
573 fn peek(&self) -> Option<char> {
574 if self.is_eof() {
575 return None;
576 }
577 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
578 }
579
580 /// Like peek, but will ignore spaces when the parser is in whitespace
581 /// insensitive mode.
582 fn peek_space(&self) -> Option<char> {
583 if !self.ignore_whitespace() {
584 return self.peek();
585 }
586 if self.is_eof() {
587 return None;
588 }
589 let mut start = self.offset() + self.char().len_utf8();
590 let mut in_comment = false;
591 for (i, c) in self.pattern()[start..].char_indices() {
592 if c.is_whitespace() {
593 continue;
594 } else if !in_comment && c == '#' {
595 in_comment = true;
596 } else if in_comment && c == '\n' {
597 in_comment = false;
598 } else {
599 start += i;
600 break;
601 }
602 }
603 self.pattern()[start..].chars().next()
604 }
605
606 /// Returns true if the next call to `bump` would return false.
607 fn is_eof(&self) -> bool {
608 self.offset() == self.pattern().len()
609 }
610
611 /// Return the current position of the parser, which includes the offset,
612 /// line and column.
613 fn pos(&self) -> Position {
614 self.parser().pos.get()
615 }
616
617 /// Create a span at the current position of the parser. Both the start
618 /// and end of the span are set.
619 fn span(&self) -> Span {
620 Span::splat(self.pos())
621 }
622
623 /// Create a span that covers the current character.
624 fn span_char(&self) -> Span {
625 let mut next = Position {
626 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
627 line: self.line(),
628 column: self.column().checked_add(1).unwrap(),
629 };
630 if self.char() == '\n' {
631 next.line += 1;
632 next.column = 1;
633 }
634 Span::new(self.pos(), next)
635 }
636
637 /// Parse and push a single alternation on to the parser's internal stack.
638 /// If the top of the stack already has an alternation, then add to that
639 /// instead of pushing a new one.
640 ///
641 /// The concatenation given corresponds to a single alternation branch.
642 /// The concatenation returned starts the next branch and is empty.
643 ///
644 /// This assumes the parser is currently positioned at `|` and will advance
645 /// the parser to the character following `|`.
646 #[inline(never)]
647 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
648 assert_eq!(self.char(), '|');
649 concat.span.end = self.pos();
650 self.push_or_add_alternation(concat);
651 self.bump();
652 Ok(ast::Concat { span: self.span(), asts: vec![] })
653 }
654
655 /// Pushes or adds the given branch of an alternation to the parser's
656 /// internal stack of state.
657 fn push_or_add_alternation(&self, concat: ast::Concat) {
658 use self::GroupState::*;
659
660 let mut stack = self.parser().stack_group.borrow_mut();
661 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
662 alts.asts.push(concat.into_ast());
663 return;
664 }
665 stack.push(Alternation(ast::Alternation {
666 span: Span::new(concat.span.start, self.pos()),
667 asts: vec![concat.into_ast()],
668 }));
669 }
670
671 /// Parse and push a group AST (and its parent concatenation) on to the
672 /// parser's internal stack. Return a fresh concatenation corresponding
673 /// to the group's sub-AST.
674 ///
675 /// If a set of flags was found (with no group), then the concatenation
676 /// is returned with that set of flags added.
677 ///
678 /// This assumes that the parser is currently positioned on the opening
679 /// parenthesis. It advances the parser to the character at the start
680 /// of the sub-expression (or adjoining expression).
681 ///
682 /// If there was a problem parsing the start of the group, then an error
683 /// is returned.
684 #[inline(never)]
685 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
686 assert_eq!(self.char(), '(');
687 match self.parse_group()? {
688 Either::Left(set) => {
689 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
690 if let Some(v) = ignore {
691 self.parser().ignore_whitespace.set(v);
692 }
693
694 concat.asts.push(Ast::Flags(set));
695 Ok(concat)
696 }
697 Either::Right(group) => {
698 let old_ignore_whitespace = self.ignore_whitespace();
699 let new_ignore_whitespace = group
700 .flags()
701 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
702 .unwrap_or(old_ignore_whitespace);
703 self.parser().stack_group.borrow_mut().push(
704 GroupState::Group {
705 concat,
706 group,
707 ignore_whitespace: old_ignore_whitespace,
708 },
709 );
710 self.parser().ignore_whitespace.set(new_ignore_whitespace);
711 Ok(ast::Concat { span: self.span(), asts: vec![] })
712 }
713 }
714 }
715
716 /// Pop a group AST from the parser's internal stack and set the group's
717 /// AST to the given concatenation. Return the concatenation containing
718 /// the group.
719 ///
720 /// This assumes that the parser is currently positioned on the closing
721 /// parenthesis and advances the parser to the character following the `)`.
722 ///
723 /// If no such group could be popped, then an unopened group error is
724 /// returned.
725 #[inline(never)]
726 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
727 use self::GroupState::*;
728
729 assert_eq!(self.char(), ')');
730 let mut stack = self.parser().stack_group.borrow_mut();
731 let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
732 .pop()
733 {
734 Some(Group { concat, group, ignore_whitespace }) => {
735 (concat, group, ignore_whitespace, None)
736 }
737 Some(Alternation(alt)) => match stack.pop() {
738 Some(Group { concat, group, ignore_whitespace }) => {
739 (concat, group, ignore_whitespace, Some(alt))
740 }
741 None | Some(Alternation(_)) => {
742 return Err(self.error(
743 self.span_char(),
744 ast::ErrorKind::GroupUnopened,
745 ));
746 }
747 },
748 None => {
749 return Err(self
750 .error(self.span_char(), ast::ErrorKind::GroupUnopened));
751 }
752 };
753 self.parser().ignore_whitespace.set(ignore_whitespace);
754 group_concat.span.end = self.pos();
755 self.bump();
756 group.span.end = self.pos();
757 match alt {
758 Some(mut alt) => {
759 alt.span.end = group_concat.span.end;
760 alt.asts.push(group_concat.into_ast());
761 group.ast = Box::new(alt.into_ast());
762 }
763 None => {
764 group.ast = Box::new(group_concat.into_ast());
765 }
766 }
767 prior_concat.asts.push(Ast::Group(group));
768 Ok(prior_concat)
769 }
770
771 /// Pop the last state from the parser's internal stack, if it exists, and
772 /// add the given concatenation to it. There either must be no state or a
773 /// single alternation item on the stack. Any other scenario produces an
774 /// error.
775 ///
776 /// This assumes that the parser has advanced to the end.
777 #[inline(never)]
778 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
779 concat.span.end = self.pos();
780 let mut stack = self.parser().stack_group.borrow_mut();
781 let ast = match stack.pop() {
782 None => Ok(concat.into_ast()),
783 Some(GroupState::Alternation(mut alt)) => {
784 alt.span.end = self.pos();
785 alt.asts.push(concat.into_ast());
786 Ok(Ast::Alternation(alt))
787 }
788 Some(GroupState::Group { group, .. }) => {
789 return Err(
790 self.error(group.span, ast::ErrorKind::GroupUnclosed)
791 );
792 }
793 };
794 // If we try to pop again, there should be nothing.
795 match stack.pop() {
796 None => ast,
797 Some(GroupState::Alternation(_)) => {
798 // This unreachable is unfortunate. This case can't happen
799 // because the only way we can be here is if there were two
800 // `GroupState::Alternation`s adjacent in the parser's stack,
801 // which we guarantee to never happen because we never push a
802 // `GroupState::Alternation` if one is already at the top of
803 // the stack.
804 unreachable!()
805 }
806 Some(GroupState::Group { group, .. }) => {
807 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
808 }
809 }
810 }
811
812 /// Parse the opening of a character class and push the current class
813 /// parsing context onto the parser's stack. This assumes that the parser
814 /// is positioned at an opening `[`. The given union should correspond to
815 /// the union of set items built up before seeing the `[`.
816 ///
817 /// If there was a problem parsing the opening of the class, then an error
818 /// is returned. Otherwise, a new union of set items for the class is
819 /// returned (which may be populated with either a `]` or a `-`).
820 #[inline(never)]
821 fn push_class_open(
822 &self,
823 parent_union: ast::ClassSetUnion,
824 ) -> Result<ast::ClassSetUnion> {
825 assert_eq!(self.char(), '[');
826
827 let (nested_set, nested_union) = self.parse_set_class_open()?;
828 self.parser()
829 .stack_class
830 .borrow_mut()
831 .push(ClassState::Open { union: parent_union, set: nested_set });
832 Ok(nested_union)
833 }
834
835 /// Parse the end of a character class set and pop the character class
836 /// parser stack. The union given corresponds to the last union built
837 /// before seeing the closing `]`. The union returned corresponds to the
838 /// parent character class set with the nested class added to it.
839 ///
840 /// This assumes that the parser is positioned at a `]` and will advance
841 /// the parser to the byte immediately following the `]`.
842 ///
843 /// If the stack is empty after popping, then this returns the final
844 /// "top-level" character class AST (where a "top-level" character class
845 /// is one that is not nested inside any other character class).
846 ///
847 /// If there is no corresponding opening bracket on the parser's stack,
848 /// then an error is returned.
849 #[inline(never)]
850 fn pop_class(
851 &self,
852 nested_union: ast::ClassSetUnion,
853 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
854 assert_eq!(self.char(), ']');
855
856 let item = ast::ClassSet::Item(nested_union.into_item());
857 let prevset = self.pop_class_op(item);
858 let mut stack = self.parser().stack_class.borrow_mut();
859 match stack.pop() {
860 None => {
861 // We can never observe an empty stack:
862 //
863 // 1) We are guaranteed to start with a non-empty stack since
864 // the character class parser is only initiated when it sees
865 // a `[`.
866 // 2) If we ever observe an empty stack while popping after
867 // seeing a `]`, then we signal the character class parser
868 // to terminate.
869 panic!("unexpected empty character class stack")
870 }
871 Some(ClassState::Op { .. }) => {
872 // This panic is unfortunate, but this case is impossible
873 // since we already popped the Op state if one exists above.
874 // Namely, every push to the class parser stack is guarded by
875 // whether an existing Op is already on the top of the stack.
876 // If it is, the existing Op is modified. That is, the stack
877 // can never have consecutive Op states.
878 panic!("unexpected ClassState::Op")
879 }
880 Some(ClassState::Open { mut union, mut set }) => {
881 self.bump();
882 set.span.end = self.pos();
883 set.kind = prevset;
884 if stack.is_empty() {
885 Ok(Either::Right(ast::Class::Bracketed(set)))
886 } else {
887 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
888 Ok(Either::Left(union))
889 }
890 }
891 }
892 }
893
894 /// Return an "unclosed class" error whose span points to the most
895 /// recently opened class.
896 ///
897 /// This should only be called while parsing a character class.
898 #[inline(never)]
899 fn unclosed_class_error(&self) -> ast::Error {
900 for state in self.parser().stack_class.borrow().iter().rev() {
901 if let ClassState::Open { ref set, .. } = *state {
902 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
903 }
904 }
905 // We are guaranteed to have a non-empty stack with at least
906 // one open bracket, so we should never get here.
907 panic!("no open character class found")
908 }
909
910 /// Push the current set of class items on to the class parser's stack as
911 /// the left hand side of the given operator.
912 ///
913 /// A fresh set union is returned, which should be used to build the right
914 /// hand side of this operator.
915 #[inline(never)]
916 fn push_class_op(
917 &self,
918 next_kind: ast::ClassSetBinaryOpKind,
919 next_union: ast::ClassSetUnion,
920 ) -> ast::ClassSetUnion {
921 let item = ast::ClassSet::Item(next_union.into_item());
922 let new_lhs = self.pop_class_op(item);
923 self.parser()
924 .stack_class
925 .borrow_mut()
926 .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
927 ast::ClassSetUnion { span: self.span(), items: vec![] }
928 }
929
930 /// Pop a character class set from the character class parser stack. If the
931 /// top of the stack is just an item (not an operation), then return the
932 /// given set unchanged. If the top of the stack is an operation, then the
933 /// given set will be used as the rhs of the operation on the top of the
934 /// stack. In that case, the binary operation is returned as a set.
935 #[inline(never)]
936 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
937 let mut stack = self.parser().stack_class.borrow_mut();
938 let (kind, lhs) = match stack.pop() {
939 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
940 Some(state @ ClassState::Open { .. }) => {
941 stack.push(state);
942 return rhs;
943 }
944 None => unreachable!(),
945 };
946 let span = Span::new(lhs.span().start, rhs.span().end);
947 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
948 span,
949 kind,
950 lhs: Box::new(lhs),
951 rhs: Box::new(rhs),
952 })
953 }
954}
955
956impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
957 /// Parse the regular expression into an abstract syntax tree.
958 fn parse(&self) -> Result<Ast> {
959 self.parse_with_comments().map(|astc| astc.ast)
960 }
961
962 /// Parse the regular expression and return an abstract syntax tree with
963 /// all of the comments found in the pattern.
964 fn parse_with_comments(&self) -> Result<ast::WithComments> {
965 assert_eq!(self.offset(), 0, "parser can only be used once");
966 self.parser().reset();
967 let mut concat = ast::Concat { span: self.span(), asts: vec![] };
968 loop {
969 self.bump_space();
970 if self.is_eof() {
971 break;
972 }
973 match self.char() {
974 '(' => concat = self.push_group(concat)?,
975 ')' => concat = self.pop_group(concat)?,
976 '|' => concat = self.push_alternate(concat)?,
977 '[' => {
978 let class = self.parse_set_class()?;
979 concat.asts.push(Ast::Class(class));
980 }
981 '?' => {
982 concat = self.parse_uncounted_repetition(
983 concat,
984 ast::RepetitionKind::ZeroOrOne,
985 )?;
986 }
987 '*' => {
988 concat = self.parse_uncounted_repetition(
989 concat,
990 ast::RepetitionKind::ZeroOrMore,
991 )?;
992 }
993 '+' => {
994 concat = self.parse_uncounted_repetition(
995 concat,
996 ast::RepetitionKind::OneOrMore,
997 )?;
998 }
999 '{' => {
1000 concat = self.parse_counted_repetition(concat)?;
1001 }
1002 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1003 }
1004 }
1005 let ast = self.pop_group_end(concat)?;
1006 NestLimiter::new(self).check(&ast)?;
1007 Ok(ast::WithComments {
1008 ast,
1009 comments: mem::replace(
1010 &mut *self.parser().comments.borrow_mut(),
1011 vec![],
1012 ),
1013 })
1014 }
1015
1016 /// Parses an uncounted repetition operation. An uncounted repetition
1017 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1018 /// The given `kind` should correspond to the operator observed by the
1019 /// caller.
1020 ///
1021 /// This assumes that the parser is currently positioned at the repetition
1022 /// operator and advances the parser to the first character after the
1023 /// operator. (Note that the operator may include a single additional `?`,
1024 /// which makes the operator ungreedy.)
1025 ///
1026 /// The caller should include the concatenation that is being built. The
1027 /// concatenation returned includes the repetition operator applied to the
1028 /// last expression in the given concatenation.
1029 #[inline(never)]
1030 fn parse_uncounted_repetition(
1031 &self,
1032 mut concat: ast::Concat,
1033 kind: ast::RepetitionKind,
1034 ) -> Result<ast::Concat> {
1035 assert!(
1036 self.char() == '?' || self.char() == '*' || self.char() == '+'
1037 );
1038 let op_start = self.pos();
1039 let ast = match concat.asts.pop() {
1040 Some(ast) => ast,
1041 None => {
1042 return Err(
1043 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1044 )
1045 }
1046 };
1047 match ast {
1048 Ast::Empty(_) | Ast::Flags(_) => {
1049 return Err(
1050 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1051 )
1052 }
1053 _ => {}
1054 }
1055 let mut greedy = true;
1056 if self.bump() && self.char() == '?' {
1057 greedy = false;
1058 self.bump();
1059 }
1060 concat.asts.push(Ast::Repetition(ast::Repetition {
1061 span: ast.span().with_end(self.pos()),
1062 op: ast::RepetitionOp {
1063 span: Span::new(op_start, self.pos()),
1064 kind,
1065 },
1066 greedy,
1067 ast: Box::new(ast),
1068 }));
1069 Ok(concat)
1070 }
1071
1072 /// Parses a counted repetition operation. A counted repetition operator
1073 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1074 /// operators.
1075 ///
1076 /// This assumes that the parser is currently positioned at the opening `{`
1077 /// and advances the parser to the first character after the operator.
1078 /// (Note that the operator may include a single additional `?`, which
1079 /// makes the operator ungreedy.)
1080 ///
1081 /// The caller should include the concatenation that is being built. The
1082 /// concatenation returned includes the repetition operator applied to the
1083 /// last expression in the given concatenation.
1084 #[inline(never)]
1085 fn parse_counted_repetition(
1086 &self,
1087 mut concat: ast::Concat,
1088 ) -> Result<ast::Concat> {
1089 assert!(self.char() == '{');
1090 let start = self.pos();
1091 let ast = match concat.asts.pop() {
1092 Some(ast) => ast,
1093 None => {
1094 return Err(
1095 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1096 )
1097 }
1098 };
1099 match ast {
1100 Ast::Empty(_) | Ast::Flags(_) => {
1101 return Err(
1102 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1103 )
1104 }
1105 _ => {}
1106 }
1107 if !self.bump_and_bump_space() {
1108 return Err(self.error(
1109 Span::new(start, self.pos()),
1110 ast::ErrorKind::RepetitionCountUnclosed,
1111 ));
1112 }
1113 let count_start = specialize_err(
1114 self.parse_decimal(),
1115 ast::ErrorKind::DecimalEmpty,
1116 ast::ErrorKind::RepetitionCountDecimalEmpty,
1117 )?;
1118 let mut range = ast::RepetitionRange::Exactly(count_start);
1119 if self.is_eof() {
1120 return Err(self.error(
1121 Span::new(start, self.pos()),
1122 ast::ErrorKind::RepetitionCountUnclosed,
1123 ));
1124 }
1125 if self.char() == ',' {
1126 if !self.bump_and_bump_space() {
1127 return Err(self.error(
1128 Span::new(start, self.pos()),
1129 ast::ErrorKind::RepetitionCountUnclosed,
1130 ));
1131 }
1132 if self.char() != '}' {
1133 let count_end = specialize_err(
1134 self.parse_decimal(),
1135 ast::ErrorKind::DecimalEmpty,
1136 ast::ErrorKind::RepetitionCountDecimalEmpty,
1137 )?;
1138 range = ast::RepetitionRange::Bounded(count_start, count_end);
1139 } else {
1140 range = ast::RepetitionRange::AtLeast(count_start);
1141 }
1142 }
1143 if self.is_eof() || self.char() != '}' {
1144 return Err(self.error(
1145 Span::new(start, self.pos()),
1146 ast::ErrorKind::RepetitionCountUnclosed,
1147 ));
1148 }
1149
1150 let mut greedy = true;
1151 if self.bump_and_bump_space() && self.char() == '?' {
1152 greedy = false;
1153 self.bump();
1154 }
1155
1156 let op_span = Span::new(start, self.pos());
1157 if !range.is_valid() {
1158 return Err(
1159 self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1160 );
1161 }
1162 concat.asts.push(Ast::Repetition(ast::Repetition {
1163 span: ast.span().with_end(self.pos()),
1164 op: ast::RepetitionOp {
1165 span: op_span,
1166 kind: ast::RepetitionKind::Range(range),
1167 },
1168 greedy,
1169 ast: Box::new(ast),
1170 }));
1171 Ok(concat)
1172 }
1173
1174 /// Parse a group (which contains a sub-expression) or a set of flags.
1175 ///
1176 /// If a group was found, then it is returned with an empty AST. If a set
1177 /// of flags is found, then that set is returned.
1178 ///
1179 /// The parser should be positioned at the opening parenthesis.
1180 ///
1181 /// This advances the parser to the character before the start of the
1182 /// sub-expression (in the case of a group) or to the closing parenthesis
1183 /// immediately following the set of flags.
1184 ///
1185 /// # Errors
1186 ///
1187 /// If flags are given and incorrectly specified, then a corresponding
1188 /// error is returned.
1189 ///
1190 /// If a capture name is given and it is incorrectly specified, then a
1191 /// corresponding error is returned.
1192 #[inline(never)]
1193 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1194 assert_eq!(self.char(), '(');
1195 let open_span = self.span_char();
1196 self.bump();
1197 self.bump_space();
1198 if self.is_lookaround_prefix() {
1199 return Err(self.error(
1200 Span::new(open_span.start, self.span().end),
1201 ast::ErrorKind::UnsupportedLookAround,
1202 ));
1203 }
1204 let inner_span = self.span();
1205 let mut starts_with_p = true;
1206 if self.bump_if("?P<") || {
1207 starts_with_p = false;
1208 self.bump_if("?<")
1209 } {
1210 let capture_index = self.next_capture_index(open_span)?;
1211 let name = self.parse_capture_name(capture_index)?;
1212 Ok(Either::Right(ast::Group {
1213 span: open_span,
1214 kind: ast::GroupKind::CaptureName { starts_with_p, name },
1215 ast: Box::new(Ast::Empty(self.span())),
1216 }))
1217 } else if self.bump_if("?") {
1218 if self.is_eof() {
1219 return Err(
1220 self.error(open_span, ast::ErrorKind::GroupUnclosed)
1221 );
1222 }
1223 let flags = self.parse_flags()?;
1224 let char_end = self.char();
1225 self.bump();
1226 if char_end == ')' {
1227 // We don't allow empty flags, e.g., `(?)`. We instead
1228 // interpret it as a repetition operator missing its argument.
1229 if flags.items.is_empty() {
1230 return Err(self.error(
1231 inner_span,
1232 ast::ErrorKind::RepetitionMissing,
1233 ));
1234 }
1235 Ok(Either::Left(ast::SetFlags {
1236 span: Span { end: self.pos(), ..open_span },
1237 flags,
1238 }))
1239 } else {
1240 assert_eq!(char_end, ':');
1241 Ok(Either::Right(ast::Group {
1242 span: open_span,
1243 kind: ast::GroupKind::NonCapturing(flags),
1244 ast: Box::new(Ast::Empty(self.span())),
1245 }))
1246 }
1247 } else {
1248 let capture_index = self.next_capture_index(open_span)?;
1249 Ok(Either::Right(ast::Group {
1250 span: open_span,
1251 kind: ast::GroupKind::CaptureIndex(capture_index),
1252 ast: Box::new(Ast::Empty(self.span())),
1253 }))
1254 }
1255 }
1256
1257 /// Parses a capture group name. Assumes that the parser is positioned at
1258 /// the first character in the name following the opening `<` (and may
1259 /// possibly be EOF). This advances the parser to the first character
1260 /// following the closing `>`.
1261 ///
1262 /// The caller must provide the capture index of the group for this name.
1263 #[inline(never)]
1264 fn parse_capture_name(
1265 &self,
1266 capture_index: u32,
1267 ) -> Result<ast::CaptureName> {
1268 if self.is_eof() {
1269 return Err(self
1270 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1271 }
1272 let start = self.pos();
1273 loop {
1274 if self.char() == '>' {
1275 break;
1276 }
1277 if !is_capture_char(self.char(), self.pos() == start) {
1278 return Err(self.error(
1279 self.span_char(),
1280 ast::ErrorKind::GroupNameInvalid,
1281 ));
1282 }
1283 if !self.bump() {
1284 break;
1285 }
1286 }
1287 let end = self.pos();
1288 if self.is_eof() {
1289 return Err(self
1290 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1291 }
1292 assert_eq!(self.char(), '>');
1293 self.bump();
1294 let name = &self.pattern()[start.offset..end.offset];
1295 if name.is_empty() {
1296 return Err(self.error(
1297 Span::new(start, start),
1298 ast::ErrorKind::GroupNameEmpty,
1299 ));
1300 }
1301 let capname = ast::CaptureName {
1302 span: Span::new(start, end),
1303 name: name.to_string(),
1304 index: capture_index,
1305 };
1306 self.add_capture_name(&capname)?;
1307 Ok(capname)
1308 }
1309
1310 /// Parse a sequence of flags starting at the current character.
1311 ///
1312 /// This advances the parser to the character immediately following the
1313 /// flags, which is guaranteed to be either `:` or `)`.
1314 ///
1315 /// # Errors
1316 ///
1317 /// If any flags are duplicated, then an error is returned.
1318 ///
1319 /// If the negation operator is used more than once, then an error is
1320 /// returned.
1321 ///
1322 /// If no flags could be found or if the negation operation is not followed
1323 /// by any flags, then an error is returned.
1324 #[inline(never)]
1325 fn parse_flags(&self) -> Result<ast::Flags> {
1326 let mut flags = ast::Flags { span: self.span(), items: vec![] };
1327 let mut last_was_negation = None;
1328 while self.char() != ':' && self.char() != ')' {
1329 if self.char() == '-' {
1330 last_was_negation = Some(self.span_char());
1331 let item = ast::FlagsItem {
1332 span: self.span_char(),
1333 kind: ast::FlagsItemKind::Negation,
1334 };
1335 if let Some(i) = flags.add_item(item) {
1336 return Err(self.error(
1337 self.span_char(),
1338 ast::ErrorKind::FlagRepeatedNegation {
1339 original: flags.items[i].span,
1340 },
1341 ));
1342 }
1343 } else {
1344 last_was_negation = None;
1345 let item = ast::FlagsItem {
1346 span: self.span_char(),
1347 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1348 };
1349 if let Some(i) = flags.add_item(item) {
1350 return Err(self.error(
1351 self.span_char(),
1352 ast::ErrorKind::FlagDuplicate {
1353 original: flags.items[i].span,
1354 },
1355 ));
1356 }
1357 }
1358 if !self.bump() {
1359 return Err(
1360 self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1361 );
1362 }
1363 }
1364 if let Some(span) = last_was_negation {
1365 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1366 }
1367 flags.span.end = self.pos();
1368 Ok(flags)
1369 }
1370
1371 /// Parse the current character as a flag. Do not advance the parser.
1372 ///
1373 /// # Errors
1374 ///
1375 /// If the flag is not recognized, then an error is returned.
1376 #[inline(never)]
1377 fn parse_flag(&self) -> Result<ast::Flag> {
1378 match self.char() {
1379 'i' => Ok(ast::Flag::CaseInsensitive),
1380 'm' => Ok(ast::Flag::MultiLine),
1381 's' => Ok(ast::Flag::DotMatchesNewLine),
1382 'U' => Ok(ast::Flag::SwapGreed),
1383 'u' => Ok(ast::Flag::Unicode),
1384 'R' => Ok(ast::Flag::CRLF),
1385 'x' => Ok(ast::Flag::IgnoreWhitespace),
1386 _ => {
1387 Err(self
1388 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1389 }
1390 }
1391 }
1392
1393 /// Parse a primitive AST. e.g., A literal, non-set character class or
1394 /// assertion.
1395 ///
1396 /// This assumes that the parser expects a primitive at the current
1397 /// location. i.e., All other non-primitive cases have been handled.
1398 /// For example, if the parser's position is at `|`, then `|` will be
1399 /// treated as a literal (e.g., inside a character class).
1400 ///
1401 /// This advances the parser to the first character immediately following
1402 /// the primitive.
1403 fn parse_primitive(&self) -> Result<Primitive> {
1404 match self.char() {
1405 '\\' => self.parse_escape(),
1406 '.' => {
1407 let ast = Primitive::Dot(self.span_char());
1408 self.bump();
1409 Ok(ast)
1410 }
1411 '^' => {
1412 let ast = Primitive::Assertion(ast::Assertion {
1413 span: self.span_char(),
1414 kind: ast::AssertionKind::StartLine,
1415 });
1416 self.bump();
1417 Ok(ast)
1418 }
1419 '$' => {
1420 let ast = Primitive::Assertion(ast::Assertion {
1421 span: self.span_char(),
1422 kind: ast::AssertionKind::EndLine,
1423 });
1424 self.bump();
1425 Ok(ast)
1426 }
1427 c => {
1428 let ast = Primitive::Literal(ast::Literal {
1429 span: self.span_char(),
1430 kind: ast::LiteralKind::Verbatim,
1431 c,
1432 });
1433 self.bump();
1434 Ok(ast)
1435 }
1436 }
1437 }
1438
1439 /// Parse an escape sequence as a primitive AST.
1440 ///
1441 /// This assumes the parser is positioned at the start of the escape
1442 /// sequence, i.e., `\`. It advances the parser to the first position
1443 /// immediately following the escape sequence.
1444 #[inline(never)]
1445 fn parse_escape(&self) -> Result<Primitive> {
1446 assert_eq!(self.char(), '\\');
1447 let start = self.pos();
1448 if !self.bump() {
1449 return Err(self.error(
1450 Span::new(start, self.pos()),
1451 ast::ErrorKind::EscapeUnexpectedEof,
1452 ));
1453 }
1454 let c = self.char();
1455 // Put some of the more complicated routines into helpers.
1456 match c {
1457 '0'..='7' => {
1458 if !self.parser().octal {
1459 return Err(self.error(
1460 Span::new(start, self.span_char().end),
1461 ast::ErrorKind::UnsupportedBackreference,
1462 ));
1463 }
1464 let mut lit = self.parse_octal();
1465 lit.span.start = start;
1466 return Ok(Primitive::Literal(lit));
1467 }
1468 '8'..='9' if !self.parser().octal => {
1469 return Err(self.error(
1470 Span::new(start, self.span_char().end),
1471 ast::ErrorKind::UnsupportedBackreference,
1472 ));
1473 }
1474 'x' | 'u' | 'U' => {
1475 let mut lit = self.parse_hex()?;
1476 lit.span.start = start;
1477 return Ok(Primitive::Literal(lit));
1478 }
1479 'p' | 'P' => {
1480 let mut cls = self.parse_unicode_class()?;
1481 cls.span.start = start;
1482 return Ok(Primitive::Unicode(cls));
1483 }
1484 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1485 let mut cls = self.parse_perl_class();
1486 cls.span.start = start;
1487 return Ok(Primitive::Perl(cls));
1488 }
1489 _ => {}
1490 }
1491
1492 // Handle all of the one letter sequences inline.
1493 self.bump();
1494 let span = Span::new(start, self.pos());
1495 if is_meta_character(c) {
1496 return Ok(Primitive::Literal(ast::Literal {
1497 span,
1498 kind: ast::LiteralKind::Meta,
1499 c,
1500 }));
1501 }
1502 if is_escapeable_character(c) {
1503 return Ok(Primitive::Literal(ast::Literal {
1504 span,
1505 kind: ast::LiteralKind::Superfluous,
1506 c,
1507 }));
1508 }
1509 let special = |kind, c| {
1510 Ok(Primitive::Literal(ast::Literal {
1511 span,
1512 kind: ast::LiteralKind::Special(kind),
1513 c,
1514 }))
1515 };
1516 match c {
1517 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1518 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1519 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1520 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1521 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1522 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1523 'A' => Ok(Primitive::Assertion(ast::Assertion {
1524 span,
1525 kind: ast::AssertionKind::StartText,
1526 })),
1527 'z' => Ok(Primitive::Assertion(ast::Assertion {
1528 span,
1529 kind: ast::AssertionKind::EndText,
1530 })),
1531 'b' => Ok(Primitive::Assertion(ast::Assertion {
1532 span,
1533 kind: ast::AssertionKind::WordBoundary,
1534 })),
1535 'B' => Ok(Primitive::Assertion(ast::Assertion {
1536 span,
1537 kind: ast::AssertionKind::NotWordBoundary,
1538 })),
1539 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1540 }
1541 }
1542
1543 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1544 /// long. This expects the parser to be positioned at the first octal
1545 /// digit and advances the parser to the first character immediately
1546 /// following the octal number. This also assumes that parsing octal
1547 /// escapes is enabled.
1548 ///
1549 /// Assuming the preconditions are met, this routine can never fail.
1550 #[inline(never)]
1551 fn parse_octal(&self) -> ast::Literal {
1552 assert!(self.parser().octal);
1553 assert!('0' <= self.char() && self.char() <= '7');
1554 let start = self.pos();
1555 // Parse up to two more digits.
1556 while self.bump()
1557 && '0' <= self.char()
1558 && self.char() <= '7'
1559 && self.pos().offset - start.offset <= 2
1560 {}
1561 let end = self.pos();
1562 let octal = &self.pattern()[start.offset..end.offset];
1563 // Parsing the octal should never fail since the above guarantees a
1564 // valid number.
1565 let codepoint =
1566 u32::from_str_radix(octal, 8).expect("valid octal number");
1567 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1568 // invalid Unicode scalar values.
1569 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1570 ast::Literal {
1571 span: Span::new(start, end),
1572 kind: ast::LiteralKind::Octal,
1573 c,
1574 }
1575 }
1576
1577 /// Parse a hex representation of a Unicode codepoint. This handles both
1578 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1579 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1580 /// the first character immediately following the hexadecimal literal.
1581 #[inline(never)]
1582 fn parse_hex(&self) -> Result<ast::Literal> {
1583 assert!(
1584 self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1585 );
1586
1587 let hex_kind = match self.char() {
1588 'x' => ast::HexLiteralKind::X,
1589 'u' => ast::HexLiteralKind::UnicodeShort,
1590 _ => ast::HexLiteralKind::UnicodeLong,
1591 };
1592 if !self.bump_and_bump_space() {
1593 return Err(
1594 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1595 );
1596 }
1597 if self.char() == '{' {
1598 self.parse_hex_brace(hex_kind)
1599 } else {
1600 self.parse_hex_digits(hex_kind)
1601 }
1602 }
1603
1604 /// Parse an N-digit hex representation of a Unicode codepoint. This
1605 /// expects the parser to be positioned at the first digit and will advance
1606 /// the parser to the first character immediately following the escape
1607 /// sequence.
1608 ///
1609 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1610 /// or 8 (for `\UNNNNNNNN`).
1611 #[inline(never)]
1612 fn parse_hex_digits(
1613 &self,
1614 kind: ast::HexLiteralKind,
1615 ) -> Result<ast::Literal> {
1616 let mut scratch = self.parser().scratch.borrow_mut();
1617 scratch.clear();
1618
1619 let start = self.pos();
1620 for i in 0..kind.digits() {
1621 if i > 0 && !self.bump_and_bump_space() {
1622 return Err(self
1623 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1624 }
1625 if !is_hex(self.char()) {
1626 return Err(self.error(
1627 self.span_char(),
1628 ast::ErrorKind::EscapeHexInvalidDigit,
1629 ));
1630 }
1631 scratch.push(self.char());
1632 }
1633 // The final bump just moves the parser past the literal, which may
1634 // be EOF.
1635 self.bump_and_bump_space();
1636 let end = self.pos();
1637 let hex = scratch.as_str();
1638 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1639 None => Err(self.error(
1640 Span::new(start, end),
1641 ast::ErrorKind::EscapeHexInvalid,
1642 )),
1643 Some(c) => Ok(ast::Literal {
1644 span: Span::new(start, end),
1645 kind: ast::LiteralKind::HexFixed(kind),
1646 c,
1647 }),
1648 }
1649 }
1650
1651 /// Parse a hex representation of any Unicode scalar value. This expects
1652 /// the parser to be positioned at the opening brace `{` and will advance
1653 /// the parser to the first character following the closing brace `}`.
1654 #[inline(never)]
1655 fn parse_hex_brace(
1656 &self,
1657 kind: ast::HexLiteralKind,
1658 ) -> Result<ast::Literal> {
1659 let mut scratch = self.parser().scratch.borrow_mut();
1660 scratch.clear();
1661
1662 let brace_pos = self.pos();
1663 let start = self.span_char().end;
1664 while self.bump_and_bump_space() && self.char() != '}' {
1665 if !is_hex(self.char()) {
1666 return Err(self.error(
1667 self.span_char(),
1668 ast::ErrorKind::EscapeHexInvalidDigit,
1669 ));
1670 }
1671 scratch.push(self.char());
1672 }
1673 if self.is_eof() {
1674 return Err(self.error(
1675 Span::new(brace_pos, self.pos()),
1676 ast::ErrorKind::EscapeUnexpectedEof,
1677 ));
1678 }
1679 let end = self.pos();
1680 let hex = scratch.as_str();
1681 assert_eq!(self.char(), '}');
1682 self.bump_and_bump_space();
1683
1684 if hex.is_empty() {
1685 return Err(self.error(
1686 Span::new(brace_pos, self.pos()),
1687 ast::ErrorKind::EscapeHexEmpty,
1688 ));
1689 }
1690 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1691 None => Err(self.error(
1692 Span::new(start, end),
1693 ast::ErrorKind::EscapeHexInvalid,
1694 )),
1695 Some(c) => Ok(ast::Literal {
1696 span: Span::new(start, self.pos()),
1697 kind: ast::LiteralKind::HexBrace(kind),
1698 c,
1699 }),
1700 }
1701 }
1702
1703 /// Parse a decimal number into a u32 while trimming leading and trailing
1704 /// whitespace.
1705 ///
1706 /// This expects the parser to be positioned at the first position where
1707 /// a decimal digit could occur. This will advance the parser to the byte
1708 /// immediately following the last contiguous decimal digit.
1709 ///
1710 /// If no decimal digit could be found or if there was a problem parsing
1711 /// the complete set of digits into a u32, then an error is returned.
1712 fn parse_decimal(&self) -> Result<u32> {
1713 let mut scratch = self.parser().scratch.borrow_mut();
1714 scratch.clear();
1715
1716 while !self.is_eof() && self.char().is_whitespace() {
1717 self.bump();
1718 }
1719 let start = self.pos();
1720 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1721 scratch.push(self.char());
1722 self.bump_and_bump_space();
1723 }
1724 let span = Span::new(start, self.pos());
1725 while !self.is_eof() && self.char().is_whitespace() {
1726 self.bump_and_bump_space();
1727 }
1728 let digits = scratch.as_str();
1729 if digits.is_empty() {
1730 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1731 }
1732 match u32::from_str_radix(digits, 10).ok() {
1733 Some(n) => Ok(n),
1734 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1735 }
1736 }
1737
1738 /// Parse a standard character class consisting primarily of characters or
1739 /// character ranges, but can also contain nested character classes of
1740 /// any type (sans `.`).
1741 ///
1742 /// This assumes the parser is positioned at the opening `[`. If parsing
1743 /// is successful, then the parser is advanced to the position immediately
1744 /// following the closing `]`.
1745 #[inline(never)]
1746 fn parse_set_class(&self) -> Result<ast::Class> {
1747 assert_eq!(self.char(), '[');
1748
1749 let mut union =
1750 ast::ClassSetUnion { span: self.span(), items: vec![] };
1751 loop {
1752 self.bump_space();
1753 if self.is_eof() {
1754 return Err(self.unclosed_class_error());
1755 }
1756 match self.char() {
1757 '[' => {
1758 // If we've already parsed the opening bracket, then
1759 // attempt to treat this as the beginning of an ASCII
1760 // class. If ASCII class parsing fails, then the parser
1761 // backs up to `[`.
1762 if !self.parser().stack_class.borrow().is_empty() {
1763 if let Some(cls) = self.maybe_parse_ascii_class() {
1764 union.push(ast::ClassSetItem::Ascii(cls));
1765 continue;
1766 }
1767 }
1768 union = self.push_class_open(union)?;
1769 }
1770 ']' => match self.pop_class(union)? {
1771 Either::Left(nested_union) => {
1772 union = nested_union;
1773 }
1774 Either::Right(class) => return Ok(class),
1775 },
1776 '&' if self.peek() == Some('&') => {
1777 assert!(self.bump_if("&&"));
1778 union = self.push_class_op(
1779 ast::ClassSetBinaryOpKind::Intersection,
1780 union,
1781 );
1782 }
1783 '-' if self.peek() == Some('-') => {
1784 assert!(self.bump_if("--"));
1785 union = self.push_class_op(
1786 ast::ClassSetBinaryOpKind::Difference,
1787 union,
1788 );
1789 }
1790 '~' if self.peek() == Some('~') => {
1791 assert!(self.bump_if("~~"));
1792 union = self.push_class_op(
1793 ast::ClassSetBinaryOpKind::SymmetricDifference,
1794 union,
1795 );
1796 }
1797 _ => {
1798 union.push(self.parse_set_class_range()?);
1799 }
1800 }
1801 }
1802 }
1803
1804 /// Parse a single primitive item in a character class set. The item to
1805 /// be parsed can either be one of a simple literal character, a range
1806 /// between two simple literal characters or a "primitive" character
1807 /// class like \w or \p{Greek}.
1808 ///
1809 /// If an invalid escape is found, or if a character class is found where
1810 /// a simple literal is expected (e.g., in a range), then an error is
1811 /// returned.
1812 #[inline(never)]
1813 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1814 let prim1 = self.parse_set_class_item()?;
1815 self.bump_space();
1816 if self.is_eof() {
1817 return Err(self.unclosed_class_error());
1818 }
1819 // If the next char isn't a `-`, then we don't have a range.
1820 // There are two exceptions. If the char after a `-` is a `]`, then
1821 // `-` is interpreted as a literal `-`. Alternatively, if the char
1822 // after a `-` is a `-`, then `--` corresponds to a "difference"
1823 // operation.
1824 if self.char() != '-'
1825 || self.peek_space() == Some(']')
1826 || self.peek_space() == Some('-')
1827 {
1828 return prim1.into_class_set_item(self);
1829 }
1830 // OK, now we're parsing a range, so bump past the `-` and parse the
1831 // second half of the range.
1832 if !self.bump_and_bump_space() {
1833 return Err(self.unclosed_class_error());
1834 }
1835 let prim2 = self.parse_set_class_item()?;
1836 let range = ast::ClassSetRange {
1837 span: Span::new(prim1.span().start, prim2.span().end),
1838 start: prim1.into_class_literal(self)?,
1839 end: prim2.into_class_literal(self)?,
1840 };
1841 if !range.is_valid() {
1842 return Err(
1843 self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1844 );
1845 }
1846 Ok(ast::ClassSetItem::Range(range))
1847 }
1848
1849 /// Parse a single item in a character class as a primitive, where the
1850 /// primitive either consists of a verbatim literal or a single escape
1851 /// sequence.
1852 ///
1853 /// This assumes the parser is positioned at the beginning of a primitive,
1854 /// and advances the parser to the first position after the primitive if
1855 /// successful.
1856 ///
1857 /// Note that it is the caller's responsibility to report an error if an
1858 /// illegal primitive was parsed.
1859 #[inline(never)]
1860 fn parse_set_class_item(&self) -> Result<Primitive> {
1861 if self.char() == '\\' {
1862 self.parse_escape()
1863 } else {
1864 let x = Primitive::Literal(ast::Literal {
1865 span: self.span_char(),
1866 kind: ast::LiteralKind::Verbatim,
1867 c: self.char(),
1868 });
1869 self.bump();
1870 Ok(x)
1871 }
1872 }
1873
1874 /// Parses the opening of a character class set. This includes the opening
1875 /// bracket along with `^` if present to indicate negation. This also
1876 /// starts parsing the opening set of unioned items if applicable, since
1877 /// there are special rules applied to certain characters in the opening
1878 /// of a character class. For example, `[^]]` is the class of all
1879 /// characters not equal to `]`. (`]` would need to be escaped in any other
1880 /// position.) Similarly for `-`.
1881 ///
1882 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1883 /// empty union. This empty union should be replaced with the actual item
1884 /// when it is popped from the parser's stack.
1885 ///
1886 /// This assumes the parser is positioned at the opening `[` and advances
1887 /// the parser to the first non-special byte of the character class.
1888 ///
1889 /// An error is returned if EOF is found.
1890 #[inline(never)]
1891 fn parse_set_class_open(
1892 &self,
1893 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1894 assert_eq!(self.char(), '[');
1895 let start = self.pos();
1896 if !self.bump_and_bump_space() {
1897 return Err(self.error(
1898 Span::new(start, self.pos()),
1899 ast::ErrorKind::ClassUnclosed,
1900 ));
1901 }
1902
1903 let negated = if self.char() != '^' {
1904 false
1905 } else {
1906 if !self.bump_and_bump_space() {
1907 return Err(self.error(
1908 Span::new(start, self.pos()),
1909 ast::ErrorKind::ClassUnclosed,
1910 ));
1911 }
1912 true
1913 };
1914 // Accept any number of `-` as literal `-`.
1915 let mut union =
1916 ast::ClassSetUnion { span: self.span(), items: vec![] };
1917 while self.char() == '-' {
1918 union.push(ast::ClassSetItem::Literal(ast::Literal {
1919 span: self.span_char(),
1920 kind: ast::LiteralKind::Verbatim,
1921 c: '-',
1922 }));
1923 if !self.bump_and_bump_space() {
1924 return Err(self.error(
1925 Span::new(start, start),
1926 ast::ErrorKind::ClassUnclosed,
1927 ));
1928 }
1929 }
1930 // If `]` is the *first* char in a set, then interpret it as a literal
1931 // `]`. That is, an empty class is impossible to write.
1932 if union.items.is_empty() && self.char() == ']' {
1933 union.push(ast::ClassSetItem::Literal(ast::Literal {
1934 span: self.span_char(),
1935 kind: ast::LiteralKind::Verbatim,
1936 c: ']',
1937 }));
1938 if !self.bump_and_bump_space() {
1939 return Err(self.error(
1940 Span::new(start, self.pos()),
1941 ast::ErrorKind::ClassUnclosed,
1942 ));
1943 }
1944 }
1945 let set = ast::ClassBracketed {
1946 span: Span::new(start, self.pos()),
1947 negated,
1948 kind: ast::ClassSet::union(ast::ClassSetUnion {
1949 span: Span::new(union.span.start, union.span.start),
1950 items: vec![],
1951 }),
1952 };
1953 Ok((set, union))
1954 }
1955
1956 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1957 ///
1958 /// This assumes the parser is positioned at the opening `[`.
1959 ///
1960 /// If no valid ASCII character class could be found, then this does not
1961 /// advance the parser and `None` is returned. Otherwise, the parser is
1962 /// advanced to the first byte following the closing `]` and the
1963 /// corresponding ASCII class is returned.
1964 #[inline(never)]
1965 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1966 // ASCII character classes are interesting from a parsing perspective
1967 // because parsing cannot fail with any interesting error. For example,
1968 // in order to use an ASCII character class, it must be enclosed in
1969 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1970 // of it as "ASCII character characters have the syntax `[:NAME:]`
1971 // which can only appear within character brackets." This means that
1972 // things like `[[:lower:]A]` are legal constructs.
1973 //
1974 // However, if one types an incorrect ASCII character class, e.g.,
1975 // `[[:loower:]]`, then we treat that as a normal nested character
1976 // class containing the characters `:elorw`. One might argue that we
1977 // should return an error instead since the repeated colons give away
1978 // the intent to write an ASCII class. But what if the user typed
1979 // `[[:lower]]` instead? How can we tell that was intended to be an
1980 // ASCII class and not just a normal nested class?
1981 //
1982 // Reasonable people can probably disagree over this, but for better
1983 // or worse, we implement semantics that never fails at the expense
1984 // of better failure modes.
1985 assert_eq!(self.char(), '[');
1986 // If parsing fails, then we back up the parser to this starting point.
1987 let start = self.pos();
1988 let mut negated = false;
1989 if !self.bump() || self.char() != ':' {
1990 self.parser().pos.set(start);
1991 return None;
1992 }
1993 if !self.bump() {
1994 self.parser().pos.set(start);
1995 return None;
1996 }
1997 if self.char() == '^' {
1998 negated = true;
1999 if !self.bump() {
2000 self.parser().pos.set(start);
2001 return None;
2002 }
2003 }
2004 let name_start = self.offset();
2005 while self.char() != ':' && self.bump() {}
2006 if self.is_eof() {
2007 self.parser().pos.set(start);
2008 return None;
2009 }
2010 let name = &self.pattern()[name_start..self.offset()];
2011 if !self.bump_if(":]") {
2012 self.parser().pos.set(start);
2013 return None;
2014 }
2015 let kind = match ast::ClassAsciiKind::from_name(name) {
2016 Some(kind) => kind,
2017 None => {
2018 self.parser().pos.set(start);
2019 return None;
2020 }
2021 };
2022 Some(ast::ClassAscii {
2023 span: Span::new(start, self.pos()),
2024 kind,
2025 negated,
2026 })
2027 }
2028
2029 /// Parse a Unicode class in either the single character notation, `\pN`
2030 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2031 /// the parser is positioned at the `p` (or `P` for negation) and will
2032 /// advance the parser to the character immediately following the class.
2033 ///
2034 /// Note that this does not check whether the class name is valid or not.
2035 #[inline(never)]
2036 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2037 assert!(self.char() == 'p' || self.char() == 'P');
2038
2039 let mut scratch = self.parser().scratch.borrow_mut();
2040 scratch.clear();
2041
2042 let negated = self.char() == 'P';
2043 if !self.bump_and_bump_space() {
2044 return Err(
2045 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2046 );
2047 }
2048 let (start, kind) = if self.char() == '{' {
2049 let start = self.span_char().end;
2050 while self.bump_and_bump_space() && self.char() != '}' {
2051 scratch.push(self.char());
2052 }
2053 if self.is_eof() {
2054 return Err(self
2055 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2056 }
2057 assert_eq!(self.char(), '}');
2058 self.bump();
2059
2060 let name = scratch.as_str();
2061 if let Some(i) = name.find("!=") {
2062 (
2063 start,
2064 ast::ClassUnicodeKind::NamedValue {
2065 op: ast::ClassUnicodeOpKind::NotEqual,
2066 name: name[..i].to_string(),
2067 value: name[i + 2..].to_string(),
2068 },
2069 )
2070 } else if let Some(i) = name.find(':') {
2071 (
2072 start,
2073 ast::ClassUnicodeKind::NamedValue {
2074 op: ast::ClassUnicodeOpKind::Colon,
2075 name: name[..i].to_string(),
2076 value: name[i + 1..].to_string(),
2077 },
2078 )
2079 } else if let Some(i) = name.find('=') {
2080 (
2081 start,
2082 ast::ClassUnicodeKind::NamedValue {
2083 op: ast::ClassUnicodeOpKind::Equal,
2084 name: name[..i].to_string(),
2085 value: name[i + 1..].to_string(),
2086 },
2087 )
2088 } else {
2089 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2090 }
2091 } else {
2092 let start = self.pos();
2093 let c = self.char();
2094 if c == '\\' {
2095 return Err(self.error(
2096 self.span_char(),
2097 ast::ErrorKind::UnicodeClassInvalid,
2098 ));
2099 }
2100 self.bump_and_bump_space();
2101 let kind = ast::ClassUnicodeKind::OneLetter(c);
2102 (start, kind)
2103 };
2104 Ok(ast::ClassUnicode {
2105 span: Span::new(start, self.pos()),
2106 negated,
2107 kind,
2108 })
2109 }
2110
2111 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2112 /// parser is currently at a valid character class name and will be
2113 /// advanced to the character immediately following the class.
2114 #[inline(never)]
2115 fn parse_perl_class(&self) -> ast::ClassPerl {
2116 let c = self.char();
2117 let span = self.span_char();
2118 self.bump();
2119 let (negated, kind) = match c {
2120 'd' => (false, ast::ClassPerlKind::Digit),
2121 'D' => (true, ast::ClassPerlKind::Digit),
2122 's' => (false, ast::ClassPerlKind::Space),
2123 'S' => (true, ast::ClassPerlKind::Space),
2124 'w' => (false, ast::ClassPerlKind::Word),
2125 'W' => (true, ast::ClassPerlKind::Word),
2126 c => panic!("expected valid Perl class but got '{}'", c),
2127 };
2128 ast::ClassPerl { span, kind, negated }
2129 }
2130}
2131
/// A type that traverses a fully parsed Ast and checks whether its depth
/// exceeds the specified nesting limit. If it does, then an error is returned.
///
/// The limit itself is not stored here; it is read from the parser's
/// `nest_limit` setting each time the depth is incremented.
#[derive(Debug)]
struct NestLimiter<'p, 's, P> {
    /// The parser that is checking the nest limit. It supplies the configured
    /// limit and is used to construct the error value.
    p: &'p ParserI<'s, P>,
    /// The current depth while walking an Ast. Starts at zero.
    depth: u32,
}
2141
2142impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
2143 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2144 NestLimiter { p, depth: 0 }
2145 }
2146
2147 #[inline(never)]
2148 fn check(self, ast: &Ast) -> Result<()> {
2149 ast::visit(ast, self)
2150 }
2151
2152 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2153 let new = self.depth.checked_add(1).ok_or_else(|| {
2154 self.p.error(
2155 span.clone(),
2156 ast::ErrorKind::NestLimitExceeded(u32::MAX),
2157 )
2158 })?;
2159 let limit = self.p.parser().nest_limit;
2160 if new > limit {
2161 return Err(self.p.error(
2162 span.clone(),
2163 ast::ErrorKind::NestLimitExceeded(limit),
2164 ));
2165 }
2166 self.depth = new;
2167 Ok(())
2168 }
2169
2170 fn decrement_depth(&mut self) {
2171 // Assuming the correctness of the visitor, this should never drop
2172 // below 0.
2173 self.depth = self.depth.checked_sub(1).unwrap();
2174 }
2175}
2176
2177impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2178 type Output = ();
2179 type Err = ast::Error;
2180
2181 fn finish(self) -> Result<()> {
2182 Ok(())
2183 }
2184
2185 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2186 let span = match *ast {
2187 Ast::Empty(_)
2188 | Ast::Flags(_)
2189 | Ast::Literal(_)
2190 | Ast::Dot(_)
2191 | Ast::Assertion(_)
2192 | Ast::Class(ast::Class::Unicode(_))
2193 | Ast::Class(ast::Class::Perl(_)) => {
2194 // These are all base cases, so we don't increment depth.
2195 return Ok(());
2196 }
2197 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2198 Ast::Repetition(ref x) => &x.span,
2199 Ast::Group(ref x) => &x.span,
2200 Ast::Alternation(ref x) => &x.span,
2201 Ast::Concat(ref x) => &x.span,
2202 };
2203 self.increment_depth(span)
2204 }
2205
2206 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2207 match *ast {
2208 Ast::Empty(_)
2209 | Ast::Flags(_)
2210 | Ast::Literal(_)
2211 | Ast::Dot(_)
2212 | Ast::Assertion(_)
2213 | Ast::Class(ast::Class::Unicode(_))
2214 | Ast::Class(ast::Class::Perl(_)) => {
2215 // These are all base cases, so we don't decrement depth.
2216 Ok(())
2217 }
2218 Ast::Class(ast::Class::Bracketed(_))
2219 | Ast::Repetition(_)
2220 | Ast::Group(_)
2221 | Ast::Alternation(_)
2222 | Ast::Concat(_) => {
2223 self.decrement_depth();
2224 Ok(())
2225 }
2226 }
2227 }
2228
2229 fn visit_class_set_item_pre(
2230 &mut self,
2231 ast: &ast::ClassSetItem,
2232 ) -> Result<()> {
2233 let span = match *ast {
2234 ast::ClassSetItem::Empty(_)
2235 | ast::ClassSetItem::Literal(_)
2236 | ast::ClassSetItem::Range(_)
2237 | ast::ClassSetItem::Ascii(_)
2238 | ast::ClassSetItem::Unicode(_)
2239 | ast::ClassSetItem::Perl(_) => {
2240 // These are all base cases, so we don't increment depth.
2241 return Ok(());
2242 }
2243 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2244 ast::ClassSetItem::Union(ref x) => &x.span,
2245 };
2246 self.increment_depth(span)
2247 }
2248
2249 fn visit_class_set_item_post(
2250 &mut self,
2251 ast: &ast::ClassSetItem,
2252 ) -> Result<()> {
2253 match *ast {
2254 ast::ClassSetItem::Empty(_)
2255 | ast::ClassSetItem::Literal(_)
2256 | ast::ClassSetItem::Range(_)
2257 | ast::ClassSetItem::Ascii(_)
2258 | ast::ClassSetItem::Unicode(_)
2259 | ast::ClassSetItem::Perl(_) => {
2260 // These are all base cases, so we don't decrement depth.
2261 Ok(())
2262 }
2263 ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2264 self.decrement_depth();
2265 Ok(())
2266 }
2267 }
2268 }
2269
2270 fn visit_class_set_binary_op_pre(
2271 &mut self,
2272 ast: &ast::ClassSetBinaryOp,
2273 ) -> Result<()> {
2274 self.increment_depth(&ast.span)
2275 }
2276
2277 fn visit_class_set_binary_op_post(
2278 &mut self,
2279 _ast: &ast::ClassSetBinaryOp,
2280 ) -> Result<()> {
2281 self.decrement_depth();
2282 Ok(())
2283 }
2284}
2285
2286/// When the result is an error, transforms the ast::ErrorKind from the source
2287/// Result into another one. This function is used to return clearer error
2288/// messages when possible.
2289fn specialize_err<T>(
2290 result: Result<T>,
2291 from: ast::ErrorKind,
2292 to: ast::ErrorKind,
2293) -> Result<T> {
2294 if let Err(e: Error) = result {
2295 if e.kind == from {
2296 Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2297 } else {
2298 Err(e)
2299 }
2300 } else {
2301 result
2302 }
2303}
2304
2305#[cfg(test)]
2306mod tests {
2307 use core::ops::Range;
2308
2309 use alloc::format;
2310
2311 use crate::ast::{self, Ast, Position, Span};
2312
2313 use super::*;
2314
2315 // Our own assert_eq, which has slightly better formatting (but honestly
2316 // still kind of crappy).
2317 macro_rules! assert_eq {
2318 ($left:expr, $right:expr) => {{
2319 match (&$left, &$right) {
2320 (left_val, right_val) => {
2321 if !(*left_val == *right_val) {
2322 panic!(
2323 "assertion failed: `(left == right)`\n\n\
2324 left: `{:?}`\nright: `{:?}`\n\n",
2325 left_val, right_val
2326 )
2327 }
2328 }
2329 }
2330 }};
2331 }
2332
    // We create these errors to compare with real ast::Errors in the tests.
    // We define equality between TestError and ast::Error to disregard the
    // pattern string in ast::Error, which is annoying to provide in tests.
    // Only the span and the kind take part in the comparison.
    #[derive(Clone, Debug)]
    struct TestError {
        // Where in the pattern the error is expected to be reported.
        span: Span,
        // The expected kind of error.
        kind: ast::ErrorKind,
    }
2341
2342 impl PartialEq<ast::Error> for TestError {
2343 fn eq(&self, other: &ast::Error) -> bool {
2344 self.span == other.span && self.kind == other.kind
2345 }
2346 }
2347
2348 impl PartialEq<TestError> for ast::Error {
2349 fn eq(&self, other: &TestError) -> bool {
2350 self.span == other.span && self.kind == other.kind
2351 }
2352 }
2353
2354 fn s(str: &str) -> String {
2355 str.to_string()
2356 }
2357
2358 fn parser(pattern: &str) -> ParserI<'_, Parser> {
2359 ParserI::new(Parser::new(), pattern)
2360 }
2361
2362 fn parser_octal(pattern: &str) -> ParserI<'_, Parser> {
2363 let parser = ParserBuilder::new().octal(true).build();
2364 ParserI::new(parser, pattern)
2365 }
2366
2367 fn parser_nest_limit(
2368 pattern: &str,
2369 nest_limit: u32,
2370 ) -> ParserI<'_, Parser> {
2371 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2372 ParserI::new(p, pattern)
2373 }
2374
2375 fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> {
2376 let p = ParserBuilder::new().ignore_whitespace(true).build();
2377 ParserI::new(p, pattern)
2378 }
2379
2380 /// Short alias for creating a new span.
2381 fn nspan(start: Position, end: Position) -> Span {
2382 Span::new(start, end)
2383 }
2384
2385 /// Short alias for creating a new position.
2386 fn npos(offset: usize, line: usize, column: usize) -> Position {
2387 Position::new(offset, line, column)
2388 }
2389
2390 /// Create a new span from the given offset range. This assumes a single
2391 /// line and sets the columns based on the offsets. i.e., This only works
2392 /// out of the box for ASCII, which is fine for most tests.
2393 fn span(range: Range<usize>) -> Span {
2394 let start = Position::new(range.start, 1, range.start + 1);
2395 let end = Position::new(range.end, 1, range.end + 1);
2396 Span::new(start, end)
2397 }
2398
2399 /// Create a new span for the corresponding byte range in the given string.
2400 fn span_range(subject: &str, range: Range<usize>) -> Span {
2401 let start = Position {
2402 offset: range.start,
2403 line: 1 + subject[..range.start].matches('\n').count(),
2404 column: 1 + subject[..range.start]
2405 .chars()
2406 .rev()
2407 .position(|c| c == '\n')
2408 .unwrap_or(subject[..range.start].chars().count()),
2409 };
2410 let end = Position {
2411 offset: range.end,
2412 line: 1 + subject[..range.end].matches('\n').count(),
2413 column: 1 + subject[..range.end]
2414 .chars()
2415 .rev()
2416 .position(|c| c == '\n')
2417 .unwrap_or(subject[..range.end].chars().count()),
2418 };
2419 Span::new(start, end)
2420 }
2421
2422 /// Create a verbatim literal starting at the given position.
2423 fn lit(c: char, start: usize) -> Ast {
2424 lit_with(c, span(start..start + c.len_utf8()))
2425 }
2426
2427 /// Create a meta literal starting at the given position.
2428 fn meta_lit(c: char, span: Span) -> Ast {
2429 Ast::Literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
2430 }
2431
2432 /// Create a verbatim literal with the given span.
2433 fn lit_with(c: char, span: Span) -> Ast {
2434 Ast::Literal(ast::Literal {
2435 span,
2436 kind: ast::LiteralKind::Verbatim,
2437 c,
2438 })
2439 }
2440
2441 /// Create a concatenation with the given range.
2442 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2443 concat_with(span(range), asts)
2444 }
2445
2446 /// Create a concatenation with the given span.
2447 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2448 Ast::Concat(ast::Concat { span, asts })
2449 }
2450
2451 /// Create an alternation with the given span.
2452 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2453 Ast::Alternation(ast::Alternation { span: span(range), asts })
2454 }
2455
2456 /// Create a capturing group with the given span.
2457 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2458 Ast::Group(ast::Group {
2459 span: span(range),
2460 kind: ast::GroupKind::CaptureIndex(index),
2461 ast: Box::new(ast),
2462 })
2463 }
2464
2465 /// Create an ast::SetFlags.
2466 ///
2467 /// The given pattern should be the full pattern string. The range given
2468 /// should correspond to the byte offsets where the flag set occurs.
2469 ///
2470 /// If negated is true, then the set is interpreted as beginning with a
2471 /// negation.
2472 fn flag_set(
2473 pat: &str,
2474 range: Range<usize>,
2475 flag: ast::Flag,
2476 negated: bool,
2477 ) -> Ast {
2478 let mut items = vec![ast::FlagsItem {
2479 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2480 kind: ast::FlagsItemKind::Flag(flag),
2481 }];
2482 if negated {
2483 items.insert(
2484 0,
2485 ast::FlagsItem {
2486 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2487 kind: ast::FlagsItemKind::Negation,
2488 },
2489 );
2490 }
2491 Ast::Flags(ast::SetFlags {
2492 span: span_range(pat, range.clone()),
2493 flags: ast::Flags {
2494 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2495 items,
2496 },
2497 })
2498 }
2499
2500 #[test]
2501 fn parse_nest_limit() {
2502 // A nest limit of 0 still allows some types of regexes.
2503 assert_eq!(
2504 parser_nest_limit("", 0).parse(),
2505 Ok(Ast::Empty(span(0..0)))
2506 );
2507 assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
2508
2509 // Test repetition operations, which require one level of nesting.
2510 assert_eq!(
2511 parser_nest_limit("a+", 0).parse().unwrap_err(),
2512 TestError {
2513 span: span(0..2),
2514 kind: ast::ErrorKind::NestLimitExceeded(0),
2515 }
2516 );
2517 assert_eq!(
2518 parser_nest_limit("a+", 1).parse(),
2519 Ok(Ast::Repetition(ast::Repetition {
2520 span: span(0..2),
2521 op: ast::RepetitionOp {
2522 span: span(1..2),
2523 kind: ast::RepetitionKind::OneOrMore,
2524 },
2525 greedy: true,
2526 ast: Box::new(lit('a', 0)),
2527 }))
2528 );
2529 assert_eq!(
2530 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2531 TestError {
2532 span: span(0..3),
2533 kind: ast::ErrorKind::NestLimitExceeded(1),
2534 }
2535 );
2536 assert_eq!(
2537 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2538 TestError {
2539 span: span(0..2),
2540 kind: ast::ErrorKind::NestLimitExceeded(1),
2541 }
2542 );
2543 assert_eq!(
2544 parser_nest_limit("a+*", 2).parse(),
2545 Ok(Ast::Repetition(ast::Repetition {
2546 span: span(0..3),
2547 op: ast::RepetitionOp {
2548 span: span(2..3),
2549 kind: ast::RepetitionKind::ZeroOrMore,
2550 },
2551 greedy: true,
2552 ast: Box::new(Ast::Repetition(ast::Repetition {
2553 span: span(0..2),
2554 op: ast::RepetitionOp {
2555 span: span(1..2),
2556 kind: ast::RepetitionKind::OneOrMore,
2557 },
2558 greedy: true,
2559 ast: Box::new(lit('a', 0)),
2560 })),
2561 }))
2562 );
2563
2564 // Test concatenations. A concatenation requires one level of nesting.
2565 assert_eq!(
2566 parser_nest_limit("ab", 0).parse().unwrap_err(),
2567 TestError {
2568 span: span(0..2),
2569 kind: ast::ErrorKind::NestLimitExceeded(0),
2570 }
2571 );
2572 assert_eq!(
2573 parser_nest_limit("ab", 1).parse(),
2574 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
2575 );
2576 assert_eq!(
2577 parser_nest_limit("abc", 1).parse(),
2578 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
2579 );
2580
2581 // Test alternations. An alternation requires one level of nesting.
2582 assert_eq!(
2583 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2584 TestError {
2585 span: span(0..3),
2586 kind: ast::ErrorKind::NestLimitExceeded(0),
2587 }
2588 );
2589 assert_eq!(
2590 parser_nest_limit("a|b", 1).parse(),
2591 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
2592 );
2593 assert_eq!(
2594 parser_nest_limit("a|b|c", 1).parse(),
2595 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
2596 );
2597
2598 // Test character classes. Classes form their own mini-recursive
2599 // syntax!
2600 assert_eq!(
2601 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2602 TestError {
2603 span: span(0..3),
2604 kind: ast::ErrorKind::NestLimitExceeded(0),
2605 }
2606 );
2607 assert_eq!(
2608 parser_nest_limit("[a]", 1).parse(),
2609 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
2610 span: span(0..3),
2611 negated: false,
2612 kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
2613 ast::Literal {
2614 span: span(1..2),
2615 kind: ast::LiteralKind::Verbatim,
2616 c: 'a',
2617 }
2618 )),
2619 })))
2620 );
2621 assert_eq!(
2622 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2623 TestError {
2624 span: span(1..3),
2625 kind: ast::ErrorKind::NestLimitExceeded(1),
2626 }
2627 );
2628 assert_eq!(
2629 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2630 TestError {
2631 span: span(3..7),
2632 kind: ast::ErrorKind::NestLimitExceeded(2),
2633 }
2634 );
2635 assert_eq!(
2636 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2637 TestError {
2638 span: span(4..6),
2639 kind: ast::ErrorKind::NestLimitExceeded(3),
2640 }
2641 );
2642 assert_eq!(
2643 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2644 TestError {
2645 span: span(1..5),
2646 kind: ast::ErrorKind::NestLimitExceeded(1),
2647 }
2648 );
2649 assert_eq!(
2650 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2651 TestError {
2652 span: span(4..6),
2653 kind: ast::ErrorKind::NestLimitExceeded(2),
2654 }
2655 );
2656 }
2657
2658 #[test]
2659 fn parse_comments() {
2660 let pat = "(?x)
2661# This is comment 1.
2662foo # This is comment 2.
2663 # This is comment 3.
2664bar
2665# This is comment 4.";
2666 let astc = parser(pat).parse_with_comments().unwrap();
2667 assert_eq!(
2668 astc.ast,
2669 concat_with(
2670 span_range(pat, 0..pat.len()),
2671 vec![
2672 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2673 lit_with('f', span_range(pat, 26..27)),
2674 lit_with('o', span_range(pat, 27..28)),
2675 lit_with('o', span_range(pat, 28..29)),
2676 lit_with('b', span_range(pat, 74..75)),
2677 lit_with('a', span_range(pat, 75..76)),
2678 lit_with('r', span_range(pat, 76..77)),
2679 ]
2680 )
2681 );
2682 assert_eq!(
2683 astc.comments,
2684 vec![
2685 ast::Comment {
2686 span: span_range(pat, 5..26),
2687 comment: s(" This is comment 1."),
2688 },
2689 ast::Comment {
2690 span: span_range(pat, 30..51),
2691 comment: s(" This is comment 2."),
2692 },
2693 ast::Comment {
2694 span: span_range(pat, 53..74),
2695 comment: s(" This is comment 3."),
2696 },
2697 ast::Comment {
2698 span: span_range(pat, 78..98),
2699 comment: s(" This is comment 4."),
2700 },
2701 ]
2702 );
2703 }
2704
2705 #[test]
2706 fn parse_holistic() {
2707 assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
2708 assert_eq!(
2709 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2710 Ok(concat(
2711 0..36,
2712 vec![
2713 meta_lit('\\', span(0..2)),
2714 meta_lit('.', span(2..4)),
2715 meta_lit('+', span(4..6)),
2716 meta_lit('*', span(6..8)),
2717 meta_lit('?', span(8..10)),
2718 meta_lit('(', span(10..12)),
2719 meta_lit(')', span(12..14)),
2720 meta_lit('|', span(14..16)),
2721 meta_lit('[', span(16..18)),
2722 meta_lit(']', span(18..20)),
2723 meta_lit('{', span(20..22)),
2724 meta_lit('}', span(22..24)),
2725 meta_lit('^', span(24..26)),
2726 meta_lit('$', span(26..28)),
2727 meta_lit('#', span(28..30)),
2728 meta_lit('&', span(30..32)),
2729 meta_lit('-', span(32..34)),
2730 meta_lit('~', span(34..36)),
2731 ]
2732 ))
2733 );
2734 }
2735
2736 #[test]
2737 fn parse_ignore_whitespace() {
2738 // Test that basic whitespace insensitivity works.
2739 let pat = "(?x)a b";
2740 assert_eq!(
2741 parser(pat).parse(),
2742 Ok(concat_with(
2743 nspan(npos(0, 1, 1), npos(7, 1, 8)),
2744 vec![
2745 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2746 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2747 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2748 ]
2749 ))
2750 );
2751
2752 // Test that we can toggle whitespace insensitivity.
2753 let pat = "(?x)a b(?-x)a b";
2754 assert_eq!(
2755 parser(pat).parse(),
2756 Ok(concat_with(
2757 nspan(npos(0, 1, 1), npos(15, 1, 16)),
2758 vec![
2759 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2760 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2761 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2762 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2763 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2764 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2765 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2766 ]
2767 ))
2768 );
2769
2770 // Test that nesting whitespace insensitive flags works.
2771 let pat = "a (?x:a )a ";
2772 assert_eq!(
2773 parser(pat).parse(),
2774 Ok(concat_with(
2775 span_range(pat, 0..11),
2776 vec![
2777 lit_with('a', span_range(pat, 0..1)),
2778 lit_with(' ', span_range(pat, 1..2)),
2779 Ast::Group(ast::Group {
2780 span: span_range(pat, 2..9),
2781 kind: ast::GroupKind::NonCapturing(ast::Flags {
2782 span: span_range(pat, 4..5),
2783 items: vec![ast::FlagsItem {
2784 span: span_range(pat, 4..5),
2785 kind: ast::FlagsItemKind::Flag(
2786 ast::Flag::IgnoreWhitespace
2787 ),
2788 },],
2789 }),
2790 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2791 }),
2792 lit_with('a', span_range(pat, 9..10)),
2793 lit_with(' ', span_range(pat, 10..11)),
2794 ]
2795 ))
2796 );
2797
2798 // Test that whitespace after an opening paren is insignificant.
2799 let pat = "(?x)( ?P<foo> a )";
2800 assert_eq!(
2801 parser(pat).parse(),
2802 Ok(concat_with(
2803 span_range(pat, 0..pat.len()),
2804 vec![
2805 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2806 Ast::Group(ast::Group {
2807 span: span_range(pat, 4..pat.len()),
2808 kind: ast::GroupKind::CaptureName {
2809 starts_with_p: true,
2810 name: ast::CaptureName {
2811 span: span_range(pat, 9..12),
2812 name: s("foo"),
2813 index: 1,
2814 }
2815 },
2816 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2817 }),
2818 ]
2819 ))
2820 );
2821 let pat = "(?x)( a )";
2822 assert_eq!(
2823 parser(pat).parse(),
2824 Ok(concat_with(
2825 span_range(pat, 0..pat.len()),
2826 vec![
2827 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2828 Ast::Group(ast::Group {
2829 span: span_range(pat, 4..pat.len()),
2830 kind: ast::GroupKind::CaptureIndex(1),
2831 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2832 }),
2833 ]
2834 ))
2835 );
2836 let pat = "(?x)( ?: a )";
2837 assert_eq!(
2838 parser(pat).parse(),
2839 Ok(concat_with(
2840 span_range(pat, 0..pat.len()),
2841 vec![
2842 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2843 Ast::Group(ast::Group {
2844 span: span_range(pat, 4..pat.len()),
2845 kind: ast::GroupKind::NonCapturing(ast::Flags {
2846 span: span_range(pat, 8..8),
2847 items: vec![],
2848 }),
2849 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2850 }),
2851 ]
2852 ))
2853 );
2854 let pat = r"(?x)\x { 53 }";
2855 assert_eq!(
2856 parser(pat).parse(),
2857 Ok(concat_with(
2858 span_range(pat, 0..pat.len()),
2859 vec![
2860 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2861 Ast::Literal(ast::Literal {
2862 span: span(4..13),
2863 kind: ast::LiteralKind::HexBrace(
2864 ast::HexLiteralKind::X
2865 ),
2866 c: 'S',
2867 }),
2868 ]
2869 ))
2870 );
2871
2872 // Test that whitespace after an escape is OK.
2873 let pat = r"(?x)\ ";
2874 assert_eq!(
2875 parser(pat).parse(),
2876 Ok(concat_with(
2877 span_range(pat, 0..pat.len()),
2878 vec![
2879 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2880 Ast::Literal(ast::Literal {
2881 span: span_range(pat, 4..6),
2882 kind: ast::LiteralKind::Superfluous,
2883 c: ' ',
2884 }),
2885 ]
2886 ))
2887 );
2888 }
2889
2890 #[test]
2891 fn parse_newlines() {
2892 let pat = ".\n.";
2893 assert_eq!(
2894 parser(pat).parse(),
2895 Ok(concat_with(
2896 span_range(pat, 0..3),
2897 vec![
2898 Ast::Dot(span_range(pat, 0..1)),
2899 lit_with('\n', span_range(pat, 1..2)),
2900 Ast::Dot(span_range(pat, 2..3)),
2901 ]
2902 ))
2903 );
2904
2905 let pat = "foobar\nbaz\nquux\n";
2906 assert_eq!(
2907 parser(pat).parse(),
2908 Ok(concat_with(
2909 span_range(pat, 0..pat.len()),
2910 vec![
2911 lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
2912 lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
2913 lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
2914 lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
2915 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2916 lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
2917 lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
2918 lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
2919 lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
2920 lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
2921 lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
2922 lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
2923 lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
2924 lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
2925 lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
2926 lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
2927 ]
2928 ))
2929 );
2930 }
2931
2932 #[test]
2933 fn parse_uncounted_repetition() {
2934 assert_eq!(
2935 parser(r"a*").parse(),
2936 Ok(Ast::Repetition(ast::Repetition {
2937 span: span(0..2),
2938 op: ast::RepetitionOp {
2939 span: span(1..2),
2940 kind: ast::RepetitionKind::ZeroOrMore,
2941 },
2942 greedy: true,
2943 ast: Box::new(lit('a', 0)),
2944 }))
2945 );
2946 assert_eq!(
2947 parser(r"a+").parse(),
2948 Ok(Ast::Repetition(ast::Repetition {
2949 span: span(0..2),
2950 op: ast::RepetitionOp {
2951 span: span(1..2),
2952 kind: ast::RepetitionKind::OneOrMore,
2953 },
2954 greedy: true,
2955 ast: Box::new(lit('a', 0)),
2956 }))
2957 );
2958
2959 assert_eq!(
2960 parser(r"a?").parse(),
2961 Ok(Ast::Repetition(ast::Repetition {
2962 span: span(0..2),
2963 op: ast::RepetitionOp {
2964 span: span(1..2),
2965 kind: ast::RepetitionKind::ZeroOrOne,
2966 },
2967 greedy: true,
2968 ast: Box::new(lit('a', 0)),
2969 }))
2970 );
2971 assert_eq!(
2972 parser(r"a??").parse(),
2973 Ok(Ast::Repetition(ast::Repetition {
2974 span: span(0..3),
2975 op: ast::RepetitionOp {
2976 span: span(1..3),
2977 kind: ast::RepetitionKind::ZeroOrOne,
2978 },
2979 greedy: false,
2980 ast: Box::new(lit('a', 0)),
2981 }))
2982 );
2983 assert_eq!(
2984 parser(r"a?").parse(),
2985 Ok(Ast::Repetition(ast::Repetition {
2986 span: span(0..2),
2987 op: ast::RepetitionOp {
2988 span: span(1..2),
2989 kind: ast::RepetitionKind::ZeroOrOne,
2990 },
2991 greedy: true,
2992 ast: Box::new(lit('a', 0)),
2993 }))
2994 );
2995 assert_eq!(
2996 parser(r"a?b").parse(),
2997 Ok(concat(
2998 0..3,
2999 vec![
3000 Ast::Repetition(ast::Repetition {
3001 span: span(0..2),
3002 op: ast::RepetitionOp {
3003 span: span(1..2),
3004 kind: ast::RepetitionKind::ZeroOrOne,
3005 },
3006 greedy: true,
3007 ast: Box::new(lit('a', 0)),
3008 }),
3009 lit('b', 2),
3010 ]
3011 ))
3012 );
3013 assert_eq!(
3014 parser(r"a??b").parse(),
3015 Ok(concat(
3016 0..4,
3017 vec![
3018 Ast::Repetition(ast::Repetition {
3019 span: span(0..3),
3020 op: ast::RepetitionOp {
3021 span: span(1..3),
3022 kind: ast::RepetitionKind::ZeroOrOne,
3023 },
3024 greedy: false,
3025 ast: Box::new(lit('a', 0)),
3026 }),
3027 lit('b', 3),
3028 ]
3029 ))
3030 );
3031 assert_eq!(
3032 parser(r"ab?").parse(),
3033 Ok(concat(
3034 0..3,
3035 vec![
3036 lit('a', 0),
3037 Ast::Repetition(ast::Repetition {
3038 span: span(1..3),
3039 op: ast::RepetitionOp {
3040 span: span(2..3),
3041 kind: ast::RepetitionKind::ZeroOrOne,
3042 },
3043 greedy: true,
3044 ast: Box::new(lit('b', 1)),
3045 }),
3046 ]
3047 ))
3048 );
3049 assert_eq!(
3050 parser(r"(ab)?").parse(),
3051 Ok(Ast::Repetition(ast::Repetition {
3052 span: span(0..5),
3053 op: ast::RepetitionOp {
3054 span: span(4..5),
3055 kind: ast::RepetitionKind::ZeroOrOne,
3056 },
3057 greedy: true,
3058 ast: Box::new(group(
3059 0..4,
3060 1,
3061 concat(1..3, vec![lit('a', 1), lit('b', 2),])
3062 )),
3063 }))
3064 );
3065 assert_eq!(
3066 parser(r"|a?").parse(),
3067 Ok(alt(
3068 0..3,
3069 vec![
3070 Ast::Empty(span(0..0)),
3071 Ast::Repetition(ast::Repetition {
3072 span: span(1..3),
3073 op: ast::RepetitionOp {
3074 span: span(2..3),
3075 kind: ast::RepetitionKind::ZeroOrOne,
3076 },
3077 greedy: true,
3078 ast: Box::new(lit('a', 1)),
3079 }),
3080 ]
3081 ))
3082 );
3083
3084 assert_eq!(
3085 parser(r"*").parse().unwrap_err(),
3086 TestError {
3087 span: span(0..0),
3088 kind: ast::ErrorKind::RepetitionMissing,
3089 }
3090 );
3091 assert_eq!(
3092 parser(r"(?i)*").parse().unwrap_err(),
3093 TestError {
3094 span: span(4..4),
3095 kind: ast::ErrorKind::RepetitionMissing,
3096 }
3097 );
3098 assert_eq!(
3099 parser(r"(*)").parse().unwrap_err(),
3100 TestError {
3101 span: span(1..1),
3102 kind: ast::ErrorKind::RepetitionMissing,
3103 }
3104 );
3105 assert_eq!(
3106 parser(r"(?:?)").parse().unwrap_err(),
3107 TestError {
3108 span: span(3..3),
3109 kind: ast::ErrorKind::RepetitionMissing,
3110 }
3111 );
3112 assert_eq!(
3113 parser(r"+").parse().unwrap_err(),
3114 TestError {
3115 span: span(0..0),
3116 kind: ast::ErrorKind::RepetitionMissing,
3117 }
3118 );
3119 assert_eq!(
3120 parser(r"?").parse().unwrap_err(),
3121 TestError {
3122 span: span(0..0),
3123 kind: ast::ErrorKind::RepetitionMissing,
3124 }
3125 );
3126 assert_eq!(
3127 parser(r"(?)").parse().unwrap_err(),
3128 TestError {
3129 span: span(1..1),
3130 kind: ast::ErrorKind::RepetitionMissing,
3131 }
3132 );
3133 assert_eq!(
3134 parser(r"|*").parse().unwrap_err(),
3135 TestError {
3136 span: span(1..1),
3137 kind: ast::ErrorKind::RepetitionMissing,
3138 }
3139 );
3140 assert_eq!(
3141 parser(r"|+").parse().unwrap_err(),
3142 TestError {
3143 span: span(1..1),
3144 kind: ast::ErrorKind::RepetitionMissing,
3145 }
3146 );
3147 assert_eq!(
3148 parser(r"|?").parse().unwrap_err(),
3149 TestError {
3150 span: span(1..1),
3151 kind: ast::ErrorKind::RepetitionMissing,
3152 }
3153 );
3154 }
3155
#[test]
fn parse_counted_repetition() {
    // Expected AST for a counted repetition. `whole` covers the repeated
    // expression together with its operator; `op` covers only the operator
    // (the braces plus a trailing `?`, when one is present).
    let counted = |whole: Span,
                   op: Span,
                   range: ast::RepetitionRange,
                   greedy: bool,
                   inner: Ast| {
        Ast::Repetition(ast::Repetition {
            span: whole,
            op: ast::RepetitionOp {
                span: op,
                kind: ast::RepetitionKind::Range(range),
            },
            greedy,
            ast: Box::new(inner),
        })
    };

    // Exact, open-ended and bounded counts applied to a lone literal.
    let exact = counted(
        span(0..4),
        span(1..4),
        ast::RepetitionRange::Exactly(5),
        true,
        lit('a', 0),
    );
    assert_eq!(parser(r"a{5}").parse(), Ok(exact));

    let at_least = counted(
        span(0..5),
        span(1..5),
        ast::RepetitionRange::AtLeast(5),
        true,
        lit('a', 0),
    );
    assert_eq!(parser(r"a{5,}").parse(), Ok(at_least));

    let bounded = counted(
        span(0..6),
        span(1..6),
        ast::RepetitionRange::Bounded(5, 9),
        true,
        lit('a', 0),
    );
    assert_eq!(parser(r"a{5,9}").parse(), Ok(bounded));

    // A trailing `?` flips greediness and is part of the operator span.
    let lazy_exact = counted(
        span(0..5),
        span(1..5),
        ast::RepetitionRange::Exactly(5),
        false,
        lit('a', 0),
    );
    assert_eq!(parser(r"a{5}?").parse(), Ok(lazy_exact));

    // The repetition binds only to the literal immediately before it.
    let repeated_b = counted(
        span(1..5),
        span(2..5),
        ast::RepetitionRange::Exactly(5),
        true,
        lit('b', 1),
    );
    assert_eq!(
        parser(r"ab{5}").parse(),
        Ok(concat(0..5, vec![lit('a', 0), repeated_b]))
    );

    let repeated_b = counted(
        span(1..5),
        span(2..5),
        ast::RepetitionRange::Exactly(5),
        true,
        lit('b', 1),
    );
    assert_eq!(
        parser(r"ab{5}c").parse(),
        Ok(concat(0..6, vec![lit('a', 0), repeated_b, lit('c', 5)]))
    );

    // Spaces inside the braces are accepted by the plain parser.
    let spaced_exact = counted(
        span(0..6),
        span(1..6),
        ast::RepetitionRange::Exactly(5),
        true,
        lit('a', 0),
    );
    assert_eq!(parser(r"a{ 5 }").parse(), Ok(spaced_exact));

    let spaced_bounded = counted(
        span(0..10),
        span(1..10),
        ast::RepetitionRange::Bounded(5, 9),
        true,
        lit('a', 0),
    );
    assert_eq!(parser(r"a{ 5 , 9 }").parse(), Ok(spaced_bounded));

    // With the whitespace-ignoring parser, a space may sit between the
    // closing brace and the `?`.
    let spaced_lazy = counted(
        span(0..8),
        span(1..8),
        ast::RepetitionRange::Bounded(5, 9),
        false,
        lit('a', 0),
    );
    assert_eq!(
        parser_ignore_whitespace(r"a{5,9} ?").parse(),
        Ok(spaced_lazy)
    );

    // Malformed counted repetitions: pattern, error span, error kind.
    let failures = [
        (r"(?i){0}", span(4..4), ast::ErrorKind::RepetitionMissing),
        (r"(?m){1,1}", span(4..4), ast::ErrorKind::RepetitionMissing),
        (
            r"a{]}",
            span(2..2),
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        ),
        (
            r"a{1,]}",
            span(4..4),
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        ),
        (r"a{", span(1..2), ast::ErrorKind::RepetitionCountUnclosed),
        (
            r"a{}",
            span(2..2),
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        ),
        (
            r"a{a",
            span(2..2),
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        ),
        (r"a{9999999999}", span(2..12), ast::ErrorKind::DecimalInvalid),
        (r"a{9", span(1..3), ast::ErrorKind::RepetitionCountUnclosed),
        (
            r"a{9,a",
            span(4..4),
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        ),
        (
            r"a{9,9999999999}",
            span(4..14),
            ast::ErrorKind::DecimalInvalid,
        ),
        (r"a{9,", span(1..4), ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,11", span(1..6), ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{2,1}", span(1..6), ast::ErrorKind::RepetitionCountInvalid),
        (r"{5}", span(0..0), ast::ErrorKind::RepetitionMissing),
        (r"|{5}", span(1..1), ast::ErrorKind::RepetitionMissing),
    ];
    for (pattern, at, kind) in failures {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: at, kind }
        );
    }
}
3412
#[test]
fn parse_alternate() {
    // Builders for the shapes that recur throughout this test.
    let empty = |at: usize| Ast::Empty(span(at..at));
    let pair = |first: char, second: char, start: usize| {
        concat(
            start..start + 2,
            vec![lit(first, start), lit(second, start + 1)],
        )
    };
    let alternation = |whole: Span, asts: Vec<Ast>| {
        Ast::Alternation(ast::Alternation { span: whole, asts })
    };

    // Two single-literal branches, bare and inside a capture group.
    assert_eq!(
        parser(r"a|b").parse(),
        Ok(alternation(span(0..3), vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parser(r"(a|b)").parse(),
        Ok(group(
            0..5,
            1,
            alternation(span(1..4), vec![lit('a', 1), lit('b', 3)])
        ))
    );

    // Three branches, first of literals and then of concatenations.
    assert_eq!(
        parser(r"a|b|c").parse(),
        Ok(alternation(
            span(0..5),
            vec![lit('a', 0), lit('b', 2), lit('c', 4)]
        ))
    );
    assert_eq!(
        parser(r"ax|by|cz").parse(),
        Ok(alternation(
            span(0..8),
            vec![pair('a', 'x', 0), pair('b', 'y', 3), pair('c', 'z', 6)]
        ))
    );
    assert_eq!(
        parser(r"(ax|by|cz)").parse(),
        Ok(group(
            0..10,
            1,
            alternation(
                span(1..9),
                vec![
                    pair('a', 'x', 1),
                    pair('b', 'y', 4),
                    pair('c', 'z', 7),
                ]
            )
        ))
    );

    // Nested groups: assemble the expectation from the innermost group
    // outwards.
    let innermost = group(8..12, 3, pair('c', 'z', 9));
    let middle =
        group(4..13, 2, alt(5..12, vec![pair('b', 'y', 5), innermost]));
    let outermost =
        group(0..14, 1, alt(1..13, vec![pair('a', 'x', 1), middle]));
    assert_eq!(parser(r"(ax|(by|(cz)))").parse(), Ok(outermost));

    // Empty branches show up as zero-width `Empty` nodes.
    assert_eq!(
        parser(r"|").parse(),
        Ok(alt(0..1, vec![empty(0), empty(1)]))
    );
    assert_eq!(
        parser(r"||").parse(),
        Ok(alt(0..2, vec![empty(0), empty(1), empty(2)]))
    );
    assert_eq!(
        parser(r"a|").parse(),
        Ok(alt(0..2, vec![lit('a', 0), empty(2)]))
    );
    assert_eq!(
        parser(r"|a").parse(),
        Ok(alt(0..2, vec![empty(0), lit('a', 1)]))
    );

    // The same empty-branch cases wrapped in a capture group.
    assert_eq!(
        parser(r"(|)").parse(),
        Ok(group(0..3, 1, alt(1..2, vec![empty(1), empty(2)])))
    );
    assert_eq!(
        parser(r"(a|)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![lit('a', 1), empty(3)])))
    );
    assert_eq!(
        parser(r"(|a)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![empty(1), lit('a', 2)])))
    );

    // Unbalanced parentheses around an alternation are rejected.
    let unopened = parser(r"a|b)").parse().unwrap_err();
    assert_eq!(
        unopened,
        TestError { span: span(3..4), kind: ast::ErrorKind::GroupUnopened }
    );
    let unclosed = parser(r"(a|b").parse().unwrap_err();
    assert_eq!(
        unclosed,
        TestError { span: span(0..1), kind: ast::ErrorKind::GroupUnclosed }
    );
}
3572
#[test]
fn parse_unsupported_lookaround() {
    // Each look-around opener is rejected, and the reported span covers
    // just the opener: three bytes for `(?=` / `(?!`, four for `(?<=` /
    // `(?<!`.
    let openers =
        [(r"(?=a)", 3), (r"(?!a)", 3), (r"(?<=a)", 4), (r"(?<!a)", 4)];
    for (pattern, opener_len) in openers {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(0..opener_len),
                kind: ast::ErrorKind::UnsupportedLookAround,
            }
        );
    }
}
3604
#[test]
fn parse_group() {
    // Every flag item in these patterns is a single byte wide, so the item
    // builders only need the starting offset.
    let flag_at = |at: usize, flag: ast::Flag| ast::FlagsItem {
        span: span(at..at + 1),
        kind: ast::FlagsItemKind::Flag(flag),
    };
    let negation_at = |at: usize| ast::FlagsItem {
        span: span(at..at + 1),
        kind: ast::FlagsItemKind::Negation,
    };
    // `(?flags)`: `whole` is the full directive, `inner` just the flags.
    let set_flags =
        |whole: Span, inner: Span, items: Vec<ast::FlagsItem>| {
            Ast::Flags(ast::SetFlags {
                span: whole,
                flags: ast::Flags { span: inner, items },
            })
        };
    let grouped = |whole: Span, kind: ast::GroupKind, inner: Ast| {
        Ast::Group(ast::Group { span: whole, kind, ast: Box::new(inner) })
    };

    // Flag-setting directives.
    assert_eq!(
        parser("(?i)").parse(),
        Ok(set_flags(
            span(0..4),
            span(2..3),
            vec![flag_at(2, ast::Flag::CaseInsensitive)]
        ))
    );
    assert_eq!(
        parser("(?iU)").parse(),
        Ok(set_flags(
            span(0..5),
            span(2..4),
            vec![
                flag_at(2, ast::Flag::CaseInsensitive),
                flag_at(3, ast::Flag::SwapGreed),
            ]
        ))
    );
    assert_eq!(
        parser("(?i-U)").parse(),
        Ok(set_flags(
            span(0..6),
            span(2..5),
            vec![
                flag_at(2, ast::Flag::CaseInsensitive),
                negation_at(3),
                flag_at(4, ast::Flag::SwapGreed),
            ]
        ))
    );

    // Capture groups are numbered from 1 in order of appearance.
    assert_eq!(
        parser("()").parse(),
        Ok(grouped(
            span(0..2),
            ast::GroupKind::CaptureIndex(1),
            Ast::Empty(span(1..1))
        ))
    );
    assert_eq!(
        parser("(a)").parse(),
        Ok(grouped(
            span(0..3),
            ast::GroupKind::CaptureIndex(1),
            lit('a', 1)
        ))
    );
    let nested = grouped(
        span(1..3),
        ast::GroupKind::CaptureIndex(2),
        Ast::Empty(span(2..2)),
    );
    assert_eq!(
        parser("(())").parse(),
        Ok(grouped(span(0..4), ast::GroupKind::CaptureIndex(1), nested))
    );

    // Non-capturing groups, without and with flags.
    let no_flags = ast::Flags { span: span(2..2), items: vec![] };
    assert_eq!(
        parser("(?:a)").parse(),
        Ok(grouped(
            span(0..5),
            ast::GroupKind::NonCapturing(no_flags),
            lit('a', 3)
        ))
    );

    let one_flag = ast::Flags {
        span: span(2..3),
        items: vec![flag_at(2, ast::Flag::CaseInsensitive)],
    };
    assert_eq!(
        parser("(?i:a)").parse(),
        Ok(grouped(
            span(0..6),
            ast::GroupKind::NonCapturing(one_flag),
            lit('a', 4)
        ))
    );

    let mixed_flags = ast::Flags {
        span: span(2..5),
        items: vec![
            flag_at(2, ast::Flag::CaseInsensitive),
            negation_at(3),
            flag_at(4, ast::Flag::SwapGreed),
        ],
    };
    assert_eq!(
        parser("(?i-U:a)").parse(),
        Ok(grouped(
            span(0..8),
            ast::GroupKind::NonCapturing(mixed_flags),
            lit('a', 6)
        ))
    );

    // Truncated or unbalanced groups: pattern, error span, error kind.
    let failures = [
        ("(", span(0..1), ast::ErrorKind::GroupUnclosed),
        ("(?", span(0..1), ast::ErrorKind::GroupUnclosed),
        ("(?P", span(2..3), ast::ErrorKind::FlagUnrecognized),
        ("(?P<", span(4..4), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(a", span(0..1), ast::ErrorKind::GroupUnclosed),
        ("(()", span(0..1), ast::ErrorKind::GroupUnclosed),
        (")", span(0..1), ast::ErrorKind::GroupUnopened),
        ("a)", span(1..2), ast::ErrorKind::GroupUnopened),
    ];
    for (pattern, at, kind) in failures {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: at, kind }
        );
    }
}
3816
#[test]
fn parse_capture_name() {
    // Expected AST for a named capture group. Every successful pattern
    // below contains a single group, so the capture index is always 1.
    let named = |whole: Span,
                 starts_with_p: bool,
                 name_span: Span,
                 name: &str,
                 inner: Ast| {
        Ast::Group(ast::Group {
            span: whole,
            kind: ast::GroupKind::CaptureName {
                starts_with_p,
                name: ast::CaptureName {
                    span: name_span,
                    name: s(name),
                    index: 1,
                },
            },
            ast: Box::new(inner),
        })
    };
    // A position on line 1. Needed for the non-ASCII patterns, where the
    // first and third `Position` fields (byte offset and column in the
    // expectations below) stop moving in lockstep.
    let on_line_one =
        |offset, column| Position::new(offset, 1, column);

    // Both the `(?<name>` and the `(?P<name>` spellings are accepted.
    assert_eq!(
        parser("(?<a>z)").parse(),
        Ok(named(span(0..7), false, span(3..4), "a", lit('z', 5)))
    );
    assert_eq!(
        parser("(?P<a>z)").parse(),
        Ok(named(span(0..8), true, span(4..5), "a", lit('z', 6)))
    );
    assert_eq!(
        parser("(?P<abc>z)").parse(),
        Ok(named(span(0..10), true, span(4..7), "abc", lit('z', 8)))
    );

    // Underscores, dots and square brackets may appear inside a name.
    assert_eq!(
        parser("(?P<a_1>z)").parse(),
        Ok(named(span(0..10), true, span(4..7), "a_1", lit('z', 8)))
    );
    assert_eq!(
        parser("(?P<a.1>z)").parse(),
        Ok(named(span(0..10), true, span(4..7), "a.1", lit('z', 8)))
    );
    assert_eq!(
        parser("(?P<a[1]>z)").parse(),
        Ok(named(span(0..11), true, span(4..8), "a[1]", lit('z', 9)))
    );

    // Names containing multi-byte characters.
    assert_eq!(
        parser("(?P<a¾>)").parse(),
        Ok(named(
            Span::new(on_line_one(0, 1), on_line_one(9, 9)),
            true,
            Span::new(on_line_one(4, 5), on_line_one(7, 7)),
            "a¾",
            Ast::Empty(Span::new(on_line_one(8, 8), on_line_one(8, 8)))
        ))
    );
    assert_eq!(
        parser("(?P<名字>)").parse(),
        Ok(named(
            Span::new(on_line_one(0, 1), on_line_one(12, 9)),
            true,
            Span::new(on_line_one(4, 5), on_line_one(10, 7)),
            "名字",
            Ast::Empty(Span::new(on_line_one(11, 8), on_line_one(11, 8)))
        ))
    );

    // Rejected names: pattern, error span, error kind.
    let failures = [
        ("(?P<", span(4..4), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<>z)", span(4..4), ast::ErrorKind::GroupNameEmpty),
        ("(?P<a", span(5..5), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<ab", span(6..6), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<0a", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<~", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<abc~", span(7..8), ast::ErrorKind::GroupNameInvalid),
        (
            "(?P<a>y)(?P<a>z)",
            span(12..13),
            ast::ErrorKind::GroupNameDuplicate { original: span(4..5) },
        ),
        ("(?P<5>)", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<5a>)", span(4..5), ast::ErrorKind::GroupNameInvalid),
        (
            "(?P<¾>)",
            Span::new(on_line_one(4, 5), on_line_one(6, 6)),
            ast::ErrorKind::GroupNameInvalid,
        ),
        (
            "(?P<¾a>)",
            Span::new(on_line_one(4, 5), on_line_one(6, 6)),
            ast::ErrorKind::GroupNameInvalid,
        ),
        (
            "(?P<☃>)",
            Span::new(on_line_one(4, 5), on_line_one(7, 6)),
            ast::ErrorKind::GroupNameInvalid,
        ),
        (
            "(?P<a☃>)",
            Span::new(on_line_one(5, 6), on_line_one(8, 7)),
            ast::ErrorKind::GroupNameInvalid,
        ),
    ];
    for (pattern, at, kind) in failures {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: at, kind }
        );
    }
}
4075
#[test]
fn parse_flags() {
    // Every flag item here is a single byte wide, so the item builders
    // only take the starting offset.
    let flag_at = |at: usize, flag: ast::Flag| ast::FlagsItem {
        span: span(at..at + 1),
        kind: ast::FlagsItemKind::Flag(flag),
    };
    let negation_at = |at: usize| ast::FlagsItem {
        span: span(at..at + 1),
        kind: ast::FlagsItemKind::Negation,
    };
    // All flag lists start at offset 0; only their length varies.
    let flags = |len: usize, items: Vec<ast::FlagsItem>| ast::Flags {
        span: span(0..len),
        items,
    };

    // A single flag, terminated by either `:` or `)`. The terminator is
    // not part of the resulting span.
    for pattern in ["i:", "i)"] {
        assert_eq!(
            parser(pattern).parse_flags(),
            Ok(flags(1, vec![flag_at(0, ast::Flag::CaseInsensitive)]))
        );
    }

    // Several flags in a row.
    assert_eq!(
        parser("isU:").parse_flags(),
        Ok(flags(
            3,
            vec![
                flag_at(0, ast::Flag::CaseInsensitive),
                flag_at(1, ast::Flag::DotMatchesNewLine),
                flag_at(2, ast::Flag::SwapGreed),
            ]
        ))
    );

    // A negation is recorded as its own item, wherever it appears.
    assert_eq!(
        parser("-isU:").parse_flags(),
        Ok(flags(
            4,
            vec![
                negation_at(0),
                flag_at(1, ast::Flag::CaseInsensitive),
                flag_at(2, ast::Flag::DotMatchesNewLine),
                flag_at(3, ast::Flag::SwapGreed),
            ]
        ))
    );
    assert_eq!(
        parser("i-sU:").parse_flags(),
        Ok(flags(
            4,
            vec![
                flag_at(0, ast::Flag::CaseInsensitive),
                negation_at(1),
                flag_at(2, ast::Flag::DotMatchesNewLine),
                flag_at(3, ast::Flag::SwapGreed),
            ]
        ))
    );
    assert_eq!(
        parser("i-sR:").parse_flags(),
        Ok(flags(
            4,
            vec![
                flag_at(0, ast::Flag::CaseInsensitive),
                negation_at(1),
                flag_at(2, ast::Flag::DotMatchesNewLine),
                flag_at(3, ast::Flag::CRLF),
            ]
        ))
    );

    // Invalid flag lists: pattern, error span, error kind.
    let failures = [
        ("isU", span(3..3), ast::ErrorKind::FlagUnexpectedEof),
        ("isUa:", span(3..4), ast::ErrorKind::FlagUnrecognized),
        (
            "isUi:",
            span(3..4),
            ast::ErrorKind::FlagDuplicate { original: span(0..1) },
        ),
        (
            "i-sU-i:",
            span(4..5),
            ast::ErrorKind::FlagRepeatedNegation { original: span(1..2) },
        ),
        ("-)", span(0..1), ast::ErrorKind::FlagDanglingNegation),
        ("i-)", span(1..2), ast::ErrorKind::FlagDanglingNegation),
        ("iU-)", span(2..3), ast::ErrorKind::FlagDanglingNegation),
    ];
    for (pattern, at, kind) in failures {
        assert_eq!(
            parser(pattern).parse_flags().unwrap_err(),
            TestError { span: at, kind }
        );
    }
}
4261
#[test]
fn parse_flag() {
    // Each recognized flag letter and the flag it parses to.
    let recognized = [
        ("i", ast::Flag::CaseInsensitive),
        ("m", ast::Flag::MultiLine),
        ("s", ast::Flag::DotMatchesNewLine),
        ("U", ast::Flag::SwapGreed),
        ("u", ast::Flag::Unicode),
        ("R", ast::Flag::CRLF),
        ("x", ast::Flag::IgnoreWhitespace),
    ];
    for (letter, flag) in recognized {
        assert_eq!(parser(letter).parse_flag(), Ok(flag));
    }

    // Anything else is rejected, with a span covering the whole character
    // (three bytes in the case of the snowman).
    let rejected =
        [("a", span(0..1)), ("☃", span_range("☃", 0..3))];
    for (letter, at) in rejected {
        assert_eq!(
            parser(letter).parse_flag().unwrap_err(),
            TestError {
                span: at,
                kind: ast::ErrorKind::FlagUnrecognized,
            }
        );
    }
}
4287
#[test]
fn parse_primitive_non_escape() {
    // Builders for the two primitive shapes that appear more than once.
    let assertion = |kind: ast::AssertionKind| {
        Primitive::Assertion(ast::Assertion { span: span(0..1), kind })
    };
    let verbatim = |c: char, at: Span| {
        Primitive::Literal(ast::Literal {
            span: at,
            kind: ast::LiteralKind::Verbatim,
            c,
        })
    };

    // Pattern and the primitive it is expected to parse to. Note that
    // `parse_primitive` yields `|` as a verbatim literal.
    let cases = [
        (r".", Primitive::Dot(span(0..1))),
        (r"^", assertion(ast::AssertionKind::StartLine)),
        (r"$", assertion(ast::AssertionKind::EndLine)),
        (r"a", verbatim('a', span(0..1))),
        (r"|", verbatim('|', span(0..1))),
        (r"☃", verbatim('☃', span_range("☃", 0..3))),
    ];
    for (pattern, expected) in cases {
        assert_eq!(parser(pattern).parse_primitive(), Ok(expected));
    }
}
4334
#[test]
fn parse_escape() {
    // Every escape that parses successfully below is two bytes long.
    let literal = |kind: ast::LiteralKind, c: char| {
        Primitive::Literal(ast::Literal { span: span(0..2), kind, c })
    };

    // An escaped meta character.
    assert_eq!(
        parser(r"\|").parse_primitive(),
        Ok(literal(ast::LiteralKind::Meta, '|'))
    );

    // Escapes that stand for a specific control character.
    let specials = [
        (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
        (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
        (r"\t", '\t', ast::SpecialLiteralKind::Tab),
        (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
        (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
        (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
    ];
    for (pat, c, kind) in specials {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(literal(ast::LiteralKind::Special(kind), c))
        );
    }

    // Escapes that parse to assertions rather than literals.
    let assertions = [
        (r"\A", ast::AssertionKind::StartText),
        (r"\z", ast::AssertionKind::EndText),
        (r"\b", ast::AssertionKind::WordBoundary),
        (r"\B", ast::AssertionKind::NotWordBoundary),
    ];
    for (pat, kind) in assertions {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(Primitive::Assertion(ast::Assertion {
                span: span(0..2),
                kind,
            }))
        );
    }

    // Escaping a character that needs no escape is tolerated in most
    // cases and is flagged as superfluous.
    for c in ['!', '@', '%', '"', '\'', '/', ' '] {
        let pat = format!(r"\{}", c);
        assert_eq!(
            parser(&pat).parse_primitive(),
            Ok(literal(ast::LiteralKind::Superfluous, c))
        );
    }

    // The exceptions are the ASCII alphanumerics [0-9A-Za-z], which stay
    // rejected to leave room for future escapes, and `<` / `>`, which are
    // held back in case they become start/end word boundary assertions.
    for pat in [r"\e", r"\y", r"\<", r"\>"] {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError {
                span: span(0..2),
                kind: ast::ErrorKind::EscapeUnrecognized,
            }
        );
    }

    // A backslash with nothing after it is an error.
    let dangling = parser(r"\").parse_escape().unwrap_err();
    assert_eq!(
        dangling,
        TestError {
            span: span(0..1),
            kind: ast::ErrorKind::EscapeUnexpectedEof,
        }
    );
}
4447
#[test]
fn parse_unsupported_backreference() {
    // With the default parser, a backslash followed by a digit is reported
    // as an unsupported backreference spanning both bytes. The lowest and
    // highest digits are checked.
    for pattern in [r"\0", r"\9"] {
        let err = parser(pattern).parse_escape().unwrap_err();
        assert_eq!(
            err,
            TestError {
                span: span(0..2),
                kind: ast::ErrorKind::UnsupportedBackreference,
            }
        );
    }
}
4465
4466 #[test]
4467 fn parse_octal() {
4468 for i in 0..511 {
4469 let pat = format!(r"\{:o}", i);
4470 assert_eq!(
4471 parser_octal(&pat).parse_escape(),
4472 Ok(Primitive::Literal(ast::Literal {
4473 span: span(0..pat.len()),
4474 kind: ast::LiteralKind::Octal,
4475 c: char::from_u32(i).unwrap(),
4476 }))
4477 );
4478 }
4479 assert_eq!(
4480 parser_octal(r"\778").parse_escape(),
4481 Ok(Primitive::Literal(ast::Literal {
4482 span: span(0..3),
4483 kind: ast::LiteralKind::Octal,
4484 c: '?',
4485 }))
4486 );
4487 assert_eq!(
4488 parser_octal(r"\7777").parse_escape(),
4489 Ok(Primitive::Literal(ast::Literal {
4490 span: span(0..4),
4491 kind: ast::LiteralKind::Octal,
4492 c: '\u{01FF}',
4493 }))
4494 );
4495 assert_eq!(
4496 parser_octal(r"\778").parse(),
4497 Ok(Ast::Concat(ast::Concat {
4498 span: span(0..4),
4499 asts: vec![
4500 Ast::Literal(ast::Literal {
4501 span: span(0..3),
4502 kind: ast::LiteralKind::Octal,
4503 c: '?',
4504 }),
4505 Ast::Literal(ast::Literal {
4506 span: span(3..4),
4507 kind: ast::LiteralKind::Verbatim,
4508 c: '8',
4509 }),
4510 ],
4511 }))
4512 );
4513 assert_eq!(
4514 parser_octal(r"\7777").parse(),
4515 Ok(Ast::Concat(ast::Concat {
4516 span: span(0..5),
4517 asts: vec![
4518 Ast::Literal(ast::Literal {
4519 span: span(0..4),
4520 kind: ast::LiteralKind::Octal,
4521 c: '\u{01FF}',
4522 }),
4523 Ast::Literal(ast::Literal {
4524 span: span(4..5),
4525 kind: ast::LiteralKind::Verbatim,
4526 c: '7',
4527 }),
4528 ],
4529 }))
4530 );
4531
4532 assert_eq!(
4533 parser_octal(r"\8").parse_escape().unwrap_err(),
4534 TestError {
4535 span: span(0..2),
4536 kind: ast::ErrorKind::EscapeUnrecognized,
4537 }
4538 );
4539 }
4540
4541 #[test]
4542 fn parse_hex_two() {
4543 for i in 0..256 {
4544 let pat = format!(r"\x{:02x}", i);
4545 assert_eq!(
4546 parser(&pat).parse_escape(),
4547 Ok(Primitive::Literal(ast::Literal {
4548 span: span(0..pat.len()),
4549 kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
4550 c: char::from_u32(i).unwrap(),
4551 }))
4552 );
4553 }
4554
4555 assert_eq!(
4556 parser(r"\xF").parse_escape().unwrap_err(),
4557 TestError {
4558 span: span(3..3),
4559 kind: ast::ErrorKind::EscapeUnexpectedEof,
4560 }
4561 );
4562 assert_eq!(
4563 parser(r"\xG").parse_escape().unwrap_err(),
4564 TestError {
4565 span: span(2..3),
4566 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4567 }
4568 );
4569 assert_eq!(
4570 parser(r"\xFG").parse_escape().unwrap_err(),
4571 TestError {
4572 span: span(3..4),
4573 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4574 }
4575 );
4576 }
4577
4578 #[test]
4579 fn parse_hex_four() {
4580 for i in 0..65536 {
4581 let c = match char::from_u32(i) {
4582 None => continue,
4583 Some(c) => c,
4584 };
4585 let pat = format!(r"\u{:04x}", i);
4586 assert_eq!(
4587 parser(&pat).parse_escape(),
4588 Ok(Primitive::Literal(ast::Literal {
4589 span: span(0..pat.len()),
4590 kind: ast::LiteralKind::HexFixed(
4591 ast::HexLiteralKind::UnicodeShort
4592 ),
4593 c,
4594 }))
4595 );
4596 }
4597
4598 assert_eq!(
4599 parser(r"\uF").parse_escape().unwrap_err(),
4600 TestError {
4601 span: span(3..3),
4602 kind: ast::ErrorKind::EscapeUnexpectedEof,
4603 }
4604 );
4605 assert_eq!(
4606 parser(r"\uG").parse_escape().unwrap_err(),
4607 TestError {
4608 span: span(2..3),
4609 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4610 }
4611 );
4612 assert_eq!(
4613 parser(r"\uFG").parse_escape().unwrap_err(),
4614 TestError {
4615 span: span(3..4),
4616 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4617 }
4618 );
4619 assert_eq!(
4620 parser(r"\uFFG").parse_escape().unwrap_err(),
4621 TestError {
4622 span: span(4..5),
4623 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4624 }
4625 );
4626 assert_eq!(
4627 parser(r"\uFFFG").parse_escape().unwrap_err(),
4628 TestError {
4629 span: span(5..6),
4630 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4631 }
4632 );
4633 assert_eq!(
4634 parser(r"\uD800").parse_escape().unwrap_err(),
4635 TestError {
4636 span: span(2..6),
4637 kind: ast::ErrorKind::EscapeHexInvalid,
4638 }
4639 );
4640 }
4641
4642 #[test]
4643 fn parse_hex_eight() {
4644 for i in 0..65536 {
4645 let c = match char::from_u32(i) {
4646 None => continue,
4647 Some(c) => c,
4648 };
4649 let pat = format!(r"\U{:08x}", i);
4650 assert_eq!(
4651 parser(&pat).parse_escape(),
4652 Ok(Primitive::Literal(ast::Literal {
4653 span: span(0..pat.len()),
4654 kind: ast::LiteralKind::HexFixed(
4655 ast::HexLiteralKind::UnicodeLong
4656 ),
4657 c,
4658 }))
4659 );
4660 }
4661
4662 assert_eq!(
4663 parser(r"\UF").parse_escape().unwrap_err(),
4664 TestError {
4665 span: span(3..3),
4666 kind: ast::ErrorKind::EscapeUnexpectedEof,
4667 }
4668 );
4669 assert_eq!(
4670 parser(r"\UG").parse_escape().unwrap_err(),
4671 TestError {
4672 span: span(2..3),
4673 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4674 }
4675 );
4676 assert_eq!(
4677 parser(r"\UFG").parse_escape().unwrap_err(),
4678 TestError {
4679 span: span(3..4),
4680 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4681 }
4682 );
4683 assert_eq!(
4684 parser(r"\UFFG").parse_escape().unwrap_err(),
4685 TestError {
4686 span: span(4..5),
4687 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4688 }
4689 );
4690 assert_eq!(
4691 parser(r"\UFFFG").parse_escape().unwrap_err(),
4692 TestError {
4693 span: span(5..6),
4694 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4695 }
4696 );
4697 assert_eq!(
4698 parser(r"\UFFFFG").parse_escape().unwrap_err(),
4699 TestError {
4700 span: span(6..7),
4701 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4702 }
4703 );
4704 assert_eq!(
4705 parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4706 TestError {
4707 span: span(7..8),
4708 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4709 }
4710 );
4711 assert_eq!(
4712 parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4713 TestError {
4714 span: span(8..9),
4715 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4716 }
4717 );
4718 assert_eq!(
4719 parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4720 TestError {
4721 span: span(9..10),
4722 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4723 }
4724 );
4725 }
4726
4727 #[test]
4728 fn parse_hex_brace() {
4729 assert_eq!(
4730 parser(r"\u{26c4}").parse_escape(),
4731 Ok(Primitive::Literal(ast::Literal {
4732 span: span(0..8),
4733 kind: ast::LiteralKind::HexBrace(
4734 ast::HexLiteralKind::UnicodeShort
4735 ),
4736 c: '⛄',
4737 }))
4738 );
4739 assert_eq!(
4740 parser(r"\U{26c4}").parse_escape(),
4741 Ok(Primitive::Literal(ast::Literal {
4742 span: span(0..8),
4743 kind: ast::LiteralKind::HexBrace(
4744 ast::HexLiteralKind::UnicodeLong
4745 ),
4746 c: '⛄',
4747 }))
4748 );
4749 assert_eq!(
4750 parser(r"\x{26c4}").parse_escape(),
4751 Ok(Primitive::Literal(ast::Literal {
4752 span: span(0..8),
4753 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4754 c: '⛄',
4755 }))
4756 );
4757 assert_eq!(
4758 parser(r"\x{26C4}").parse_escape(),
4759 Ok(Primitive::Literal(ast::Literal {
4760 span: span(0..8),
4761 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4762 c: '⛄',
4763 }))
4764 );
4765 assert_eq!(
4766 parser(r"\x{10fFfF}").parse_escape(),
4767 Ok(Primitive::Literal(ast::Literal {
4768 span: span(0..10),
4769 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4770 c: '\u{10FFFF}',
4771 }))
4772 );
4773
4774 assert_eq!(
4775 parser(r"\x").parse_escape().unwrap_err(),
4776 TestError {
4777 span: span(2..2),
4778 kind: ast::ErrorKind::EscapeUnexpectedEof,
4779 }
4780 );
4781 assert_eq!(
4782 parser(r"\x{").parse_escape().unwrap_err(),
4783 TestError {
4784 span: span(2..3),
4785 kind: ast::ErrorKind::EscapeUnexpectedEof,
4786 }
4787 );
4788 assert_eq!(
4789 parser(r"\x{FF").parse_escape().unwrap_err(),
4790 TestError {
4791 span: span(2..5),
4792 kind: ast::ErrorKind::EscapeUnexpectedEof,
4793 }
4794 );
4795 assert_eq!(
4796 parser(r"\x{}").parse_escape().unwrap_err(),
4797 TestError {
4798 span: span(2..4),
4799 kind: ast::ErrorKind::EscapeHexEmpty,
4800 }
4801 );
4802 assert_eq!(
4803 parser(r"\x{FGF}").parse_escape().unwrap_err(),
4804 TestError {
4805 span: span(4..5),
4806 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4807 }
4808 );
4809 assert_eq!(
4810 parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
4811 TestError {
4812 span: span(3..9),
4813 kind: ast::ErrorKind::EscapeHexInvalid,
4814 }
4815 );
4816 assert_eq!(
4817 parser(r"\x{D800}").parse_escape().unwrap_err(),
4818 TestError {
4819 span: span(3..7),
4820 kind: ast::ErrorKind::EscapeHexInvalid,
4821 }
4822 );
4823 assert_eq!(
4824 parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
4825 TestError {
4826 span: span(3..12),
4827 kind: ast::ErrorKind::EscapeHexInvalid,
4828 }
4829 );
4830 }
4831
4832 #[test]
4833 fn parse_decimal() {
4834 assert_eq!(parser("123").parse_decimal(), Ok(123));
4835 assert_eq!(parser("0").parse_decimal(), Ok(0));
4836 assert_eq!(parser("01").parse_decimal(), Ok(1));
4837
4838 assert_eq!(
4839 parser("-1").parse_decimal().unwrap_err(),
4840 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4841 );
4842 assert_eq!(
4843 parser("").parse_decimal().unwrap_err(),
4844 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4845 );
4846 assert_eq!(
4847 parser("9999999999").parse_decimal().unwrap_err(),
4848 TestError {
4849 span: span(0..10),
4850 kind: ast::ErrorKind::DecimalInvalid,
4851 }
4852 );
4853 }
4854
4855 #[test]
4856 fn parse_set_class() {
4857 fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
4858 ast::ClassSet::union(ast::ClassSetUnion { span, items })
4859 }
4860
4861 fn intersection(
4862 span: Span,
4863 lhs: ast::ClassSet,
4864 rhs: ast::ClassSet,
4865 ) -> ast::ClassSet {
4866 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4867 span,
4868 kind: ast::ClassSetBinaryOpKind::Intersection,
4869 lhs: Box::new(lhs),
4870 rhs: Box::new(rhs),
4871 })
4872 }
4873
4874 fn difference(
4875 span: Span,
4876 lhs: ast::ClassSet,
4877 rhs: ast::ClassSet,
4878 ) -> ast::ClassSet {
4879 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4880 span,
4881 kind: ast::ClassSetBinaryOpKind::Difference,
4882 lhs: Box::new(lhs),
4883 rhs: Box::new(rhs),
4884 })
4885 }
4886
4887 fn symdifference(
4888 span: Span,
4889 lhs: ast::ClassSet,
4890 rhs: ast::ClassSet,
4891 ) -> ast::ClassSet {
4892 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4893 span,
4894 kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
4895 lhs: Box::new(lhs),
4896 rhs: Box::new(rhs),
4897 })
4898 }
4899
4900 fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
4901 ast::ClassSet::Item(item)
4902 }
4903
4904 fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
4905 ast::ClassSetItem::Ascii(cls)
4906 }
4907
4908 fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
4909 ast::ClassSetItem::Unicode(cls)
4910 }
4911
4912 fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
4913 ast::ClassSetItem::Perl(cls)
4914 }
4915
4916 fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
4917 ast::ClassSetItem::Bracketed(Box::new(cls))
4918 }
4919
4920 fn lit(span: Span, c: char) -> ast::ClassSetItem {
4921 ast::ClassSetItem::Literal(ast::Literal {
4922 span,
4923 kind: ast::LiteralKind::Verbatim,
4924 c,
4925 })
4926 }
4927
4928 fn empty(span: Span) -> ast::ClassSetItem {
4929 ast::ClassSetItem::Empty(span)
4930 }
4931
4932 fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
4933 let pos1 = Position {
4934 offset: span.start.offset + start.len_utf8(),
4935 column: span.start.column + 1,
4936 ..span.start
4937 };
4938 let pos2 = Position {
4939 offset: span.end.offset - end.len_utf8(),
4940 column: span.end.column - 1,
4941 ..span.end
4942 };
4943 ast::ClassSetItem::Range(ast::ClassSetRange {
4944 span,
4945 start: ast::Literal {
4946 span: Span { end: pos1, ..span },
4947 kind: ast::LiteralKind::Verbatim,
4948 c: start,
4949 },
4950 end: ast::Literal {
4951 span: Span { start: pos2, ..span },
4952 kind: ast::LiteralKind::Verbatim,
4953 c: end,
4954 },
4955 })
4956 }
4957
4958 fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
4959 ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated }
4960 }
4961
4962 fn lower(span: Span, negated: bool) -> ast::ClassAscii {
4963 ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated }
4964 }
4965
4966 assert_eq!(
4967 parser("[[:alnum:]]").parse(),
4968 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4969 span: span(0..11),
4970 negated: false,
4971 kind: itemset(item_ascii(alnum(span(1..10), false))),
4972 })))
4973 );
4974 assert_eq!(
4975 parser("[[[:alnum:]]]").parse(),
4976 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4977 span: span(0..13),
4978 negated: false,
4979 kind: itemset(item_bracket(ast::ClassBracketed {
4980 span: span(1..12),
4981 negated: false,
4982 kind: itemset(item_ascii(alnum(span(2..11), false))),
4983 })),
4984 })))
4985 );
4986 assert_eq!(
4987 parser("[[:alnum:]&&[:lower:]]").parse(),
4988 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4989 span: span(0..22),
4990 negated: false,
4991 kind: intersection(
4992 span(1..21),
4993 itemset(item_ascii(alnum(span(1..10), false))),
4994 itemset(item_ascii(lower(span(12..21), false))),
4995 ),
4996 })))
4997 );
4998 assert_eq!(
4999 parser("[[:alnum:]--[:lower:]]").parse(),
5000 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5001 span: span(0..22),
5002 negated: false,
5003 kind: difference(
5004 span(1..21),
5005 itemset(item_ascii(alnum(span(1..10), false))),
5006 itemset(item_ascii(lower(span(12..21), false))),
5007 ),
5008 })))
5009 );
5010 assert_eq!(
5011 parser("[[:alnum:]~~[:lower:]]").parse(),
5012 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5013 span: span(0..22),
5014 negated: false,
5015 kind: symdifference(
5016 span(1..21),
5017 itemset(item_ascii(alnum(span(1..10), false))),
5018 itemset(item_ascii(lower(span(12..21), false))),
5019 ),
5020 })))
5021 );
5022
5023 assert_eq!(
5024 parser("[a]").parse(),
5025 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5026 span: span(0..3),
5027 negated: false,
5028 kind: itemset(lit(span(1..2), 'a')),
5029 })))
5030 );
5031 assert_eq!(
5032 parser(r"[a\]]").parse(),
5033 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5034 span: span(0..5),
5035 negated: false,
5036 kind: union(
5037 span(1..4),
5038 vec![
5039 lit(span(1..2), 'a'),
5040 ast::ClassSetItem::Literal(ast::Literal {
5041 span: span(2..4),
5042 kind: ast::LiteralKind::Meta,
5043 c: ']',
5044 }),
5045 ]
5046 ),
5047 })))
5048 );
5049 assert_eq!(
5050 parser(r"[a\-z]").parse(),
5051 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5052 span: span(0..6),
5053 negated: false,
5054 kind: union(
5055 span(1..5),
5056 vec![
5057 lit(span(1..2), 'a'),
5058 ast::ClassSetItem::Literal(ast::Literal {
5059 span: span(2..4),
5060 kind: ast::LiteralKind::Meta,
5061 c: '-',
5062 }),
5063 lit(span(4..5), 'z'),
5064 ]
5065 ),
5066 })))
5067 );
5068 assert_eq!(
5069 parser("[ab]").parse(),
5070 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5071 span: span(0..4),
5072 negated: false,
5073 kind: union(
5074 span(1..3),
5075 vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
5076 ),
5077 })))
5078 );
5079 assert_eq!(
5080 parser("[a-]").parse(),
5081 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5082 span: span(0..4),
5083 negated: false,
5084 kind: union(
5085 span(1..3),
5086 vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
5087 ),
5088 })))
5089 );
5090 assert_eq!(
5091 parser("[-a]").parse(),
5092 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5093 span: span(0..4),
5094 negated: false,
5095 kind: union(
5096 span(1..3),
5097 vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
5098 ),
5099 })))
5100 );
5101 assert_eq!(
5102 parser(r"[\pL]").parse(),
5103 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5104 span: span(0..5),
5105 negated: false,
5106 kind: itemset(item_unicode(ast::ClassUnicode {
5107 span: span(1..4),
5108 negated: false,
5109 kind: ast::ClassUnicodeKind::OneLetter('L'),
5110 })),
5111 })))
5112 );
5113 assert_eq!(
5114 parser(r"[\w]").parse(),
5115 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5116 span: span(0..4),
5117 negated: false,
5118 kind: itemset(item_perl(ast::ClassPerl {
5119 span: span(1..3),
5120 kind: ast::ClassPerlKind::Word,
5121 negated: false,
5122 })),
5123 })))
5124 );
5125 assert_eq!(
5126 parser(r"[a\wz]").parse(),
5127 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5128 span: span(0..6),
5129 negated: false,
5130 kind: union(
5131 span(1..5),
5132 vec![
5133 lit(span(1..2), 'a'),
5134 item_perl(ast::ClassPerl {
5135 span: span(2..4),
5136 kind: ast::ClassPerlKind::Word,
5137 negated: false,
5138 }),
5139 lit(span(4..5), 'z'),
5140 ]
5141 ),
5142 })))
5143 );
5144
5145 assert_eq!(
5146 parser("[a-z]").parse(),
5147 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5148 span: span(0..5),
5149 negated: false,
5150 kind: itemset(range(span(1..4), 'a', 'z')),
5151 })))
5152 );
5153 assert_eq!(
5154 parser("[a-cx-z]").parse(),
5155 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5156 span: span(0..8),
5157 negated: false,
5158 kind: union(
5159 span(1..7),
5160 vec![
5161 range(span(1..4), 'a', 'c'),
5162 range(span(4..7), 'x', 'z'),
5163 ]
5164 ),
5165 })))
5166 );
5167 assert_eq!(
5168 parser(r"[\w&&a-cx-z]").parse(),
5169 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5170 span: span(0..12),
5171 negated: false,
5172 kind: intersection(
5173 span(1..11),
5174 itemset(item_perl(ast::ClassPerl {
5175 span: span(1..3),
5176 kind: ast::ClassPerlKind::Word,
5177 negated: false,
5178 })),
5179 union(
5180 span(5..11),
5181 vec![
5182 range(span(5..8), 'a', 'c'),
5183 range(span(8..11), 'x', 'z'),
5184 ]
5185 ),
5186 ),
5187 })))
5188 );
5189 assert_eq!(
5190 parser(r"[a-cx-z&&\w]").parse(),
5191 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5192 span: span(0..12),
5193 negated: false,
5194 kind: intersection(
5195 span(1..11),
5196 union(
5197 span(1..7),
5198 vec![
5199 range(span(1..4), 'a', 'c'),
5200 range(span(4..7), 'x', 'z'),
5201 ]
5202 ),
5203 itemset(item_perl(ast::ClassPerl {
5204 span: span(9..11),
5205 kind: ast::ClassPerlKind::Word,
5206 negated: false,
5207 })),
5208 ),
5209 })))
5210 );
5211 assert_eq!(
5212 parser(r"[a--b--c]").parse(),
5213 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5214 span: span(0..9),
5215 negated: false,
5216 kind: difference(
5217 span(1..8),
5218 difference(
5219 span(1..5),
5220 itemset(lit(span(1..2), 'a')),
5221 itemset(lit(span(4..5), 'b')),
5222 ),
5223 itemset(lit(span(7..8), 'c')),
5224 ),
5225 })))
5226 );
5227 assert_eq!(
5228 parser(r"[a~~b~~c]").parse(),
5229 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5230 span: span(0..9),
5231 negated: false,
5232 kind: symdifference(
5233 span(1..8),
5234 symdifference(
5235 span(1..5),
5236 itemset(lit(span(1..2), 'a')),
5237 itemset(lit(span(4..5), 'b')),
5238 ),
5239 itemset(lit(span(7..8), 'c')),
5240 ),
5241 })))
5242 );
5243 assert_eq!(
5244 parser(r"[\^&&^]").parse(),
5245 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5246 span: span(0..7),
5247 negated: false,
5248 kind: intersection(
5249 span(1..6),
5250 itemset(ast::ClassSetItem::Literal(ast::Literal {
5251 span: span(1..3),
5252 kind: ast::LiteralKind::Meta,
5253 c: '^',
5254 })),
5255 itemset(lit(span(5..6), '^')),
5256 ),
5257 })))
5258 );
5259 assert_eq!(
5260 parser(r"[\&&&&]").parse(),
5261 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5262 span: span(0..7),
5263 negated: false,
5264 kind: intersection(
5265 span(1..6),
5266 itemset(ast::ClassSetItem::Literal(ast::Literal {
5267 span: span(1..3),
5268 kind: ast::LiteralKind::Meta,
5269 c: '&',
5270 })),
5271 itemset(lit(span(5..6), '&')),
5272 ),
5273 })))
5274 );
5275 assert_eq!(
5276 parser(r"[&&&&]").parse(),
5277 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5278 span: span(0..6),
5279 negated: false,
5280 kind: intersection(
5281 span(1..5),
5282 intersection(
5283 span(1..3),
5284 itemset(empty(span(1..1))),
5285 itemset(empty(span(3..3))),
5286 ),
5287 itemset(empty(span(5..5))),
5288 ),
5289 })))
5290 );
5291
5292 let pat = "[☃-⛄]";
5293 assert_eq!(
5294 parser(pat).parse(),
5295 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5296 span: span_range(pat, 0..9),
5297 negated: false,
5298 kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
5299 span: span_range(pat, 1..8),
5300 start: ast::Literal {
5301 span: span_range(pat, 1..4),
5302 kind: ast::LiteralKind::Verbatim,
5303 c: '☃',
5304 },
5305 end: ast::Literal {
5306 span: span_range(pat, 5..8),
5307 kind: ast::LiteralKind::Verbatim,
5308 c: '⛄',
5309 },
5310 })),
5311 })))
5312 );
5313
5314 assert_eq!(
5315 parser(r"[]]").parse(),
5316 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5317 span: span(0..3),
5318 negated: false,
5319 kind: itemset(lit(span(1..2), ']')),
5320 })))
5321 );
5322 assert_eq!(
5323 parser(r"[]\[]").parse(),
5324 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5325 span: span(0..5),
5326 negated: false,
5327 kind: union(
5328 span(1..4),
5329 vec![
5330 lit(span(1..2), ']'),
5331 ast::ClassSetItem::Literal(ast::Literal {
5332 span: span(2..4),
5333 kind: ast::LiteralKind::Meta,
5334 c: '[',
5335 }),
5336 ]
5337 ),
5338 })))
5339 );
5340 assert_eq!(
5341 parser(r"[\[]]").parse(),
5342 Ok(concat(
5343 0..5,
5344 vec![
5345 Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5346 span: span(0..4),
5347 negated: false,
5348 kind: itemset(ast::ClassSetItem::Literal(
5349 ast::Literal {
5350 span: span(1..3),
5351 kind: ast::LiteralKind::Meta,
5352 c: '[',
5353 }
5354 )),
5355 })),
5356 Ast::Literal(ast::Literal {
5357 span: span(4..5),
5358 kind: ast::LiteralKind::Verbatim,
5359 c: ']',
5360 }),
5361 ]
5362 ))
5363 );
5364
5365 assert_eq!(
5366 parser("[").parse().unwrap_err(),
5367 TestError {
5368 span: span(0..1),
5369 kind: ast::ErrorKind::ClassUnclosed,
5370 }
5371 );
5372 assert_eq!(
5373 parser("[[").parse().unwrap_err(),
5374 TestError {
5375 span: span(1..2),
5376 kind: ast::ErrorKind::ClassUnclosed,
5377 }
5378 );
5379 assert_eq!(
5380 parser("[[-]").parse().unwrap_err(),
5381 TestError {
5382 span: span(0..1),
5383 kind: ast::ErrorKind::ClassUnclosed,
5384 }
5385 );
5386 assert_eq!(
5387 parser("[[[:alnum:]").parse().unwrap_err(),
5388 TestError {
5389 span: span(1..2),
5390 kind: ast::ErrorKind::ClassUnclosed,
5391 }
5392 );
5393 assert_eq!(
5394 parser(r"[\b]").parse().unwrap_err(),
5395 TestError {
5396 span: span(1..3),
5397 kind: ast::ErrorKind::ClassEscapeInvalid,
5398 }
5399 );
5400 assert_eq!(
5401 parser(r"[\w-a]").parse().unwrap_err(),
5402 TestError {
5403 span: span(1..3),
5404 kind: ast::ErrorKind::ClassRangeLiteral,
5405 }
5406 );
5407 assert_eq!(
5408 parser(r"[a-\w]").parse().unwrap_err(),
5409 TestError {
5410 span: span(3..5),
5411 kind: ast::ErrorKind::ClassRangeLiteral,
5412 }
5413 );
5414 assert_eq!(
5415 parser(r"[z-a]").parse().unwrap_err(),
5416 TestError {
5417 span: span(1..4),
5418 kind: ast::ErrorKind::ClassRangeInvalid,
5419 }
5420 );
5421
5422 assert_eq!(
5423 parser_ignore_whitespace("[a ").parse().unwrap_err(),
5424 TestError {
5425 span: span(0..1),
5426 kind: ast::ErrorKind::ClassUnclosed,
5427 }
5428 );
5429 assert_eq!(
5430 parser_ignore_whitespace("[a- ").parse().unwrap_err(),
5431 TestError {
5432 span: span(0..1),
5433 kind: ast::ErrorKind::ClassUnclosed,
5434 }
5435 );
5436 }
5437
5438 #[test]
5439 fn parse_set_class_open() {
5440 assert_eq!(parser("[a]").parse_set_class_open(), {
5441 let set = ast::ClassBracketed {
5442 span: span(0..1),
5443 negated: false,
5444 kind: ast::ClassSet::union(ast::ClassSetUnion {
5445 span: span(1..1),
5446 items: vec![],
5447 }),
5448 };
5449 let union = ast::ClassSetUnion { span: span(1..1), items: vec![] };
5450 Ok((set, union))
5451 });
5452 assert_eq!(
5453 parser_ignore_whitespace("[ a]").parse_set_class_open(),
5454 {
5455 let set = ast::ClassBracketed {
5456 span: span(0..4),
5457 negated: false,
5458 kind: ast::ClassSet::union(ast::ClassSetUnion {
5459 span: span(4..4),
5460 items: vec![],
5461 }),
5462 };
5463 let union =
5464 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5465 Ok((set, union))
5466 }
5467 );
5468 assert_eq!(parser("[^a]").parse_set_class_open(), {
5469 let set = ast::ClassBracketed {
5470 span: span(0..2),
5471 negated: true,
5472 kind: ast::ClassSet::union(ast::ClassSetUnion {
5473 span: span(2..2),
5474 items: vec![],
5475 }),
5476 };
5477 let union = ast::ClassSetUnion { span: span(2..2), items: vec![] };
5478 Ok((set, union))
5479 });
5480 assert_eq!(
5481 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
5482 {
5483 let set = ast::ClassBracketed {
5484 span: span(0..4),
5485 negated: true,
5486 kind: ast::ClassSet::union(ast::ClassSetUnion {
5487 span: span(4..4),
5488 items: vec![],
5489 }),
5490 };
5491 let union =
5492 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5493 Ok((set, union))
5494 }
5495 );
5496 assert_eq!(parser("[-a]").parse_set_class_open(), {
5497 let set = ast::ClassBracketed {
5498 span: span(0..2),
5499 negated: false,
5500 kind: ast::ClassSet::union(ast::ClassSetUnion {
5501 span: span(1..1),
5502 items: vec![],
5503 }),
5504 };
5505 let union = ast::ClassSetUnion {
5506 span: span(1..2),
5507 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5508 span: span(1..2),
5509 kind: ast::LiteralKind::Verbatim,
5510 c: '-',
5511 })],
5512 };
5513 Ok((set, union))
5514 });
5515 assert_eq!(
5516 parser_ignore_whitespace("[ - a]").parse_set_class_open(),
5517 {
5518 let set = ast::ClassBracketed {
5519 span: span(0..4),
5520 negated: false,
5521 kind: ast::ClassSet::union(ast::ClassSetUnion {
5522 span: span(2..2),
5523 items: vec![],
5524 }),
5525 };
5526 let union = ast::ClassSetUnion {
5527 span: span(2..3),
5528 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5529 span: span(2..3),
5530 kind: ast::LiteralKind::Verbatim,
5531 c: '-',
5532 })],
5533 };
5534 Ok((set, union))
5535 }
5536 );
5537 assert_eq!(parser("[^-a]").parse_set_class_open(), {
5538 let set = ast::ClassBracketed {
5539 span: span(0..3),
5540 negated: true,
5541 kind: ast::ClassSet::union(ast::ClassSetUnion {
5542 span: span(2..2),
5543 items: vec![],
5544 }),
5545 };
5546 let union = ast::ClassSetUnion {
5547 span: span(2..3),
5548 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5549 span: span(2..3),
5550 kind: ast::LiteralKind::Verbatim,
5551 c: '-',
5552 })],
5553 };
5554 Ok((set, union))
5555 });
5556 assert_eq!(parser("[--a]").parse_set_class_open(), {
5557 let set = ast::ClassBracketed {
5558 span: span(0..3),
5559 negated: false,
5560 kind: ast::ClassSet::union(ast::ClassSetUnion {
5561 span: span(1..1),
5562 items: vec![],
5563 }),
5564 };
5565 let union = ast::ClassSetUnion {
5566 span: span(1..3),
5567 items: vec![
5568 ast::ClassSetItem::Literal(ast::Literal {
5569 span: span(1..2),
5570 kind: ast::LiteralKind::Verbatim,
5571 c: '-',
5572 }),
5573 ast::ClassSetItem::Literal(ast::Literal {
5574 span: span(2..3),
5575 kind: ast::LiteralKind::Verbatim,
5576 c: '-',
5577 }),
5578 ],
5579 };
5580 Ok((set, union))
5581 });
5582 assert_eq!(parser("[]a]").parse_set_class_open(), {
5583 let set = ast::ClassBracketed {
5584 span: span(0..2),
5585 negated: false,
5586 kind: ast::ClassSet::union(ast::ClassSetUnion {
5587 span: span(1..1),
5588 items: vec![],
5589 }),
5590 };
5591 let union = ast::ClassSetUnion {
5592 span: span(1..2),
5593 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5594 span: span(1..2),
5595 kind: ast::LiteralKind::Verbatim,
5596 c: ']',
5597 })],
5598 };
5599 Ok((set, union))
5600 });
5601 assert_eq!(
5602 parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
5603 {
5604 let set = ast::ClassBracketed {
5605 span: span(0..4),
5606 negated: false,
5607 kind: ast::ClassSet::union(ast::ClassSetUnion {
5608 span: span(2..2),
5609 items: vec![],
5610 }),
5611 };
5612 let union = ast::ClassSetUnion {
5613 span: span(2..3),
5614 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5615 span: span(2..3),
5616 kind: ast::LiteralKind::Verbatim,
5617 c: ']',
5618 })],
5619 };
5620 Ok((set, union))
5621 }
5622 );
5623 assert_eq!(parser("[^]a]").parse_set_class_open(), {
5624 let set = ast::ClassBracketed {
5625 span: span(0..3),
5626 negated: true,
5627 kind: ast::ClassSet::union(ast::ClassSetUnion {
5628 span: span(2..2),
5629 items: vec![],
5630 }),
5631 };
5632 let union = ast::ClassSetUnion {
5633 span: span(2..3),
5634 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5635 span: span(2..3),
5636 kind: ast::LiteralKind::Verbatim,
5637 c: ']',
5638 })],
5639 };
5640 Ok((set, union))
5641 });
5642 assert_eq!(parser("[-]a]").parse_set_class_open(), {
5643 let set = ast::ClassBracketed {
5644 span: span(0..2),
5645 negated: false,
5646 kind: ast::ClassSet::union(ast::ClassSetUnion {
5647 span: span(1..1),
5648 items: vec![],
5649 }),
5650 };
5651 let union = ast::ClassSetUnion {
5652 span: span(1..2),
5653 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5654 span: span(1..2),
5655 kind: ast::LiteralKind::Verbatim,
5656 c: '-',
5657 })],
5658 };
5659 Ok((set, union))
5660 });
5661
5662 assert_eq!(
5663 parser("[").parse_set_class_open().unwrap_err(),
5664 TestError {
5665 span: span(0..1),
5666 kind: ast::ErrorKind::ClassUnclosed,
5667 }
5668 );
5669 assert_eq!(
5670 parser_ignore_whitespace("[ ")
5671 .parse_set_class_open()
5672 .unwrap_err(),
5673 TestError {
5674 span: span(0..5),
5675 kind: ast::ErrorKind::ClassUnclosed,
5676 }
5677 );
5678 assert_eq!(
5679 parser("[^").parse_set_class_open().unwrap_err(),
5680 TestError {
5681 span: span(0..2),
5682 kind: ast::ErrorKind::ClassUnclosed,
5683 }
5684 );
5685 assert_eq!(
5686 parser("[]").parse_set_class_open().unwrap_err(),
5687 TestError {
5688 span: span(0..2),
5689 kind: ast::ErrorKind::ClassUnclosed,
5690 }
5691 );
5692 assert_eq!(
5693 parser("[-").parse_set_class_open().unwrap_err(),
5694 TestError {
5695 span: span(0..0),
5696 kind: ast::ErrorKind::ClassUnclosed,
5697 }
5698 );
5699 assert_eq!(
5700 parser("[--").parse_set_class_open().unwrap_err(),
5701 TestError {
5702 span: span(0..0),
5703 kind: ast::ErrorKind::ClassUnclosed,
5704 }
5705 );
5706
5707 // See: https://github.com/rust-lang/regex/issues/792
5708 assert_eq!(
5709 parser("(?x)[-#]").parse_with_comments().unwrap_err(),
5710 TestError {
5711 span: span(4..4),
5712 kind: ast::ErrorKind::ClassUnclosed,
5713 }
5714 );
5715 }
5716
5717 #[test]
5718 fn maybe_parse_ascii_class() {
5719 assert_eq!(
5720 parser(r"[:alnum:]").maybe_parse_ascii_class(),
5721 Some(ast::ClassAscii {
5722 span: span(0..9),
5723 kind: ast::ClassAsciiKind::Alnum,
5724 negated: false,
5725 })
5726 );
5727 assert_eq!(
5728 parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5729 Some(ast::ClassAscii {
5730 span: span(0..9),
5731 kind: ast::ClassAsciiKind::Alnum,
5732 negated: false,
5733 })
5734 );
5735 assert_eq!(
5736 parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5737 Some(ast::ClassAscii {
5738 span: span(0..10),
5739 kind: ast::ClassAsciiKind::Alnum,
5740 negated: true,
5741 })
5742 );
5743
5744 let p = parser(r"[:");
5745 assert_eq!(p.maybe_parse_ascii_class(), None);
5746 assert_eq!(p.offset(), 0);
5747
5748 let p = parser(r"[:^");
5749 assert_eq!(p.maybe_parse_ascii_class(), None);
5750 assert_eq!(p.offset(), 0);
5751
5752 let p = parser(r"[^:alnum:]");
5753 assert_eq!(p.maybe_parse_ascii_class(), None);
5754 assert_eq!(p.offset(), 0);
5755
5756 let p = parser(r"[:alnnum:]");
5757 assert_eq!(p.maybe_parse_ascii_class(), None);
5758 assert_eq!(p.offset(), 0);
5759
5760 let p = parser(r"[:alnum]");
5761 assert_eq!(p.maybe_parse_ascii_class(), None);
5762 assert_eq!(p.offset(), 0);
5763
5764 let p = parser(r"[:alnum:");
5765 assert_eq!(p.maybe_parse_ascii_class(), None);
5766 assert_eq!(p.offset(), 0);
5767 }
5768
5769 #[test]
5770 fn parse_unicode_class() {
5771 assert_eq!(
5772 parser(r"\pN").parse_escape(),
5773 Ok(Primitive::Unicode(ast::ClassUnicode {
5774 span: span(0..3),
5775 negated: false,
5776 kind: ast::ClassUnicodeKind::OneLetter('N'),
5777 }))
5778 );
5779 assert_eq!(
5780 parser(r"\PN").parse_escape(),
5781 Ok(Primitive::Unicode(ast::ClassUnicode {
5782 span: span(0..3),
5783 negated: true,
5784 kind: ast::ClassUnicodeKind::OneLetter('N'),
5785 }))
5786 );
5787 assert_eq!(
5788 parser(r"\p{N}").parse_escape(),
5789 Ok(Primitive::Unicode(ast::ClassUnicode {
5790 span: span(0..5),
5791 negated: false,
5792 kind: ast::ClassUnicodeKind::Named(s("N")),
5793 }))
5794 );
5795 assert_eq!(
5796 parser(r"\P{N}").parse_escape(),
5797 Ok(Primitive::Unicode(ast::ClassUnicode {
5798 span: span(0..5),
5799 negated: true,
5800 kind: ast::ClassUnicodeKind::Named(s("N")),
5801 }))
5802 );
5803 assert_eq!(
5804 parser(r"\p{Greek}").parse_escape(),
5805 Ok(Primitive::Unicode(ast::ClassUnicode {
5806 span: span(0..9),
5807 negated: false,
5808 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5809 }))
5810 );
5811
5812 assert_eq!(
5813 parser(r"\p{scx:Katakana}").parse_escape(),
5814 Ok(Primitive::Unicode(ast::ClassUnicode {
5815 span: span(0..16),
5816 negated: false,
5817 kind: ast::ClassUnicodeKind::NamedValue {
5818 op: ast::ClassUnicodeOpKind::Colon,
5819 name: s("scx"),
5820 value: s("Katakana"),
5821 },
5822 }))
5823 );
5824 assert_eq!(
5825 parser(r"\p{scx=Katakana}").parse_escape(),
5826 Ok(Primitive::Unicode(ast::ClassUnicode {
5827 span: span(0..16),
5828 negated: false,
5829 kind: ast::ClassUnicodeKind::NamedValue {
5830 op: ast::ClassUnicodeOpKind::Equal,
5831 name: s("scx"),
5832 value: s("Katakana"),
5833 },
5834 }))
5835 );
5836 assert_eq!(
5837 parser(r"\p{scx!=Katakana}").parse_escape(),
5838 Ok(Primitive::Unicode(ast::ClassUnicode {
5839 span: span(0..17),
5840 negated: false,
5841 kind: ast::ClassUnicodeKind::NamedValue {
5842 op: ast::ClassUnicodeOpKind::NotEqual,
5843 name: s("scx"),
5844 value: s("Katakana"),
5845 },
5846 }))
5847 );
5848
5849 assert_eq!(
5850 parser(r"\p{:}").parse_escape(),
5851 Ok(Primitive::Unicode(ast::ClassUnicode {
5852 span: span(0..5),
5853 negated: false,
5854 kind: ast::ClassUnicodeKind::NamedValue {
5855 op: ast::ClassUnicodeOpKind::Colon,
5856 name: s(""),
5857 value: s(""),
5858 },
5859 }))
5860 );
5861 assert_eq!(
5862 parser(r"\p{=}").parse_escape(),
5863 Ok(Primitive::Unicode(ast::ClassUnicode {
5864 span: span(0..5),
5865 negated: false,
5866 kind: ast::ClassUnicodeKind::NamedValue {
5867 op: ast::ClassUnicodeOpKind::Equal,
5868 name: s(""),
5869 value: s(""),
5870 },
5871 }))
5872 );
5873 assert_eq!(
5874 parser(r"\p{!=}").parse_escape(),
5875 Ok(Primitive::Unicode(ast::ClassUnicode {
5876 span: span(0..6),
5877 negated: false,
5878 kind: ast::ClassUnicodeKind::NamedValue {
5879 op: ast::ClassUnicodeOpKind::NotEqual,
5880 name: s(""),
5881 value: s(""),
5882 },
5883 }))
5884 );
5885
5886 assert_eq!(
5887 parser(r"\p").parse_escape().unwrap_err(),
5888 TestError {
5889 span: span(2..2),
5890 kind: ast::ErrorKind::EscapeUnexpectedEof,
5891 }
5892 );
5893 assert_eq!(
5894 parser(r"\p{").parse_escape().unwrap_err(),
5895 TestError {
5896 span: span(3..3),
5897 kind: ast::ErrorKind::EscapeUnexpectedEof,
5898 }
5899 );
5900 assert_eq!(
5901 parser(r"\p{N").parse_escape().unwrap_err(),
5902 TestError {
5903 span: span(4..4),
5904 kind: ast::ErrorKind::EscapeUnexpectedEof,
5905 }
5906 );
5907 assert_eq!(
5908 parser(r"\p{Greek").parse_escape().unwrap_err(),
5909 TestError {
5910 span: span(8..8),
5911 kind: ast::ErrorKind::EscapeUnexpectedEof,
5912 }
5913 );
5914
5915 assert_eq!(
5916 parser(r"\pNz").parse(),
5917 Ok(Ast::Concat(ast::Concat {
5918 span: span(0..4),
5919 asts: vec![
5920 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5921 span: span(0..3),
5922 negated: false,
5923 kind: ast::ClassUnicodeKind::OneLetter('N'),
5924 })),
5925 Ast::Literal(ast::Literal {
5926 span: span(3..4),
5927 kind: ast::LiteralKind::Verbatim,
5928 c: 'z',
5929 }),
5930 ],
5931 }))
5932 );
5933 assert_eq!(
5934 parser(r"\p{Greek}z").parse(),
5935 Ok(Ast::Concat(ast::Concat {
5936 span: span(0..10),
5937 asts: vec![
5938 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5939 span: span(0..9),
5940 negated: false,
5941 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5942 })),
5943 Ast::Literal(ast::Literal {
5944 span: span(9..10),
5945 kind: ast::LiteralKind::Verbatim,
5946 c: 'z',
5947 }),
5948 ],
5949 }))
5950 );
5951 assert_eq!(
5952 parser(r"\p\{").parse().unwrap_err(),
5953 TestError {
5954 span: span(2..3),
5955 kind: ast::ErrorKind::UnicodeClassInvalid,
5956 }
5957 );
5958 assert_eq!(
5959 parser(r"\P\{").parse().unwrap_err(),
5960 TestError {
5961 span: span(2..3),
5962 kind: ast::ErrorKind::UnicodeClassInvalid,
5963 }
5964 );
5965 }
5966
5967 #[test]
5968 fn parse_perl_class() {
5969 assert_eq!(
5970 parser(r"\d").parse_escape(),
5971 Ok(Primitive::Perl(ast::ClassPerl {
5972 span: span(0..2),
5973 kind: ast::ClassPerlKind::Digit,
5974 negated: false,
5975 }))
5976 );
5977 assert_eq!(
5978 parser(r"\D").parse_escape(),
5979 Ok(Primitive::Perl(ast::ClassPerl {
5980 span: span(0..2),
5981 kind: ast::ClassPerlKind::Digit,
5982 negated: true,
5983 }))
5984 );
5985 assert_eq!(
5986 parser(r"\s").parse_escape(),
5987 Ok(Primitive::Perl(ast::ClassPerl {
5988 span: span(0..2),
5989 kind: ast::ClassPerlKind::Space,
5990 negated: false,
5991 }))
5992 );
5993 assert_eq!(
5994 parser(r"\S").parse_escape(),
5995 Ok(Primitive::Perl(ast::ClassPerl {
5996 span: span(0..2),
5997 kind: ast::ClassPerlKind::Space,
5998 negated: true,
5999 }))
6000 );
6001 assert_eq!(
6002 parser(r"\w").parse_escape(),
6003 Ok(Primitive::Perl(ast::ClassPerl {
6004 span: span(0..2),
6005 kind: ast::ClassPerlKind::Word,
6006 negated: false,
6007 }))
6008 );
6009 assert_eq!(
6010 parser(r"\W").parse_escape(),
6011 Ok(Primitive::Perl(ast::ClassPerl {
6012 span: span(0..2),
6013 kind: ast::ClassPerlKind::Word,
6014 negated: true,
6015 }))
6016 );
6017
6018 assert_eq!(
6019 parser(r"\d").parse(),
6020 Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
6021 span: span(0..2),
6022 kind: ast::ClassPerlKind::Digit,
6023 negated: false,
6024 })))
6025 );
6026 assert_eq!(
6027 parser(r"\dz").parse(),
6028 Ok(Ast::Concat(ast::Concat {
6029 span: span(0..3),
6030 asts: vec![
6031 Ast::Class(ast::Class::Perl(ast::ClassPerl {
6032 span: span(0..2),
6033 kind: ast::ClassPerlKind::Digit,
6034 negated: false,
6035 })),
6036 Ast::Literal(ast::Literal {
6037 span: span(2..3),
6038 kind: ast::LiteralKind::Verbatim,
6039 c: 'z',
6040 }),
6041 ],
6042 }))
6043 );
6044 }
6045
    // This tests a bug fix where the nest limit checker wasn't decrementing
    // its depth during post-traversal, which caused long regexes to trip
    // the default limit too aggressively.
    //
    // The pattern below is long, but its groups nest only two levels deep,
    // so parsing it under a nest limit of 50 is expected to succeed.
    // NOTE(review): "454" presumably refers to an issue number in the
    // project's tracker -- confirm.
    #[test]
    fn regression_454_nest_too_big() {
        // The raw string is a fixture and is kept verbatim, line breaks and
        // all.
        let pattern = r#"
 2(?:
 [45]\d{3}|
 7(?:
 1[0-267]|
 2[0-289]|
 3[0-29]|
 4[01]|
 5[1-3]|
 6[013]|
 7[0178]|
 91
 )|
 8(?:
 0[125]|
 [139][1-6]|
 2[0157-9]|
 41|
 6[1-35]|
 7[1-5]|
 8[1-8]|
 90
 )|
 9(?:
 0[0-2]|
 1[0-4]|
 2[568]|
 3[3-6]|
 5[5-7]|
 6[0167]|
 7[15]|
 8[0146-9]
 )
 )\d{4}
 "#;
        // Only success is asserted; the resulting AST is not inspected.
        assert!(parser_nest_limit(pattern, 50).parse().is_ok());
    }
6088
    // This tests that we treat a trailing `-` in a character class as a
    // literal `-` even when whitespace mode is enabled and there is whitespace
    // after the trailing `-`.
    //
    // Every pattern starts with `(?x)`. The string fixtures are kept
    // verbatim, including their line breaks.
    #[test]
    fn regression_455_trailing_dash_ignore_whitespace() {
        // Closed classes in which whitespace follows the trailing `-`: on a
        // single line, spread over several lines, and with `# wat` after the
        // `a`. All of these must parse.
        assert!(parser("(?x)[ / - ]").parse().is_ok());
        assert!(parser("(?x)[ a - ]").parse().is_ok());
        assert!(parser(
            "(?x)[
 a
 - ]
 "
        )
        .parse()
        .is_ok());
        assert!(parser(
            "(?x)[
 a # wat
 - ]
 "
        )
        .parse()
        .is_ok());

        // The same shapes without a closing `]`: each must be rejected
        // rather than accepted because of the trailing `-`.
        assert!(parser("(?x)[ / -").parse().is_err());
        assert!(parser("(?x)[ / - ").parse().is_err());
        assert!(parser(
            "(?x)[
 / -
 "
        )
        .parse()
        .is_err());
        assert!(parser(
            "(?x)[
 / - # wat
 "
        )
        .parse()
        .is_err());
    }
6130}
6131