1/*!
2This module provides a regular expression parser.
3*/
4
5use core::{
6 borrow::Borrow,
7 cell::{Cell, RefCell},
8 mem,
9};
10
11use alloc::{
12 boxed::Box,
13 string::{String, ToString},
14 vec,
15 vec::Vec,
16};
17
18use crate::{
19 ast::{self, Ast, Position, Span},
20 either::Either,
21 is_escapeable_character, is_meta_character,
22};
23
/// Result alias used throughout this module; the error type is always
/// [`ast::Error`].
type Result<T> = core::result::Result<T, ast::Error>;
25
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal. This is the only primitive accepted both as a character
    /// class item and as an endpoint of a class range.
    Literal(ast::Literal),
    /// An assertion. Rejected when it appears inside a character class.
    Assertion(ast::Assertion),
    /// The dot. Only its span is recorded. Rejected when it appears inside
    /// a character class.
    Dot(Span),
    /// A Perl character class. Accepted as a character class item, but not
    /// as an endpoint of a class range.
    Perl(ast::ClassPerl),
    /// A Unicode character class. Accepted as a character class item, but
    /// not as an endpoint of a class range.
    Unicode(ast::ClassUnicode),
}
40
41impl Primitive {
42 /// Return the span of this primitive.
43 fn span(&self) -> &Span {
44 match *self {
45 Primitive::Literal(ref x) => &x.span,
46 Primitive::Assertion(ref x) => &x.span,
47 Primitive::Dot(ref span) => span,
48 Primitive::Perl(ref x) => &x.span,
49 Primitive::Unicode(ref x) => &x.span,
50 }
51 }
52
53 /// Convert this primitive into a proper AST.
54 fn into_ast(self) -> Ast {
55 match self {
56 Primitive::Literal(lit) => Ast::literal(lit),
57 Primitive::Assertion(assert) => Ast::assertion(assert),
58 Primitive::Dot(span) => Ast::dot(span),
59 Primitive::Perl(cls) => Ast::class_perl(cls),
60 Primitive::Unicode(cls) => Ast::class_unicode(cls),
61 }
62 }
63
64 /// Convert this primitive into an item in a character class.
65 ///
66 /// If this primitive is not a legal item (i.e., an assertion or a dot),
67 /// then return an error.
68 fn into_class_set_item<P: Borrow<Parser>>(
69 self,
70 p: &ParserI<'_, P>,
71 ) -> Result<ast::ClassSetItem> {
72 use self::Primitive::*;
73 use crate::ast::ClassSetItem;
74
75 match self {
76 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
77 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
78 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
79 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
80 }
81 }
82
83 /// Convert this primitive into a literal in a character class. In
84 /// particular, literals are the only valid items that can appear in
85 /// ranges.
86 ///
87 /// If this primitive is not a legal item (i.e., a class, assertion or a
88 /// dot), then return an error.
89 fn into_class_literal<P: Borrow<Parser>>(
90 self,
91 p: &ParserI<'_, P>,
92 ) -> Result<ast::Literal> {
93 use self::Primitive::*;
94
95 match self {
96 Literal(lit) => Ok(lit),
97 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
98 }
99 }
100}
101
/// Returns true if the given character is an ASCII hexadecimal digit, i.e.,
/// one of `0-9`, `a-f` or `A-F`.
///
/// Non-ASCII characters (including non-ASCII digits) are never considered
/// hexadecimal digits.
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
106
/// Returns true if the given character is valid in a capture group name.
///
/// An underscore is always valid. When `first` is true, `c` is treated as
/// the first character of the name and must otherwise be alphabetic. When
/// `first` is false, `c` may also be `.`, `[`, `]` or any alphanumeric
/// character.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        // Punctuation that is only permitted after the first character.
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
118
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive(Clone, Debug)]
pub struct ParserBuilder {
    /// Whether built parsers start in verbose (`x` flag) mode. Defaults to
    /// `false`; set via [`ParserBuilder::ignore_whitespace`].
    ignore_whitespace: bool,
    /// The nesting limit handed to built parsers. Defaults to `250`; set via
    /// [`ParserBuilder::nest_limit`].
    nest_limit: u32,
    /// Whether built parsers accept octal syntax. Defaults to `false`; set
    /// via [`ParserBuilder::octal`].
    octal: bool,
}
128
129impl Default for ParserBuilder {
130 fn default() -> ParserBuilder {
131 ParserBuilder::new()
132 }
133}
134
135impl ParserBuilder {
136 /// Create a new parser builder with a default configuration.
137 pub fn new() -> ParserBuilder {
138 ParserBuilder {
139 ignore_whitespace: false,
140 nest_limit: 250,
141 octal: false,
142 }
143 }
144
145 /// Build a parser from this configuration with the given pattern.
146 pub fn build(&self) -> Parser {
147 Parser {
148 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
149 capture_index: Cell::new(0),
150 nest_limit: self.nest_limit,
151 octal: self.octal,
152 initial_ignore_whitespace: self.ignore_whitespace,
153 ignore_whitespace: Cell::new(self.ignore_whitespace),
154 comments: RefCell::new(vec![]),
155 stack_group: RefCell::new(vec![]),
156 stack_class: RefCell::new(vec![]),
157 capture_names: RefCell::new(vec![]),
158 scratch: RefCell::new(String::new()),
159 }
160 }
161
162 /// Set the nesting limit for this parser.
163 ///
164 /// The nesting limit controls how deep the abstract syntax tree is allowed
165 /// to be. If the AST exceeds the given limit (e.g., with too many nested
166 /// groups), then an error is returned by the parser.
167 ///
168 /// The purpose of this limit is to act as a heuristic to prevent stack
169 /// overflow for consumers that do structural induction on an `Ast` using
170 /// explicit recursion. While this crate never does this (instead using
171 /// constant stack space and moving the call stack to the heap), other
172 /// crates may.
173 ///
174 /// This limit is not checked until the entire AST is parsed. Therefore,
175 /// if callers want to put a limit on the amount of heap space used, then
176 /// they should impose a limit on the length, in bytes, of the concrete
177 /// pattern string. In particular, this is viable since this parser
178 /// implementation will limit itself to heap space proportional to the
179 /// length of the pattern string.
180 ///
181 /// Note that a nest limit of `0` will return a nest limit error for most
182 /// patterns but not all. For example, a nest limit of `0` permits `a` but
183 /// not `ab`, since `ab` requires a concatenation, which results in a nest
184 /// depth of `1`. In general, a nest limit is not something that manifests
185 /// in an obvious way in the concrete syntax, therefore, it should not be
186 /// used in a granular way.
187 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
188 self.nest_limit = limit;
189 self
190 }
191
192 /// Whether to support octal syntax or not.
193 ///
194 /// Octal syntax is a little-known way of uttering Unicode codepoints in
195 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
196 /// `\141` are all equivalent regular expressions, where the last example
197 /// shows octal syntax.
198 ///
199 /// While supporting octal syntax isn't in and of itself a problem, it does
200 /// make good error messages harder. That is, in PCRE based regex engines,
201 /// syntax like `\0` invokes a backreference, which is explicitly
202 /// unsupported in Rust's regex engine. However, many users expect it to
203 /// be supported. Therefore, when octal support is disabled, the error
204 /// message will explicitly mention that backreferences aren't supported.
205 ///
206 /// Octal syntax is disabled by default.
207 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
208 self.octal = yes;
209 self
210 }
211
212 /// Enable verbose mode in the regular expression.
213 ///
214 /// When enabled, verbose mode permits insignificant whitespace in many
215 /// places in the regular expression, as well as comments. Comments are
216 /// started using `#` and continue until the end of the line.
217 ///
218 /// By default, this is disabled. It may be selectively enabled in the
219 /// regular expression by using the `x` flag regardless of this setting.
220 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
221 self.ignore_whitespace = yes;
222 self
223 }
224}
225
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
///
/// All state that changes while parsing is stored in `Cell`/`RefCell`
/// fields, which lets the internal parsing routines operate through a
/// shared reference to the parser.
#[derive(Clone, Debug)]
pub struct Parser {
    /// The current position of the parser.
    pos: Cell<Position>,
    /// The current capture index. It starts at `0` and is incremented each
    /// time a new capture index is handed out.
    capture_index: Cell<u32>,
    /// The maximum number of open parens/brackets allowed. If the parser
    /// exceeds this number, then an error is returned.
    nest_limit: u32,
    /// Whether to support octal syntax or not. When `false`, the parser will
    /// return an error helpfully pointing out that backreferences are not
    /// supported.
    octal: bool,
    /// The initial setting for `ignore_whitespace` as provided by
    /// `ParserBuilder`. It is used when resetting the parser's state.
    initial_ignore_whitespace: bool,
    /// Whether whitespace should be ignored. When enabled, comments are
    /// also permitted.
    ignore_whitespace: Cell<bool>,
    /// A list of comments, in order of appearance.
    comments: RefCell<Vec<ast::Comment>>,
    /// A stack of grouped sub-expressions, including alternations.
    stack_group: RefCell<Vec<GroupState>>,
    /// A stack of nested character classes. This is only non-empty when
    /// parsing a class.
    stack_class: RefCell<Vec<ClassState>>,
    /// A sequence of capture names kept sorted by name. This is used to
    /// detect duplicate capture names (via binary search) and report an
    /// error if one is detected.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// A scratch buffer used in various places. Mostly this is used to
    /// accumulate relevant characters from parts of a pattern.
    scratch: RefCell<String>,
}
266
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser`'s internal state is not tied to
/// any one pattern, but a `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
#[derive(Clone, Debug)]
struct ParserI<'s, P> {
    /// The parser state/configuration. The parsing routines only require
    /// that this can be borrowed as a `Parser`.
    parser: P,
    /// The full regular expression provided by the user.
    pattern: &'s str,
}
283
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening
/// parenthesis or an alternation bar `|`.
#[derive(Clone, Debug)]
enum GroupState {
    /// This state is pushed whenever an opening group is found.
    Group {
        /// The concatenation immediately preceding the opening group.
        concat: ast::Concat,
        /// The group that has been opened. Its sub-AST is always empty.
        group: ast::Group,
        /// The whitespace-insensitivity (`x` flag) setting that was in
        /// effect immediately before this group was opened. It is restored
        /// when the group is popped.
        ignore_whitespace: bool,
    },
    /// This state is pushed whenever a new alternation branch is found. If
    /// an alternation branch is found and this state is at the top of the
    /// stack, then this state should be modified to include the new
    /// alternation.
    Alternation(ast::Alternation),
}
304
/// ClassState represents a single stack frame while parsing character classes.
/// Each frame records the state up to an intersection, difference, symmetric
/// difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive(Clone, Debug)]
enum ClassState {
    /// This state is pushed whenever an opening bracket is found.
    Open {
        /// The union of class items immediately preceding this class.
        union: ast::ClassSetUnion,
        /// The class that has been opened. Typically this just corresponds
        /// to the `[`, but it can also include `[^` since `^` indicates
        /// negation of the class.
        set: ast::ClassBracketed,
    },
    /// This state is pushed when an operator is seen. When popped, the
    /// stored set becomes the left hand side of the operator.
    Op {
        /// The type of the operation, i.e., &&, -- or ~~.
        kind: ast::ClassSetBinaryOpKind,
        /// The left-hand side of the operator.
        lhs: ast::ClassSet,
    },
}
331
332impl Parser {
333 /// Create a new parser with a default configuration.
334 ///
335 /// The parser can be run with either the `parse` or `parse_with_comments`
336 /// methods. The parse methods return an abstract syntax tree.
337 ///
338 /// To set configuration options on the parser, use [`ParserBuilder`].
339 pub fn new() -> Parser {
340 ParserBuilder::new().build()
341 }
342
343 /// Parse the regular expression into an abstract syntax tree.
344 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
345 ParserI::new(self, pattern).parse()
346 }
347
348 /// Parse the regular expression and return an abstract syntax tree with
349 /// all of the comments found in the pattern.
350 pub fn parse_with_comments(
351 &mut self,
352 pattern: &str,
353 ) -> Result<ast::WithComments> {
354 ParserI::new(self, pattern).parse_with_comments()
355 }
356
357 /// Reset the internal state of a parser.
358 ///
359 /// This is called at the beginning of every parse. This prevents the
360 /// parser from running with inconsistent state (say, if a previous
361 /// invocation returned an error and the parser is reused).
362 fn reset(&self) {
363 // These settings should be in line with the construction
364 // in `ParserBuilder::build`.
365 self.pos.set(Position { offset: 0, line: 1, column: 1 });
366 self.ignore_whitespace.set(self.initial_ignore_whitespace);
367 self.comments.borrow_mut().clear();
368 self.stack_group.borrow_mut().clear();
369 self.stack_class.borrow_mut().clear();
370 }
371}
372
373impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
374 /// Build an internal parser from a parser configuration and a pattern.
375 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
376 ParserI { parser, pattern }
377 }
378
379 /// Return a reference to the parser state.
380 fn parser(&self) -> &Parser {
381 self.parser.borrow()
382 }
383
384 /// Return a reference to the pattern being parsed.
385 fn pattern(&self) -> &str {
386 self.pattern
387 }
388
389 /// Create a new error with the given span and error type.
390 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
391 ast::Error { kind, pattern: self.pattern().to_string(), span }
392 }
393
394 /// Return the current offset of the parser.
395 ///
396 /// The offset starts at `0` from the beginning of the regular expression
397 /// pattern string.
398 fn offset(&self) -> usize {
399 self.parser().pos.get().offset
400 }
401
402 /// Return the current line number of the parser.
403 ///
404 /// The line number starts at `1`.
405 fn line(&self) -> usize {
406 self.parser().pos.get().line
407 }
408
409 /// Return the current column of the parser.
410 ///
411 /// The column number starts at `1` and is reset whenever a `\n` is seen.
412 fn column(&self) -> usize {
413 self.parser().pos.get().column
414 }
415
416 /// Return the next capturing index. Each subsequent call increments the
417 /// internal index.
418 ///
419 /// The span given should correspond to the location of the opening
420 /// parenthesis.
421 ///
422 /// If the capture limit is exceeded, then an error is returned.
423 fn next_capture_index(&self, span: Span) -> Result<u32> {
424 let current = self.parser().capture_index.get();
425 let i = current.checked_add(1).ok_or_else(|| {
426 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
427 })?;
428 self.parser().capture_index.set(i);
429 Ok(i)
430 }
431
432 /// Adds the given capture name to this parser. If this capture name has
433 /// already been used, then an error is returned.
434 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
435 let mut names = self.parser().capture_names.borrow_mut();
436 match names
437 .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
438 {
439 Err(i) => {
440 names.insert(i, cap.clone());
441 Ok(())
442 }
443 Ok(i) => Err(self.error(
444 cap.span,
445 ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
446 )),
447 }
448 }
449
450 /// Return whether the parser should ignore whitespace or not.
451 fn ignore_whitespace(&self) -> bool {
452 self.parser().ignore_whitespace.get()
453 }
454
455 /// Return the character at the current position of the parser.
456 ///
457 /// This panics if the current position does not point to a valid char.
458 fn char(&self) -> char {
459 self.char_at(self.offset())
460 }
461
462 /// Return the character at the given position.
463 ///
464 /// This panics if the given position does not point to a valid char.
465 fn char_at(&self, i: usize) -> char {
466 self.pattern()[i..]
467 .chars()
468 .next()
469 .unwrap_or_else(|| panic!("expected char at offset {}", i))
470 }
471
472 /// Bump the parser to the next Unicode scalar value.
473 ///
474 /// If the end of the input has been reached, then `false` is returned.
475 fn bump(&self) -> bool {
476 if self.is_eof() {
477 return false;
478 }
479 let Position { mut offset, mut line, mut column } = self.pos();
480 if self.char() == '\n' {
481 line = line.checked_add(1).unwrap();
482 column = 1;
483 } else {
484 column = column.checked_add(1).unwrap();
485 }
486 offset += self.char().len_utf8();
487 self.parser().pos.set(Position { offset, line, column });
488 self.pattern()[self.offset()..].chars().next().is_some()
489 }
490
491 /// If the substring starting at the current position of the parser has
492 /// the given prefix, then bump the parser to the character immediately
493 /// following the prefix and return true. Otherwise, don't bump the parser
494 /// and return false.
495 fn bump_if(&self, prefix: &str) -> bool {
496 if self.pattern()[self.offset()..].starts_with(prefix) {
497 for _ in 0..prefix.chars().count() {
498 self.bump();
499 }
500 true
501 } else {
502 false
503 }
504 }
505
506 /// Returns true if and only if the parser is positioned at a look-around
507 /// prefix. The conditions under which this returns true must always
508 /// correspond to a regular expression that would otherwise be consider
509 /// invalid.
510 ///
511 /// This should only be called immediately after parsing the opening of
512 /// a group or a set of flags.
513 fn is_lookaround_prefix(&self) -> bool {
514 self.bump_if("?=")
515 || self.bump_if("?!")
516 || self.bump_if("?<=")
517 || self.bump_if("?<!")
518 }
519
520 /// Bump the parser, and if the `x` flag is enabled, bump through any
521 /// subsequent spaces. Return true if and only if the parser is not at
522 /// EOF.
523 fn bump_and_bump_space(&self) -> bool {
524 if !self.bump() {
525 return false;
526 }
527 self.bump_space();
528 !self.is_eof()
529 }
530
531 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
532 /// comments), then this will advance the parser through all whitespace
533 /// and comments to the next non-whitespace non-comment byte.
534 ///
535 /// If the `x` flag is disabled, then this is a no-op.
536 ///
537 /// This should be used selectively throughout the parser where
538 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
539 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
540 fn bump_space(&self) {
541 if !self.ignore_whitespace() {
542 return;
543 }
544 while !self.is_eof() {
545 if self.char().is_whitespace() {
546 self.bump();
547 } else if self.char() == '#' {
548 let start = self.pos();
549 let mut comment_text = String::new();
550 self.bump();
551 while !self.is_eof() {
552 let c = self.char();
553 self.bump();
554 if c == '\n' {
555 break;
556 }
557 comment_text.push(c);
558 }
559 let comment = ast::Comment {
560 span: Span::new(start, self.pos()),
561 comment: comment_text,
562 };
563 self.parser().comments.borrow_mut().push(comment);
564 } else {
565 break;
566 }
567 }
568 }
569
570 /// Peek at the next character in the input without advancing the parser.
571 ///
572 /// If the input has been exhausted, then this returns `None`.
573 fn peek(&self) -> Option<char> {
574 if self.is_eof() {
575 return None;
576 }
577 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
578 }
579
580 /// Like peek, but will ignore spaces when the parser is in whitespace
581 /// insensitive mode.
582 fn peek_space(&self) -> Option<char> {
583 if !self.ignore_whitespace() {
584 return self.peek();
585 }
586 if self.is_eof() {
587 return None;
588 }
589 let mut start = self.offset() + self.char().len_utf8();
590 let mut in_comment = false;
591 for (i, c) in self.pattern()[start..].char_indices() {
592 if c.is_whitespace() {
593 continue;
594 } else if !in_comment && c == '#' {
595 in_comment = true;
596 } else if in_comment && c == '\n' {
597 in_comment = false;
598 } else {
599 start += i;
600 break;
601 }
602 }
603 self.pattern()[start..].chars().next()
604 }
605
606 /// Returns true if the next call to `bump` would return false.
607 fn is_eof(&self) -> bool {
608 self.offset() == self.pattern().len()
609 }
610
611 /// Return the current position of the parser, which includes the offset,
612 /// line and column.
613 fn pos(&self) -> Position {
614 self.parser().pos.get()
615 }
616
617 /// Create a span at the current position of the parser. Both the start
618 /// and end of the span are set.
619 fn span(&self) -> Span {
620 Span::splat(self.pos())
621 }
622
623 /// Create a span that covers the current character.
624 fn span_char(&self) -> Span {
625 let mut next = Position {
626 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
627 line: self.line(),
628 column: self.column().checked_add(1).unwrap(),
629 };
630 if self.char() == '\n' {
631 next.line += 1;
632 next.column = 1;
633 }
634 Span::new(self.pos(), next)
635 }
636
637 /// Parse and push a single alternation on to the parser's internal stack.
638 /// If the top of the stack already has an alternation, then add to that
639 /// instead of pushing a new one.
640 ///
641 /// The concatenation given corresponds to a single alternation branch.
642 /// The concatenation returned starts the next branch and is empty.
643 ///
644 /// This assumes the parser is currently positioned at `|` and will advance
645 /// the parser to the character following `|`.
646 #[inline(never)]
647 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
648 assert_eq!(self.char(), '|');
649 concat.span.end = self.pos();
650 self.push_or_add_alternation(concat);
651 self.bump();
652 Ok(ast::Concat { span: self.span(), asts: vec![] })
653 }
654
655 /// Pushes or adds the given branch of an alternation to the parser's
656 /// internal stack of state.
657 fn push_or_add_alternation(&self, concat: ast::Concat) {
658 use self::GroupState::*;
659
660 let mut stack = self.parser().stack_group.borrow_mut();
661 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
662 alts.asts.push(concat.into_ast());
663 return;
664 }
665 stack.push(Alternation(ast::Alternation {
666 span: Span::new(concat.span.start, self.pos()),
667 asts: vec![concat.into_ast()],
668 }));
669 }
670
671 /// Parse and push a group AST (and its parent concatenation) on to the
672 /// parser's internal stack. Return a fresh concatenation corresponding
673 /// to the group's sub-AST.
674 ///
675 /// If a set of flags was found (with no group), then the concatenation
676 /// is returned with that set of flags added.
677 ///
678 /// This assumes that the parser is currently positioned on the opening
679 /// parenthesis. It advances the parser to the character at the start
680 /// of the sub-expression (or adjoining expression).
681 ///
682 /// If there was a problem parsing the start of the group, then an error
683 /// is returned.
684 #[inline(never)]
685 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
686 assert_eq!(self.char(), '(');
687 match self.parse_group()? {
688 Either::Left(set) => {
689 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
690 if let Some(v) = ignore {
691 self.parser().ignore_whitespace.set(v);
692 }
693
694 concat.asts.push(Ast::flags(set));
695 Ok(concat)
696 }
697 Either::Right(group) => {
698 let old_ignore_whitespace = self.ignore_whitespace();
699 let new_ignore_whitespace = group
700 .flags()
701 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
702 .unwrap_or(old_ignore_whitespace);
703 self.parser().stack_group.borrow_mut().push(
704 GroupState::Group {
705 concat,
706 group,
707 ignore_whitespace: old_ignore_whitespace,
708 },
709 );
710 self.parser().ignore_whitespace.set(new_ignore_whitespace);
711 Ok(ast::Concat { span: self.span(), asts: vec![] })
712 }
713 }
714 }
715
716 /// Pop a group AST from the parser's internal stack and set the group's
717 /// AST to the given concatenation. Return the concatenation containing
718 /// the group.
719 ///
720 /// This assumes that the parser is currently positioned on the closing
721 /// parenthesis and advances the parser to the character following the `)`.
722 ///
723 /// If no such group could be popped, then an unopened group error is
724 /// returned.
725 #[inline(never)]
726 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
727 use self::GroupState::*;
728
729 assert_eq!(self.char(), ')');
730 let mut stack = self.parser().stack_group.borrow_mut();
731 let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
732 .pop()
733 {
734 Some(Group { concat, group, ignore_whitespace }) => {
735 (concat, group, ignore_whitespace, None)
736 }
737 Some(Alternation(alt)) => match stack.pop() {
738 Some(Group { concat, group, ignore_whitespace }) => {
739 (concat, group, ignore_whitespace, Some(alt))
740 }
741 None | Some(Alternation(_)) => {
742 return Err(self.error(
743 self.span_char(),
744 ast::ErrorKind::GroupUnopened,
745 ));
746 }
747 },
748 None => {
749 return Err(self
750 .error(self.span_char(), ast::ErrorKind::GroupUnopened));
751 }
752 };
753 self.parser().ignore_whitespace.set(ignore_whitespace);
754 group_concat.span.end = self.pos();
755 self.bump();
756 group.span.end = self.pos();
757 match alt {
758 Some(mut alt) => {
759 alt.span.end = group_concat.span.end;
760 alt.asts.push(group_concat.into_ast());
761 group.ast = Box::new(alt.into_ast());
762 }
763 None => {
764 group.ast = Box::new(group_concat.into_ast());
765 }
766 }
767 prior_concat.asts.push(Ast::group(group));
768 Ok(prior_concat)
769 }
770
771 /// Pop the last state from the parser's internal stack, if it exists, and
772 /// add the given concatenation to it. There either must be no state or a
773 /// single alternation item on the stack. Any other scenario produces an
774 /// error.
775 ///
776 /// This assumes that the parser has advanced to the end.
777 #[inline(never)]
778 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
779 concat.span.end = self.pos();
780 let mut stack = self.parser().stack_group.borrow_mut();
781 let ast = match stack.pop() {
782 None => Ok(concat.into_ast()),
783 Some(GroupState::Alternation(mut alt)) => {
784 alt.span.end = self.pos();
785 alt.asts.push(concat.into_ast());
786 Ok(Ast::alternation(alt))
787 }
788 Some(GroupState::Group { group, .. }) => {
789 return Err(
790 self.error(group.span, ast::ErrorKind::GroupUnclosed)
791 );
792 }
793 };
794 // If we try to pop again, there should be nothing.
795 match stack.pop() {
796 None => ast,
797 Some(GroupState::Alternation(_)) => {
798 // This unreachable is unfortunate. This case can't happen
799 // because the only way we can be here is if there were two
800 // `GroupState::Alternation`s adjacent in the parser's stack,
801 // which we guarantee to never happen because we never push a
802 // `GroupState::Alternation` if one is already at the top of
803 // the stack.
804 unreachable!()
805 }
806 Some(GroupState::Group { group, .. }) => {
807 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
808 }
809 }
810 }
811
812 /// Parse the opening of a character class and push the current class
813 /// parsing context onto the parser's stack. This assumes that the parser
814 /// is positioned at an opening `[`. The given union should correspond to
815 /// the union of set items built up before seeing the `[`.
816 ///
817 /// If there was a problem parsing the opening of the class, then an error
818 /// is returned. Otherwise, a new union of set items for the class is
819 /// returned (which may be populated with either a `]` or a `-`).
820 #[inline(never)]
821 fn push_class_open(
822 &self,
823 parent_union: ast::ClassSetUnion,
824 ) -> Result<ast::ClassSetUnion> {
825 assert_eq!(self.char(), '[');
826
827 let (nested_set, nested_union) = self.parse_set_class_open()?;
828 self.parser()
829 .stack_class
830 .borrow_mut()
831 .push(ClassState::Open { union: parent_union, set: nested_set });
832 Ok(nested_union)
833 }
834
835 /// Parse the end of a character class set and pop the character class
836 /// parser stack. The union given corresponds to the last union built
837 /// before seeing the closing `]`. The union returned corresponds to the
838 /// parent character class set with the nested class added to it.
839 ///
840 /// This assumes that the parser is positioned at a `]` and will advance
841 /// the parser to the byte immediately following the `]`.
842 ///
843 /// If the stack is empty after popping, then this returns the final
844 /// "top-level" character class AST (where a "top-level" character class
845 /// is one that is not nested inside any other character class).
846 ///
847 /// If there is no corresponding opening bracket on the parser's stack,
848 /// then an error is returned.
849 #[inline(never)]
850 fn pop_class(
851 &self,
852 nested_union: ast::ClassSetUnion,
853 ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> {
854 assert_eq!(self.char(), ']');
855
856 let item = ast::ClassSet::Item(nested_union.into_item());
857 let prevset = self.pop_class_op(item);
858 let mut stack = self.parser().stack_class.borrow_mut();
859 match stack.pop() {
860 None => {
861 // We can never observe an empty stack:
862 //
863 // 1) We are guaranteed to start with a non-empty stack since
864 // the character class parser is only initiated when it sees
865 // a `[`.
866 // 2) If we ever observe an empty stack while popping after
867 // seeing a `]`, then we signal the character class parser
868 // to terminate.
869 panic!("unexpected empty character class stack")
870 }
871 Some(ClassState::Op { .. }) => {
872 // This panic is unfortunate, but this case is impossible
873 // since we already popped the Op state if one exists above.
874 // Namely, every push to the class parser stack is guarded by
875 // whether an existing Op is already on the top of the stack.
876 // If it is, the existing Op is modified. That is, the stack
877 // can never have consecutive Op states.
878 panic!("unexpected ClassState::Op")
879 }
880 Some(ClassState::Open { mut union, mut set }) => {
881 self.bump();
882 set.span.end = self.pos();
883 set.kind = prevset;
884 if stack.is_empty() {
885 Ok(Either::Right(set))
886 } else {
887 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
888 Ok(Either::Left(union))
889 }
890 }
891 }
892 }
893
894 /// Return an "unclosed class" error whose span points to the most
895 /// recently opened class.
896 ///
897 /// This should only be called while parsing a character class.
898 #[inline(never)]
899 fn unclosed_class_error(&self) -> ast::Error {
900 for state in self.parser().stack_class.borrow().iter().rev() {
901 if let ClassState::Open { ref set, .. } = *state {
902 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
903 }
904 }
905 // We are guaranteed to have a non-empty stack with at least
906 // one open bracket, so we should never get here.
907 panic!("no open character class found")
908 }
909
910 /// Push the current set of class items on to the class parser's stack as
911 /// the left hand side of the given operator.
912 ///
913 /// A fresh set union is returned, which should be used to build the right
914 /// hand side of this operator.
915 #[inline(never)]
916 fn push_class_op(
917 &self,
918 next_kind: ast::ClassSetBinaryOpKind,
919 next_union: ast::ClassSetUnion,
920 ) -> ast::ClassSetUnion {
921 let item = ast::ClassSet::Item(next_union.into_item());
922 let new_lhs = self.pop_class_op(item);
923 self.parser()
924 .stack_class
925 .borrow_mut()
926 .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
927 ast::ClassSetUnion { span: self.span(), items: vec![] }
928 }
929
930 /// Pop a character class set from the character class parser stack. If the
931 /// top of the stack is just an item (not an operation), then return the
932 /// given set unchanged. If the top of the stack is an operation, then the
933 /// given set will be used as the rhs of the operation on the top of the
934 /// stack. In that case, the binary operation is returned as a set.
935 #[inline(never)]
936 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
937 let mut stack = self.parser().stack_class.borrow_mut();
938 let (kind, lhs) = match stack.pop() {
939 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
940 Some(state @ ClassState::Open { .. }) => {
941 stack.push(state);
942 return rhs;
943 }
944 None => unreachable!(),
945 };
946 let span = Span::new(lhs.span().start, rhs.span().end);
947 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
948 span,
949 kind,
950 lhs: Box::new(lhs),
951 rhs: Box::new(rhs),
952 })
953 }
954}
955
956impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
957 /// Parse the regular expression into an abstract syntax tree.
958 fn parse(&self) -> Result<Ast> {
959 self.parse_with_comments().map(|astc| astc.ast)
960 }
961
962 /// Parse the regular expression and return an abstract syntax tree with
963 /// all of the comments found in the pattern.
964 fn parse_with_comments(&self) -> Result<ast::WithComments> {
965 assert_eq!(self.offset(), 0, "parser can only be used once");
966 self.parser().reset();
967 let mut concat = ast::Concat { span: self.span(), asts: vec![] };
968 loop {
969 self.bump_space();
970 if self.is_eof() {
971 break;
972 }
973 match self.char() {
974 '(' => concat = self.push_group(concat)?,
975 ')' => concat = self.pop_group(concat)?,
976 '|' => concat = self.push_alternate(concat)?,
977 '[' => {
978 let class = self.parse_set_class()?;
979 concat.asts.push(Ast::class_bracketed(class));
980 }
981 '?' => {
982 concat = self.parse_uncounted_repetition(
983 concat,
984 ast::RepetitionKind::ZeroOrOne,
985 )?;
986 }
987 '*' => {
988 concat = self.parse_uncounted_repetition(
989 concat,
990 ast::RepetitionKind::ZeroOrMore,
991 )?;
992 }
993 '+' => {
994 concat = self.parse_uncounted_repetition(
995 concat,
996 ast::RepetitionKind::OneOrMore,
997 )?;
998 }
999 '{' => {
1000 concat = self.parse_counted_repetition(concat)?;
1001 }
1002 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1003 }
1004 }
1005 let ast = self.pop_group_end(concat)?;
1006 NestLimiter::new(self).check(&ast)?;
1007 Ok(ast::WithComments {
1008 ast,
1009 comments: mem::replace(
1010 &mut *self.parser().comments.borrow_mut(),
1011 vec![],
1012 ),
1013 })
1014 }
1015
1016 /// Parses an uncounted repetition operation. An uncounted repetition
1017 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1018 /// The given `kind` should correspond to the operator observed by the
1019 /// caller.
1020 ///
1021 /// This assumes that the parser is currently positioned at the repetition
1022 /// operator and advances the parser to the first character after the
1023 /// operator. (Note that the operator may include a single additional `?`,
1024 /// which makes the operator ungreedy.)
1025 ///
1026 /// The caller should include the concatenation that is being built. The
1027 /// concatenation returned includes the repetition operator applied to the
1028 /// last expression in the given concatenation.
1029 #[inline(never)]
1030 fn parse_uncounted_repetition(
1031 &self,
1032 mut concat: ast::Concat,
1033 kind: ast::RepetitionKind,
1034 ) -> Result<ast::Concat> {
1035 assert!(
1036 self.char() == '?' || self.char() == '*' || self.char() == '+'
1037 );
1038 let op_start = self.pos();
1039 let ast = match concat.asts.pop() {
1040 Some(ast) => ast,
1041 None => {
1042 return Err(
1043 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1044 )
1045 }
1046 };
1047 match ast {
1048 Ast::Empty(_) | Ast::Flags(_) => {
1049 return Err(
1050 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1051 )
1052 }
1053 _ => {}
1054 }
1055 let mut greedy = true;
1056 if self.bump() && self.char() == '?' {
1057 greedy = false;
1058 self.bump();
1059 }
1060 concat.asts.push(Ast::repetition(ast::Repetition {
1061 span: ast.span().with_end(self.pos()),
1062 op: ast::RepetitionOp {
1063 span: Span::new(op_start, self.pos()),
1064 kind,
1065 },
1066 greedy,
1067 ast: Box::new(ast),
1068 }));
1069 Ok(concat)
1070 }
1071
1072 /// Parses a counted repetition operation. A counted repetition operator
1073 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1074 /// operators.
1075 ///
1076 /// This assumes that the parser is currently positioned at the opening `{`
1077 /// and advances the parser to the first character after the operator.
1078 /// (Note that the operator may include a single additional `?`, which
1079 /// makes the operator ungreedy.)
1080 ///
1081 /// The caller should include the concatenation that is being built. The
1082 /// concatenation returned includes the repetition operator applied to the
1083 /// last expression in the given concatenation.
1084 #[inline(never)]
1085 fn parse_counted_repetition(
1086 &self,
1087 mut concat: ast::Concat,
1088 ) -> Result<ast::Concat> {
1089 assert!(self.char() == '{');
1090 let start = self.pos();
1091 let ast = match concat.asts.pop() {
1092 Some(ast) => ast,
1093 None => {
1094 return Err(
1095 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1096 )
1097 }
1098 };
1099 match ast {
1100 Ast::Empty(_) | Ast::Flags(_) => {
1101 return Err(
1102 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1103 )
1104 }
1105 _ => {}
1106 }
1107 if !self.bump_and_bump_space() {
1108 return Err(self.error(
1109 Span::new(start, self.pos()),
1110 ast::ErrorKind::RepetitionCountUnclosed,
1111 ));
1112 }
1113 let count_start = specialize_err(
1114 self.parse_decimal(),
1115 ast::ErrorKind::DecimalEmpty,
1116 ast::ErrorKind::RepetitionCountDecimalEmpty,
1117 )?;
1118 let mut range = ast::RepetitionRange::Exactly(count_start);
1119 if self.is_eof() {
1120 return Err(self.error(
1121 Span::new(start, self.pos()),
1122 ast::ErrorKind::RepetitionCountUnclosed,
1123 ));
1124 }
1125 if self.char() == ',' {
1126 if !self.bump_and_bump_space() {
1127 return Err(self.error(
1128 Span::new(start, self.pos()),
1129 ast::ErrorKind::RepetitionCountUnclosed,
1130 ));
1131 }
1132 if self.char() != '}' {
1133 let count_end = specialize_err(
1134 self.parse_decimal(),
1135 ast::ErrorKind::DecimalEmpty,
1136 ast::ErrorKind::RepetitionCountDecimalEmpty,
1137 )?;
1138 range = ast::RepetitionRange::Bounded(count_start, count_end);
1139 } else {
1140 range = ast::RepetitionRange::AtLeast(count_start);
1141 }
1142 }
1143 if self.is_eof() || self.char() != '}' {
1144 return Err(self.error(
1145 Span::new(start, self.pos()),
1146 ast::ErrorKind::RepetitionCountUnclosed,
1147 ));
1148 }
1149
1150 let mut greedy = true;
1151 if self.bump_and_bump_space() && self.char() == '?' {
1152 greedy = false;
1153 self.bump();
1154 }
1155
1156 let op_span = Span::new(start, self.pos());
1157 if !range.is_valid() {
1158 return Err(
1159 self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1160 );
1161 }
1162 concat.asts.push(Ast::repetition(ast::Repetition {
1163 span: ast.span().with_end(self.pos()),
1164 op: ast::RepetitionOp {
1165 span: op_span,
1166 kind: ast::RepetitionKind::Range(range),
1167 },
1168 greedy,
1169 ast: Box::new(ast),
1170 }));
1171 Ok(concat)
1172 }
1173
1174 /// Parse a group (which contains a sub-expression) or a set of flags.
1175 ///
1176 /// If a group was found, then it is returned with an empty AST. If a set
1177 /// of flags is found, then that set is returned.
1178 ///
1179 /// The parser should be positioned at the opening parenthesis.
1180 ///
1181 /// This advances the parser to the character before the start of the
1182 /// sub-expression (in the case of a group) or to the closing parenthesis
1183 /// immediately following the set of flags.
1184 ///
1185 /// # Errors
1186 ///
1187 /// If flags are given and incorrectly specified, then a corresponding
1188 /// error is returned.
1189 ///
1190 /// If a capture name is given and it is incorrectly specified, then a
1191 /// corresponding error is returned.
1192 #[inline(never)]
1193 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1194 assert_eq!(self.char(), '(');
1195 let open_span = self.span_char();
1196 self.bump();
1197 self.bump_space();
1198 if self.is_lookaround_prefix() {
1199 return Err(self.error(
1200 Span::new(open_span.start, self.span().end),
1201 ast::ErrorKind::UnsupportedLookAround,
1202 ));
1203 }
1204 let inner_span = self.span();
1205 let mut starts_with_p = true;
1206 if self.bump_if("?P<") || {
1207 starts_with_p = false;
1208 self.bump_if("?<")
1209 } {
1210 let capture_index = self.next_capture_index(open_span)?;
1211 let name = self.parse_capture_name(capture_index)?;
1212 Ok(Either::Right(ast::Group {
1213 span: open_span,
1214 kind: ast::GroupKind::CaptureName { starts_with_p, name },
1215 ast: Box::new(Ast::empty(self.span())),
1216 }))
1217 } else if self.bump_if("?") {
1218 if self.is_eof() {
1219 return Err(
1220 self.error(open_span, ast::ErrorKind::GroupUnclosed)
1221 );
1222 }
1223 let flags = self.parse_flags()?;
1224 let char_end = self.char();
1225 self.bump();
1226 if char_end == ')' {
1227 // We don't allow empty flags, e.g., `(?)`. We instead
1228 // interpret it as a repetition operator missing its argument.
1229 if flags.items.is_empty() {
1230 return Err(self.error(
1231 inner_span,
1232 ast::ErrorKind::RepetitionMissing,
1233 ));
1234 }
1235 Ok(Either::Left(ast::SetFlags {
1236 span: Span { end: self.pos(), ..open_span },
1237 flags,
1238 }))
1239 } else {
1240 assert_eq!(char_end, ':');
1241 Ok(Either::Right(ast::Group {
1242 span: open_span,
1243 kind: ast::GroupKind::NonCapturing(flags),
1244 ast: Box::new(Ast::empty(self.span())),
1245 }))
1246 }
1247 } else {
1248 let capture_index = self.next_capture_index(open_span)?;
1249 Ok(Either::Right(ast::Group {
1250 span: open_span,
1251 kind: ast::GroupKind::CaptureIndex(capture_index),
1252 ast: Box::new(Ast::empty(self.span())),
1253 }))
1254 }
1255 }
1256
1257 /// Parses a capture group name. Assumes that the parser is positioned at
1258 /// the first character in the name following the opening `<` (and may
1259 /// possibly be EOF). This advances the parser to the first character
1260 /// following the closing `>`.
1261 ///
1262 /// The caller must provide the capture index of the group for this name.
1263 #[inline(never)]
1264 fn parse_capture_name(
1265 &self,
1266 capture_index: u32,
1267 ) -> Result<ast::CaptureName> {
1268 if self.is_eof() {
1269 return Err(self
1270 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1271 }
1272 let start = self.pos();
1273 loop {
1274 if self.char() == '>' {
1275 break;
1276 }
1277 if !is_capture_char(self.char(), self.pos() == start) {
1278 return Err(self.error(
1279 self.span_char(),
1280 ast::ErrorKind::GroupNameInvalid,
1281 ));
1282 }
1283 if !self.bump() {
1284 break;
1285 }
1286 }
1287 let end = self.pos();
1288 if self.is_eof() {
1289 return Err(self
1290 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1291 }
1292 assert_eq!(self.char(), '>');
1293 self.bump();
1294 let name = &self.pattern()[start.offset..end.offset];
1295 if name.is_empty() {
1296 return Err(self.error(
1297 Span::new(start, start),
1298 ast::ErrorKind::GroupNameEmpty,
1299 ));
1300 }
1301 let capname = ast::CaptureName {
1302 span: Span::new(start, end),
1303 name: name.to_string(),
1304 index: capture_index,
1305 };
1306 self.add_capture_name(&capname)?;
1307 Ok(capname)
1308 }
1309
1310 /// Parse a sequence of flags starting at the current character.
1311 ///
1312 /// This advances the parser to the character immediately following the
1313 /// flags, which is guaranteed to be either `:` or `)`.
1314 ///
1315 /// # Errors
1316 ///
1317 /// If any flags are duplicated, then an error is returned.
1318 ///
1319 /// If the negation operator is used more than once, then an error is
1320 /// returned.
1321 ///
1322 /// If no flags could be found or if the negation operation is not followed
1323 /// by any flags, then an error is returned.
1324 #[inline(never)]
1325 fn parse_flags(&self) -> Result<ast::Flags> {
1326 let mut flags = ast::Flags { span: self.span(), items: vec![] };
1327 let mut last_was_negation = None;
1328 while self.char() != ':' && self.char() != ')' {
1329 if self.char() == '-' {
1330 last_was_negation = Some(self.span_char());
1331 let item = ast::FlagsItem {
1332 span: self.span_char(),
1333 kind: ast::FlagsItemKind::Negation,
1334 };
1335 if let Some(i) = flags.add_item(item) {
1336 return Err(self.error(
1337 self.span_char(),
1338 ast::ErrorKind::FlagRepeatedNegation {
1339 original: flags.items[i].span,
1340 },
1341 ));
1342 }
1343 } else {
1344 last_was_negation = None;
1345 let item = ast::FlagsItem {
1346 span: self.span_char(),
1347 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1348 };
1349 if let Some(i) = flags.add_item(item) {
1350 return Err(self.error(
1351 self.span_char(),
1352 ast::ErrorKind::FlagDuplicate {
1353 original: flags.items[i].span,
1354 },
1355 ));
1356 }
1357 }
1358 if !self.bump() {
1359 return Err(
1360 self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1361 );
1362 }
1363 }
1364 if let Some(span) = last_was_negation {
1365 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1366 }
1367 flags.span.end = self.pos();
1368 Ok(flags)
1369 }
1370
1371 /// Parse the current character as a flag. Do not advance the parser.
1372 ///
1373 /// # Errors
1374 ///
1375 /// If the flag is not recognized, then an error is returned.
1376 #[inline(never)]
1377 fn parse_flag(&self) -> Result<ast::Flag> {
1378 match self.char() {
1379 'i' => Ok(ast::Flag::CaseInsensitive),
1380 'm' => Ok(ast::Flag::MultiLine),
1381 's' => Ok(ast::Flag::DotMatchesNewLine),
1382 'U' => Ok(ast::Flag::SwapGreed),
1383 'u' => Ok(ast::Flag::Unicode),
1384 'R' => Ok(ast::Flag::CRLF),
1385 'x' => Ok(ast::Flag::IgnoreWhitespace),
1386 _ => {
1387 Err(self
1388 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1389 }
1390 }
1391 }
1392
1393 /// Parse a primitive AST. e.g., A literal, non-set character class or
1394 /// assertion.
1395 ///
1396 /// This assumes that the parser expects a primitive at the current
1397 /// location. i.e., All other non-primitive cases have been handled.
1398 /// For example, if the parser's position is at `|`, then `|` will be
1399 /// treated as a literal (e.g., inside a character class).
1400 ///
1401 /// This advances the parser to the first character immediately following
1402 /// the primitive.
1403 fn parse_primitive(&self) -> Result<Primitive> {
1404 match self.char() {
1405 '\\' => self.parse_escape(),
1406 '.' => {
1407 let ast = Primitive::Dot(self.span_char());
1408 self.bump();
1409 Ok(ast)
1410 }
1411 '^' => {
1412 let ast = Primitive::Assertion(ast::Assertion {
1413 span: self.span_char(),
1414 kind: ast::AssertionKind::StartLine,
1415 });
1416 self.bump();
1417 Ok(ast)
1418 }
1419 '$' => {
1420 let ast = Primitive::Assertion(ast::Assertion {
1421 span: self.span_char(),
1422 kind: ast::AssertionKind::EndLine,
1423 });
1424 self.bump();
1425 Ok(ast)
1426 }
1427 c => {
1428 let ast = Primitive::Literal(ast::Literal {
1429 span: self.span_char(),
1430 kind: ast::LiteralKind::Verbatim,
1431 c,
1432 });
1433 self.bump();
1434 Ok(ast)
1435 }
1436 }
1437 }
1438
1439 /// Parse an escape sequence as a primitive AST.
1440 ///
1441 /// This assumes the parser is positioned at the start of the escape
1442 /// sequence, i.e., `\`. It advances the parser to the first position
1443 /// immediately following the escape sequence.
1444 #[inline(never)]
1445 fn parse_escape(&self) -> Result<Primitive> {
1446 assert_eq!(self.char(), '\\');
1447 let start = self.pos();
1448 if !self.bump() {
1449 return Err(self.error(
1450 Span::new(start, self.pos()),
1451 ast::ErrorKind::EscapeUnexpectedEof,
1452 ));
1453 }
1454 let c = self.char();
1455 // Put some of the more complicated routines into helpers.
1456 match c {
1457 '0'..='7' => {
1458 if !self.parser().octal {
1459 return Err(self.error(
1460 Span::new(start, self.span_char().end),
1461 ast::ErrorKind::UnsupportedBackreference,
1462 ));
1463 }
1464 let mut lit = self.parse_octal();
1465 lit.span.start = start;
1466 return Ok(Primitive::Literal(lit));
1467 }
1468 '8'..='9' if !self.parser().octal => {
1469 return Err(self.error(
1470 Span::new(start, self.span_char().end),
1471 ast::ErrorKind::UnsupportedBackreference,
1472 ));
1473 }
1474 'x' | 'u' | 'U' => {
1475 let mut lit = self.parse_hex()?;
1476 lit.span.start = start;
1477 return Ok(Primitive::Literal(lit));
1478 }
1479 'p' | 'P' => {
1480 let mut cls = self.parse_unicode_class()?;
1481 cls.span.start = start;
1482 return Ok(Primitive::Unicode(cls));
1483 }
1484 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1485 let mut cls = self.parse_perl_class();
1486 cls.span.start = start;
1487 return Ok(Primitive::Perl(cls));
1488 }
1489 _ => {}
1490 }
1491
1492 // Handle all of the one letter sequences inline.
1493 self.bump();
1494 let span = Span::new(start, self.pos());
1495 if is_meta_character(c) {
1496 return Ok(Primitive::Literal(ast::Literal {
1497 span,
1498 kind: ast::LiteralKind::Meta,
1499 c,
1500 }));
1501 }
1502 if is_escapeable_character(c) {
1503 return Ok(Primitive::Literal(ast::Literal {
1504 span,
1505 kind: ast::LiteralKind::Superfluous,
1506 c,
1507 }));
1508 }
1509 let special = |kind, c| {
1510 Ok(Primitive::Literal(ast::Literal {
1511 span,
1512 kind: ast::LiteralKind::Special(kind),
1513 c,
1514 }))
1515 };
1516 match c {
1517 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1518 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1519 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1520 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1521 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1522 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1523 'A' => Ok(Primitive::Assertion(ast::Assertion {
1524 span,
1525 kind: ast::AssertionKind::StartText,
1526 })),
1527 'z' => Ok(Primitive::Assertion(ast::Assertion {
1528 span,
1529 kind: ast::AssertionKind::EndText,
1530 })),
1531 'b' => {
1532 let mut wb = ast::Assertion {
1533 span,
1534 kind: ast::AssertionKind::WordBoundary,
1535 };
1536 // After a \b, we "try" to parse things like \b{start} for
1537 // special word boundary assertions.
1538 if !self.is_eof() && self.char() == '{' {
1539 if let Some(kind) =
1540 self.maybe_parse_special_word_boundary(start)?
1541 {
1542 wb.kind = kind;
1543 wb.span.end = self.pos();
1544 }
1545 }
1546 Ok(Primitive::Assertion(wb))
1547 }
1548 'B' => Ok(Primitive::Assertion(ast::Assertion {
1549 span,
1550 kind: ast::AssertionKind::NotWordBoundary,
1551 })),
1552 '<' => Ok(Primitive::Assertion(ast::Assertion {
1553 span,
1554 kind: ast::AssertionKind::WordBoundaryStartAngle,
1555 })),
1556 '>' => Ok(Primitive::Assertion(ast::Assertion {
1557 span,
1558 kind: ast::AssertionKind::WordBoundaryEndAngle,
1559 })),
1560 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1561 }
1562 }
1563
1564 /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
1565 /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
1566 ///
1567 /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
1568 /// if it fails it will just return `None` with no error. This is done
1569 /// because `\b{5}` is a valid expression and we want to let that be parsed
1570 /// by the existing counted repetition parsing code. (I thought about just
1571 /// invoking the counted repetition code from here, but it seemed a little
1572 /// ham-fisted.)
1573 ///
1574 /// Unlike `maybe_parse_ascii_class` though, this can return an error.
1575 /// Namely, if we definitely know it isn't a counted repetition, then we
1576 /// return an error specific to the specialty word boundaries.
1577 ///
1578 /// This assumes the parser is positioned at a `{` immediately following
1579 /// a `\b`. When `None` is returned, the parser is returned to the position
1580 /// at which it started: pointing at a `{`.
1581 ///
1582 /// The position given should correspond to the start of the `\b`.
1583 fn maybe_parse_special_word_boundary(
1584 &self,
1585 wb_start: Position,
1586 ) -> Result<Option<ast::AssertionKind>> {
1587 assert_eq!(self.char(), '{');
1588
1589 let is_valid_char = |c| match c {
1590 'A'..='Z' | 'a'..='z' | '-' => true,
1591 _ => false,
1592 };
1593 let start = self.pos();
1594 if !self.bump_and_bump_space() {
1595 return Err(self.error(
1596 Span::new(wb_start, self.pos()),
1597 ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
1598 ));
1599 }
1600 let start_contents = self.pos();
1601 // This is one of the critical bits: if the first non-whitespace
1602 // character isn't in [-A-Za-z] (i.e., this can't be a special word
1603 // boundary), then we bail and let the counted repetition parser deal
1604 // with this.
1605 if !is_valid_char(self.char()) {
1606 self.parser().pos.set(start);
1607 return Ok(None);
1608 }
1609
1610 // Now collect up our chars until we see a '}'.
1611 let mut scratch = self.parser().scratch.borrow_mut();
1612 scratch.clear();
1613 while !self.is_eof() && is_valid_char(self.char()) {
1614 scratch.push(self.char());
1615 self.bump_and_bump_space();
1616 }
1617 if self.is_eof() || self.char() != '}' {
1618 return Err(self.error(
1619 Span::new(start, self.pos()),
1620 ast::ErrorKind::SpecialWordBoundaryUnclosed,
1621 ));
1622 }
1623 let end = self.pos();
1624 self.bump();
1625 let kind = match scratch.as_str() {
1626 "start" => ast::AssertionKind::WordBoundaryStart,
1627 "end" => ast::AssertionKind::WordBoundaryEnd,
1628 "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
1629 "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
1630 _ => {
1631 return Err(self.error(
1632 Span::new(start_contents, end),
1633 ast::ErrorKind::SpecialWordBoundaryUnrecognized,
1634 ))
1635 }
1636 };
1637 Ok(Some(kind))
1638 }
1639
1640 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1641 /// long. This expects the parser to be positioned at the first octal
1642 /// digit and advances the parser to the first character immediately
1643 /// following the octal number. This also assumes that parsing octal
1644 /// escapes is enabled.
1645 ///
1646 /// Assuming the preconditions are met, this routine can never fail.
1647 #[inline(never)]
1648 fn parse_octal(&self) -> ast::Literal {
1649 assert!(self.parser().octal);
1650 assert!('0' <= self.char() && self.char() <= '7');
1651 let start = self.pos();
1652 // Parse up to two more digits.
1653 while self.bump()
1654 && '0' <= self.char()
1655 && self.char() <= '7'
1656 && self.pos().offset - start.offset <= 2
1657 {}
1658 let end = self.pos();
1659 let octal = &self.pattern()[start.offset..end.offset];
1660 // Parsing the octal should never fail since the above guarantees a
1661 // valid number.
1662 let codepoint =
1663 u32::from_str_radix(octal, 8).expect("valid octal number");
1664 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1665 // invalid Unicode scalar values.
1666 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1667 ast::Literal {
1668 span: Span::new(start, end),
1669 kind: ast::LiteralKind::Octal,
1670 c,
1671 }
1672 }
1673
1674 /// Parse a hex representation of a Unicode codepoint. This handles both
1675 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1676 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1677 /// the first character immediately following the hexadecimal literal.
1678 #[inline(never)]
1679 fn parse_hex(&self) -> Result<ast::Literal> {
1680 assert!(
1681 self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1682 );
1683
1684 let hex_kind = match self.char() {
1685 'x' => ast::HexLiteralKind::X,
1686 'u' => ast::HexLiteralKind::UnicodeShort,
1687 _ => ast::HexLiteralKind::UnicodeLong,
1688 };
1689 if !self.bump_and_bump_space() {
1690 return Err(
1691 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1692 );
1693 }
1694 if self.char() == '{' {
1695 self.parse_hex_brace(hex_kind)
1696 } else {
1697 self.parse_hex_digits(hex_kind)
1698 }
1699 }
1700
1701 /// Parse an N-digit hex representation of a Unicode codepoint. This
1702 /// expects the parser to be positioned at the first digit and will advance
1703 /// the parser to the first character immediately following the escape
1704 /// sequence.
1705 ///
1706 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1707 /// or 8 (for `\UNNNNNNNN`).
1708 #[inline(never)]
1709 fn parse_hex_digits(
1710 &self,
1711 kind: ast::HexLiteralKind,
1712 ) -> Result<ast::Literal> {
1713 let mut scratch = self.parser().scratch.borrow_mut();
1714 scratch.clear();
1715
1716 let start = self.pos();
1717 for i in 0..kind.digits() {
1718 if i > 0 && !self.bump_and_bump_space() {
1719 return Err(self
1720 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1721 }
1722 if !is_hex(self.char()) {
1723 return Err(self.error(
1724 self.span_char(),
1725 ast::ErrorKind::EscapeHexInvalidDigit,
1726 ));
1727 }
1728 scratch.push(self.char());
1729 }
1730 // The final bump just moves the parser past the literal, which may
1731 // be EOF.
1732 self.bump_and_bump_space();
1733 let end = self.pos();
1734 let hex = scratch.as_str();
1735 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1736 None => Err(self.error(
1737 Span::new(start, end),
1738 ast::ErrorKind::EscapeHexInvalid,
1739 )),
1740 Some(c) => Ok(ast::Literal {
1741 span: Span::new(start, end),
1742 kind: ast::LiteralKind::HexFixed(kind),
1743 c,
1744 }),
1745 }
1746 }
1747
1748 /// Parse a hex representation of any Unicode scalar value. This expects
1749 /// the parser to be positioned at the opening brace `{` and will advance
1750 /// the parser to the first character following the closing brace `}`.
1751 #[inline(never)]
1752 fn parse_hex_brace(
1753 &self,
1754 kind: ast::HexLiteralKind,
1755 ) -> Result<ast::Literal> {
1756 let mut scratch = self.parser().scratch.borrow_mut();
1757 scratch.clear();
1758
1759 let brace_pos = self.pos();
1760 let start = self.span_char().end;
1761 while self.bump_and_bump_space() && self.char() != '}' {
1762 if !is_hex(self.char()) {
1763 return Err(self.error(
1764 self.span_char(),
1765 ast::ErrorKind::EscapeHexInvalidDigit,
1766 ));
1767 }
1768 scratch.push(self.char());
1769 }
1770 if self.is_eof() {
1771 return Err(self.error(
1772 Span::new(brace_pos, self.pos()),
1773 ast::ErrorKind::EscapeUnexpectedEof,
1774 ));
1775 }
1776 let end = self.pos();
1777 let hex = scratch.as_str();
1778 assert_eq!(self.char(), '}');
1779 self.bump_and_bump_space();
1780
1781 if hex.is_empty() {
1782 return Err(self.error(
1783 Span::new(brace_pos, self.pos()),
1784 ast::ErrorKind::EscapeHexEmpty,
1785 ));
1786 }
1787 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1788 None => Err(self.error(
1789 Span::new(start, end),
1790 ast::ErrorKind::EscapeHexInvalid,
1791 )),
1792 Some(c) => Ok(ast::Literal {
1793 span: Span::new(start, self.pos()),
1794 kind: ast::LiteralKind::HexBrace(kind),
1795 c,
1796 }),
1797 }
1798 }
1799
1800 /// Parse a decimal number into a u32 while trimming leading and trailing
1801 /// whitespace.
1802 ///
1803 /// This expects the parser to be positioned at the first position where
1804 /// a decimal digit could occur. This will advance the parser to the byte
1805 /// immediately following the last contiguous decimal digit.
1806 ///
1807 /// If no decimal digit could be found or if there was a problem parsing
1808 /// the complete set of digits into a u32, then an error is returned.
1809 fn parse_decimal(&self) -> Result<u32> {
1810 let mut scratch = self.parser().scratch.borrow_mut();
1811 scratch.clear();
1812
1813 while !self.is_eof() && self.char().is_whitespace() {
1814 self.bump();
1815 }
1816 let start = self.pos();
1817 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1818 scratch.push(self.char());
1819 self.bump_and_bump_space();
1820 }
1821 let span = Span::new(start, self.pos());
1822 while !self.is_eof() && self.char().is_whitespace() {
1823 self.bump_and_bump_space();
1824 }
1825 let digits = scratch.as_str();
1826 if digits.is_empty() {
1827 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1828 }
1829 match u32::from_str_radix(digits, 10).ok() {
1830 Some(n) => Ok(n),
1831 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1832 }
1833 }
1834
1835 /// Parse a standard character class consisting primarily of characters or
1836 /// character ranges, but can also contain nested character classes of
1837 /// any type (sans `.`).
1838 ///
1839 /// This assumes the parser is positioned at the opening `[`. If parsing
1840 /// is successful, then the parser is advanced to the position immediately
1841 /// following the closing `]`.
1842 #[inline(never)]
1843 fn parse_set_class(&self) -> Result<ast::ClassBracketed> {
1844 assert_eq!(self.char(), '[');
1845
1846 let mut union =
1847 ast::ClassSetUnion { span: self.span(), items: vec![] };
1848 loop {
1849 self.bump_space();
1850 if self.is_eof() {
1851 return Err(self.unclosed_class_error());
1852 }
1853 match self.char() {
1854 '[' => {
1855 // If we've already parsed the opening bracket, then
1856 // attempt to treat this as the beginning of an ASCII
1857 // class. If ASCII class parsing fails, then the parser
1858 // backs up to `[`.
1859 if !self.parser().stack_class.borrow().is_empty() {
1860 if let Some(cls) = self.maybe_parse_ascii_class() {
1861 union.push(ast::ClassSetItem::Ascii(cls));
1862 continue;
1863 }
1864 }
1865 union = self.push_class_open(union)?;
1866 }
1867 ']' => match self.pop_class(union)? {
1868 Either::Left(nested_union) => {
1869 union = nested_union;
1870 }
1871 Either::Right(class) => return Ok(class),
1872 },
1873 '&' if self.peek() == Some('&') => {
1874 assert!(self.bump_if("&&"));
1875 union = self.push_class_op(
1876 ast::ClassSetBinaryOpKind::Intersection,
1877 union,
1878 );
1879 }
1880 '-' if self.peek() == Some('-') => {
1881 assert!(self.bump_if("--"));
1882 union = self.push_class_op(
1883 ast::ClassSetBinaryOpKind::Difference,
1884 union,
1885 );
1886 }
1887 '~' if self.peek() == Some('~') => {
1888 assert!(self.bump_if("~~"));
1889 union = self.push_class_op(
1890 ast::ClassSetBinaryOpKind::SymmetricDifference,
1891 union,
1892 );
1893 }
1894 _ => {
1895 union.push(self.parse_set_class_range()?);
1896 }
1897 }
1898 }
1899 }
1900
1901 /// Parse a single primitive item in a character class set. The item to
1902 /// be parsed can either be one of a simple literal character, a range
1903 /// between two simple literal characters or a "primitive" character
1904 /// class like \w or \p{Greek}.
1905 ///
1906 /// If an invalid escape is found, or if a character class is found where
1907 /// a simple literal is expected (e.g., in a range), then an error is
1908 /// returned.
1909 #[inline(never)]
1910 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1911 let prim1 = self.parse_set_class_item()?;
1912 self.bump_space();
1913 if self.is_eof() {
1914 return Err(self.unclosed_class_error());
1915 }
1916 // If the next char isn't a `-`, then we don't have a range.
1917 // There are two exceptions. If the char after a `-` is a `]`, then
1918 // `-` is interpreted as a literal `-`. Alternatively, if the char
1919 // after a `-` is a `-`, then `--` corresponds to a "difference"
1920 // operation.
1921 if self.char() != '-'
1922 || self.peek_space() == Some(']')
1923 || self.peek_space() == Some('-')
1924 {
1925 return prim1.into_class_set_item(self);
1926 }
1927 // OK, now we're parsing a range, so bump past the `-` and parse the
1928 // second half of the range.
1929 if !self.bump_and_bump_space() {
1930 return Err(self.unclosed_class_error());
1931 }
1932 let prim2 = self.parse_set_class_item()?;
1933 let range = ast::ClassSetRange {
1934 span: Span::new(prim1.span().start, prim2.span().end),
1935 start: prim1.into_class_literal(self)?,
1936 end: prim2.into_class_literal(self)?,
1937 };
1938 if !range.is_valid() {
1939 return Err(
1940 self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1941 );
1942 }
1943 Ok(ast::ClassSetItem::Range(range))
1944 }
1945
1946 /// Parse a single item in a character class as a primitive, where the
1947 /// primitive either consists of a verbatim literal or a single escape
1948 /// sequence.
1949 ///
1950 /// This assumes the parser is positioned at the beginning of a primitive,
1951 /// and advances the parser to the first position after the primitive if
1952 /// successful.
1953 ///
1954 /// Note that it is the caller's responsibility to report an error if an
1955 /// illegal primitive was parsed.
1956 #[inline(never)]
1957 fn parse_set_class_item(&self) -> Result<Primitive> {
1958 if self.char() == '\\' {
1959 self.parse_escape()
1960 } else {
1961 let x = Primitive::Literal(ast::Literal {
1962 span: self.span_char(),
1963 kind: ast::LiteralKind::Verbatim,
1964 c: self.char(),
1965 });
1966 self.bump();
1967 Ok(x)
1968 }
1969 }
1970
1971 /// Parses the opening of a character class set. This includes the opening
1972 /// bracket along with `^` if present to indicate negation. This also
1973 /// starts parsing the opening set of unioned items if applicable, since
1974 /// there are special rules applied to certain characters in the opening
1975 /// of a character class. For example, `[^]]` is the class of all
1976 /// characters not equal to `]`. (`]` would need to be escaped in any other
1977 /// position.) Similarly for `-`.
1978 ///
1979 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1980 /// empty union. This empty union should be replaced with the actual item
1981 /// when it is popped from the parser's stack.
1982 ///
1983 /// This assumes the parser is positioned at the opening `[` and advances
1984 /// the parser to the first non-special byte of the character class.
1985 ///
1986 /// An error is returned if EOF is found.
1987 #[inline(never)]
1988 fn parse_set_class_open(
1989 &self,
1990 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1991 assert_eq!(self.char(), '[');
1992 let start = self.pos();
1993 if !self.bump_and_bump_space() {
1994 return Err(self.error(
1995 Span::new(start, self.pos()),
1996 ast::ErrorKind::ClassUnclosed,
1997 ));
1998 }
1999
2000 let negated = if self.char() != '^' {
2001 false
2002 } else {
2003 if !self.bump_and_bump_space() {
2004 return Err(self.error(
2005 Span::new(start, self.pos()),
2006 ast::ErrorKind::ClassUnclosed,
2007 ));
2008 }
2009 true
2010 };
2011 // Accept any number of `-` as literal `-`.
2012 let mut union =
2013 ast::ClassSetUnion { span: self.span(), items: vec![] };
2014 while self.char() == '-' {
2015 union.push(ast::ClassSetItem::Literal(ast::Literal {
2016 span: self.span_char(),
2017 kind: ast::LiteralKind::Verbatim,
2018 c: '-',
2019 }));
2020 if !self.bump_and_bump_space() {
2021 return Err(self.error(
2022 Span::new(start, start),
2023 ast::ErrorKind::ClassUnclosed,
2024 ));
2025 }
2026 }
2027 // If `]` is the *first* char in a set, then interpret it as a literal
2028 // `]`. That is, an empty class is impossible to write.
2029 if union.items.is_empty() && self.char() == ']' {
2030 union.push(ast::ClassSetItem::Literal(ast::Literal {
2031 span: self.span_char(),
2032 kind: ast::LiteralKind::Verbatim,
2033 c: ']',
2034 }));
2035 if !self.bump_and_bump_space() {
2036 return Err(self.error(
2037 Span::new(start, self.pos()),
2038 ast::ErrorKind::ClassUnclosed,
2039 ));
2040 }
2041 }
2042 let set = ast::ClassBracketed {
2043 span: Span::new(start, self.pos()),
2044 negated,
2045 kind: ast::ClassSet::union(ast::ClassSetUnion {
2046 span: Span::new(union.span.start, union.span.start),
2047 items: vec![],
2048 }),
2049 };
2050 Ok((set, union))
2051 }
2052
2053 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
2054 ///
2055 /// This assumes the parser is positioned at the opening `[`.
2056 ///
2057 /// If no valid ASCII character class could be found, then this does not
2058 /// advance the parser and `None` is returned. Otherwise, the parser is
2059 /// advanced to the first byte following the closing `]` and the
2060 /// corresponding ASCII class is returned.
2061 #[inline(never)]
2062 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
2063 // ASCII character classes are interesting from a parsing perspective
2064 // because parsing cannot fail with any interesting error. For example,
2065 // in order to use an ASCII character class, it must be enclosed in
2066 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
2067 // of it as "ASCII character classes have the syntax `[:NAME:]` which
2068 // can only appear within character brackets." This means that things
2069 // like `[[:lower:]A]` are legal constructs.
2070 //
2071 // However, if one types an incorrect ASCII character class, e.g.,
2072 // `[[:loower:]]`, then we treat that as a normal nested character
2073 // class containing the characters `:elorw`. One might argue that we
2074 // should return an error instead since the repeated colons give away
2075 // the intent to write an ASCII class. But what if the user typed
2076 // `[[:lower]]` instead? How can we tell that was intended to be an
2077 // ASCII class and not just a normal nested class?
2078 //
2079 // Reasonable people can probably disagree over this, but for better
2080 // or worse, we implement semantics that never fails at the expense
2081 // of better failure modes.
2082 assert_eq!(self.char(), '[');
2083 // If parsing fails, then we back up the parser to this starting point.
2084 let start = self.pos();
2085 let mut negated = false;
2086 if !self.bump() || self.char() != ':' {
2087 self.parser().pos.set(start);
2088 return None;
2089 }
2090 if !self.bump() {
2091 self.parser().pos.set(start);
2092 return None;
2093 }
2094 if self.char() == '^' {
2095 negated = true;
2096 if !self.bump() {
2097 self.parser().pos.set(start);
2098 return None;
2099 }
2100 }
2101 let name_start = self.offset();
2102 while self.char() != ':' && self.bump() {}
2103 if self.is_eof() {
2104 self.parser().pos.set(start);
2105 return None;
2106 }
2107 let name = &self.pattern()[name_start..self.offset()];
2108 if !self.bump_if(":]") {
2109 self.parser().pos.set(start);
2110 return None;
2111 }
2112 let kind = match ast::ClassAsciiKind::from_name(name) {
2113 Some(kind) => kind,
2114 None => {
2115 self.parser().pos.set(start);
2116 return None;
2117 }
2118 };
2119 Some(ast::ClassAscii {
2120 span: Span::new(start, self.pos()),
2121 kind,
2122 negated,
2123 })
2124 }
2125
2126 /// Parse a Unicode class in either the single character notation, `\pN`
2127 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2128 /// the parser is positioned at the `p` (or `P` for negation) and will
2129 /// advance the parser to the character immediately following the class.
2130 ///
2131 /// Note that this does not check whether the class name is valid or not.
2132 #[inline(never)]
2133 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2134 assert!(self.char() == 'p' || self.char() == 'P');
2135
2136 let mut scratch = self.parser().scratch.borrow_mut();
2137 scratch.clear();
2138
2139 let negated = self.char() == 'P';
2140 if !self.bump_and_bump_space() {
2141 return Err(
2142 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2143 );
2144 }
2145 let (start, kind) = if self.char() == '{' {
2146 let start = self.span_char().end;
2147 while self.bump_and_bump_space() && self.char() != '}' {
2148 scratch.push(self.char());
2149 }
2150 if self.is_eof() {
2151 return Err(self
2152 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2153 }
2154 assert_eq!(self.char(), '}');
2155 self.bump();
2156
2157 let name = scratch.as_str();
2158 if let Some(i) = name.find("!=") {
2159 (
2160 start,
2161 ast::ClassUnicodeKind::NamedValue {
2162 op: ast::ClassUnicodeOpKind::NotEqual,
2163 name: name[..i].to_string(),
2164 value: name[i + 2..].to_string(),
2165 },
2166 )
2167 } else if let Some(i) = name.find(':') {
2168 (
2169 start,
2170 ast::ClassUnicodeKind::NamedValue {
2171 op: ast::ClassUnicodeOpKind::Colon,
2172 name: name[..i].to_string(),
2173 value: name[i + 1..].to_string(),
2174 },
2175 )
2176 } else if let Some(i) = name.find('=') {
2177 (
2178 start,
2179 ast::ClassUnicodeKind::NamedValue {
2180 op: ast::ClassUnicodeOpKind::Equal,
2181 name: name[..i].to_string(),
2182 value: name[i + 1..].to_string(),
2183 },
2184 )
2185 } else {
2186 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2187 }
2188 } else {
2189 let start = self.pos();
2190 let c = self.char();
2191 if c == '\\' {
2192 return Err(self.error(
2193 self.span_char(),
2194 ast::ErrorKind::UnicodeClassInvalid,
2195 ));
2196 }
2197 self.bump_and_bump_space();
2198 let kind = ast::ClassUnicodeKind::OneLetter(c);
2199 (start, kind)
2200 };
2201 Ok(ast::ClassUnicode {
2202 span: Span::new(start, self.pos()),
2203 negated,
2204 kind,
2205 })
2206 }
2207
2208 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2209 /// parser is currently at a valid character class name and will be
2210 /// advanced to the character immediately following the class.
2211 #[inline(never)]
2212 fn parse_perl_class(&self) -> ast::ClassPerl {
2213 let c = self.char();
2214 let span = self.span_char();
2215 self.bump();
2216 let (negated, kind) = match c {
2217 'd' => (false, ast::ClassPerlKind::Digit),
2218 'D' => (true, ast::ClassPerlKind::Digit),
2219 's' => (false, ast::ClassPerlKind::Space),
2220 'S' => (true, ast::ClassPerlKind::Space),
2221 'w' => (false, ast::ClassPerlKind::Word),
2222 'W' => (true, ast::ClassPerlKind::Word),
2223 c => panic!("expected valid Perl class but got '{}'", c),
2224 };
2225 ast::ClassPerl { span, kind, negated }
2226 }
2227}
2228
/// A type that traverses a fully parsed Ast and checks whether its depth
/// exceeds the specified nesting limit. If it does, then an error is returned.
///
/// The limit is not stored in this type; it is read from the parser's
/// `nest_limit` setting whenever the depth is about to grow.
#[derive(Debug)]
struct NestLimiter<'p, 's, P> {
    /// The parser that is checking the nest limit. It provides the limit
    /// and is used to build the error that is reported.
    p: &'p ParserI<'s, P>,
    /// The current depth while walking an Ast. It starts at zero.
    depth: u32,
}
2238
2239impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
2240 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2241 NestLimiter { p, depth: 0 }
2242 }
2243
2244 #[inline(never)]
2245 fn check(self, ast: &Ast) -> Result<()> {
2246 ast::visit(ast, self)
2247 }
2248
2249 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2250 let new = self.depth.checked_add(1).ok_or_else(|| {
2251 self.p.error(
2252 span.clone(),
2253 ast::ErrorKind::NestLimitExceeded(u32::MAX),
2254 )
2255 })?;
2256 let limit = self.p.parser().nest_limit;
2257 if new > limit {
2258 return Err(self.p.error(
2259 span.clone(),
2260 ast::ErrorKind::NestLimitExceeded(limit),
2261 ));
2262 }
2263 self.depth = new;
2264 Ok(())
2265 }
2266
2267 fn decrement_depth(&mut self) {
2268 // Assuming the correctness of the visitor, this should never drop
2269 // below 0.
2270 self.depth = self.depth.checked_sub(1).unwrap();
2271 }
2272}
2273
2274impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2275 type Output = ();
2276 type Err = ast::Error;
2277
2278 fn finish(self) -> Result<()> {
2279 Ok(())
2280 }
2281
2282 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2283 let span = match *ast {
2284 Ast::Empty(_)
2285 | Ast::Flags(_)
2286 | Ast::Literal(_)
2287 | Ast::Dot(_)
2288 | Ast::Assertion(_)
2289 | Ast::ClassUnicode(_)
2290 | Ast::ClassPerl(_) => {
2291 // These are all base cases, so we don't increment depth.
2292 return Ok(());
2293 }
2294 Ast::ClassBracketed(ref x) => &x.span,
2295 Ast::Repetition(ref x) => &x.span,
2296 Ast::Group(ref x) => &x.span,
2297 Ast::Alternation(ref x) => &x.span,
2298 Ast::Concat(ref x) => &x.span,
2299 };
2300 self.increment_depth(span)
2301 }
2302
2303 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2304 match *ast {
2305 Ast::Empty(_)
2306 | Ast::Flags(_)
2307 | Ast::Literal(_)
2308 | Ast::Dot(_)
2309 | Ast::Assertion(_)
2310 | Ast::ClassUnicode(_)
2311 | Ast::ClassPerl(_) => {
2312 // These are all base cases, so we don't decrement depth.
2313 Ok(())
2314 }
2315 Ast::ClassBracketed(_)
2316 | Ast::Repetition(_)
2317 | Ast::Group(_)
2318 | Ast::Alternation(_)
2319 | Ast::Concat(_) => {
2320 self.decrement_depth();
2321 Ok(())
2322 }
2323 }
2324 }
2325
2326 fn visit_class_set_item_pre(
2327 &mut self,
2328 ast: &ast::ClassSetItem,
2329 ) -> Result<()> {
2330 let span = match *ast {
2331 ast::ClassSetItem::Empty(_)
2332 | ast::ClassSetItem::Literal(_)
2333 | ast::ClassSetItem::Range(_)
2334 | ast::ClassSetItem::Ascii(_)
2335 | ast::ClassSetItem::Unicode(_)
2336 | ast::ClassSetItem::Perl(_) => {
2337 // These are all base cases, so we don't increment depth.
2338 return Ok(());
2339 }
2340 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2341 ast::ClassSetItem::Union(ref x) => &x.span,
2342 };
2343 self.increment_depth(span)
2344 }
2345
2346 fn visit_class_set_item_post(
2347 &mut self,
2348 ast: &ast::ClassSetItem,
2349 ) -> Result<()> {
2350 match *ast {
2351 ast::ClassSetItem::Empty(_)
2352 | ast::ClassSetItem::Literal(_)
2353 | ast::ClassSetItem::Range(_)
2354 | ast::ClassSetItem::Ascii(_)
2355 | ast::ClassSetItem::Unicode(_)
2356 | ast::ClassSetItem::Perl(_) => {
2357 // These are all base cases, so we don't decrement depth.
2358 Ok(())
2359 }
2360 ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2361 self.decrement_depth();
2362 Ok(())
2363 }
2364 }
2365 }
2366
2367 fn visit_class_set_binary_op_pre(
2368 &mut self,
2369 ast: &ast::ClassSetBinaryOp,
2370 ) -> Result<()> {
2371 self.increment_depth(&ast.span)
2372 }
2373
2374 fn visit_class_set_binary_op_post(
2375 &mut self,
2376 _ast: &ast::ClassSetBinaryOp,
2377 ) -> Result<()> {
2378 self.decrement_depth();
2379 Ok(())
2380 }
2381}
2382
2383/// When the result is an error, transforms the ast::ErrorKind from the source
2384/// Result into another one. This function is used to return clearer error
2385/// messages when possible.
2386fn specialize_err<T>(
2387 result: Result<T>,
2388 from: ast::ErrorKind,
2389 to: ast::ErrorKind,
2390) -> Result<T> {
2391 if let Err(e: Error) = result {
2392 if e.kind == from {
2393 Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2394 } else {
2395 Err(e)
2396 }
2397 } else {
2398 result
2399 }
2400}
2401
2402#[cfg(test)]
2403mod tests {
2404 use core::ops::Range;
2405
2406 use alloc::format;
2407
2408 use crate::ast::{self, Ast, Position, Span};
2409
2410 use super::*;
2411
    // Our own assert_eq, which has slightly better formatting (but honestly
    // still kind of crappy).
    //
    // It has the same name as the standard macro, so the `assert_eq!` calls
    // that follow in this module use this definition. Only the two-argument
    // form is supported.
    macro_rules! assert_eq {
        ($left:expr, $right:expr) => {{
            // Borrow both operands through a `match` so that each one is
            // evaluated exactly once and stays alive for the comparison and
            // the failure message.
            match (&$left, &$right) {
                (left_val, right_val) => {
                    if !(*left_val == *right_val) {
                        panic!(
                            "assertion failed: `(left == right)`\n\n\
                             left: `{:?}`\nright: `{:?}`\n\n",
                            left_val, right_val
                        )
                    }
                }
            }
        }};
    }
2429
    // We create these errors to compare with real ast::Errors in the tests.
    // We define equality between TestError and ast::Error to disregard the
    // pattern string in ast::Error, which is annoying to provide in tests.
    #[derive(Clone, Debug)]
    struct TestError {
        // Where in the pattern the error is expected to be reported.
        span: Span,
        // The kind of error that is expected.
        kind: ast::ErrorKind,
    }
2438
2439 impl PartialEq<ast::Error> for TestError {
2440 fn eq(&self, other: &ast::Error) -> bool {
2441 self.span == other.span && self.kind == other.kind
2442 }
2443 }
2444
2445 impl PartialEq<TestError> for ast::Error {
2446 fn eq(&self, other: &TestError) -> bool {
2447 self.span == other.span && self.kind == other.kind
2448 }
2449 }
2450
2451 fn s(str: &str) -> String {
2452 str.to_string()
2453 }
2454
2455 fn parser(pattern: &str) -> ParserI<'_, Parser> {
2456 ParserI::new(Parser::new(), pattern)
2457 }
2458
2459 fn parser_octal(pattern: &str) -> ParserI<'_, Parser> {
2460 let parser = ParserBuilder::new().octal(true).build();
2461 ParserI::new(parser, pattern)
2462 }
2463
2464 fn parser_nest_limit(
2465 pattern: &str,
2466 nest_limit: u32,
2467 ) -> ParserI<'_, Parser> {
2468 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2469 ParserI::new(p, pattern)
2470 }
2471
2472 fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> {
2473 let p = ParserBuilder::new().ignore_whitespace(true).build();
2474 ParserI::new(p, pattern)
2475 }
2476
2477 /// Short alias for creating a new span.
2478 fn nspan(start: Position, end: Position) -> Span {
2479 Span::new(start, end)
2480 }
2481
2482 /// Short alias for creating a new position.
2483 fn npos(offset: usize, line: usize, column: usize) -> Position {
2484 Position::new(offset, line, column)
2485 }
2486
2487 /// Create a new span from the given offset range. This assumes a single
2488 /// line and sets the columns based on the offsets. i.e., This only works
2489 /// out of the box for ASCII, which is fine for most tests.
2490 fn span(range: Range<usize>) -> Span {
2491 let start = Position::new(range.start, 1, range.start + 1);
2492 let end = Position::new(range.end, 1, range.end + 1);
2493 Span::new(start, end)
2494 }
2495
2496 /// Create a new span for the corresponding byte range in the given string.
2497 fn span_range(subject: &str, range: Range<usize>) -> Span {
2498 let start = Position {
2499 offset: range.start,
2500 line: 1 + subject[..range.start].matches('\n').count(),
2501 column: 1 + subject[..range.start]
2502 .chars()
2503 .rev()
2504 .position(|c| c == '\n')
2505 .unwrap_or(subject[..range.start].chars().count()),
2506 };
2507 let end = Position {
2508 offset: range.end,
2509 line: 1 + subject[..range.end].matches('\n').count(),
2510 column: 1 + subject[..range.end]
2511 .chars()
2512 .rev()
2513 .position(|c| c == '\n')
2514 .unwrap_or(subject[..range.end].chars().count()),
2515 };
2516 Span::new(start, end)
2517 }
2518
2519 /// Create a verbatim literal starting at the given position.
2520 fn lit(c: char, start: usize) -> Ast {
2521 lit_with(c, span(start..start + c.len_utf8()))
2522 }
2523
2524 /// Create a meta literal starting at the given position.
2525 fn meta_lit(c: char, span: Span) -> Ast {
2526 Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c })
2527 }
2528
2529 /// Create a verbatim literal with the given span.
2530 fn lit_with(c: char, span: Span) -> Ast {
2531 Ast::literal(ast::Literal {
2532 span,
2533 kind: ast::LiteralKind::Verbatim,
2534 c,
2535 })
2536 }
2537
2538 /// Create a concatenation with the given range.
2539 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2540 concat_with(span(range), asts)
2541 }
2542
2543 /// Create a concatenation with the given span.
2544 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2545 Ast::concat(ast::Concat { span, asts })
2546 }
2547
2548 /// Create an alternation with the given span.
2549 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2550 Ast::alternation(ast::Alternation { span: span(range), asts })
2551 }
2552
2553 /// Create a capturing group with the given span.
2554 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2555 Ast::group(ast::Group {
2556 span: span(range),
2557 kind: ast::GroupKind::CaptureIndex(index),
2558 ast: Box::new(ast),
2559 })
2560 }
2561
2562 /// Create an ast::SetFlags.
2563 ///
2564 /// The given pattern should be the full pattern string. The range given
2565 /// should correspond to the byte offsets where the flag set occurs.
2566 ///
2567 /// If negated is true, then the set is interpreted as beginning with a
2568 /// negation.
2569 fn flag_set(
2570 pat: &str,
2571 range: Range<usize>,
2572 flag: ast::Flag,
2573 negated: bool,
2574 ) -> Ast {
2575 let mut items = vec![ast::FlagsItem {
2576 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2577 kind: ast::FlagsItemKind::Flag(flag),
2578 }];
2579 if negated {
2580 items.insert(
2581 0,
2582 ast::FlagsItem {
2583 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2584 kind: ast::FlagsItemKind::Negation,
2585 },
2586 );
2587 }
2588 Ast::flags(ast::SetFlags {
2589 span: span_range(pat, range.clone()),
2590 flags: ast::Flags {
2591 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2592 items,
2593 },
2594 })
2595 }
2596
2597 #[test]
2598 fn parse_nest_limit() {
2599 // A nest limit of 0 still allows some types of regexes.
2600 assert_eq!(
2601 parser_nest_limit("", 0).parse(),
2602 Ok(Ast::empty(span(0..0)))
2603 );
2604 assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
2605
2606 // Test repetition operations, which require one level of nesting.
2607 assert_eq!(
2608 parser_nest_limit("a+", 0).parse().unwrap_err(),
2609 TestError {
2610 span: span(0..2),
2611 kind: ast::ErrorKind::NestLimitExceeded(0),
2612 }
2613 );
2614 assert_eq!(
2615 parser_nest_limit("a+", 1).parse(),
2616 Ok(Ast::repetition(ast::Repetition {
2617 span: span(0..2),
2618 op: ast::RepetitionOp {
2619 span: span(1..2),
2620 kind: ast::RepetitionKind::OneOrMore,
2621 },
2622 greedy: true,
2623 ast: Box::new(lit('a', 0)),
2624 }))
2625 );
2626 assert_eq!(
2627 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2628 TestError {
2629 span: span(0..3),
2630 kind: ast::ErrorKind::NestLimitExceeded(1),
2631 }
2632 );
2633 assert_eq!(
2634 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2635 TestError {
2636 span: span(0..2),
2637 kind: ast::ErrorKind::NestLimitExceeded(1),
2638 }
2639 );
2640 assert_eq!(
2641 parser_nest_limit("a+*", 2).parse(),
2642 Ok(Ast::repetition(ast::Repetition {
2643 span: span(0..3),
2644 op: ast::RepetitionOp {
2645 span: span(2..3),
2646 kind: ast::RepetitionKind::ZeroOrMore,
2647 },
2648 greedy: true,
2649 ast: Box::new(Ast::repetition(ast::Repetition {
2650 span: span(0..2),
2651 op: ast::RepetitionOp {
2652 span: span(1..2),
2653 kind: ast::RepetitionKind::OneOrMore,
2654 },
2655 greedy: true,
2656 ast: Box::new(lit('a', 0)),
2657 })),
2658 }))
2659 );
2660
2661 // Test concatenations. A concatenation requires one level of nesting.
2662 assert_eq!(
2663 parser_nest_limit("ab", 0).parse().unwrap_err(),
2664 TestError {
2665 span: span(0..2),
2666 kind: ast::ErrorKind::NestLimitExceeded(0),
2667 }
2668 );
2669 assert_eq!(
2670 parser_nest_limit("ab", 1).parse(),
2671 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
2672 );
2673 assert_eq!(
2674 parser_nest_limit("abc", 1).parse(),
2675 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
2676 );
2677
2678 // Test alternations. An alternation requires one level of nesting.
2679 assert_eq!(
2680 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2681 TestError {
2682 span: span(0..3),
2683 kind: ast::ErrorKind::NestLimitExceeded(0),
2684 }
2685 );
2686 assert_eq!(
2687 parser_nest_limit("a|b", 1).parse(),
2688 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
2689 );
2690 assert_eq!(
2691 parser_nest_limit("a|b|c", 1).parse(),
2692 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
2693 );
2694
2695 // Test character classes. Classes form their own mini-recursive
2696 // syntax!
2697 assert_eq!(
2698 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2699 TestError {
2700 span: span(0..3),
2701 kind: ast::ErrorKind::NestLimitExceeded(0),
2702 }
2703 );
2704 assert_eq!(
2705 parser_nest_limit("[a]", 1).parse(),
2706 Ok(Ast::class_bracketed(ast::ClassBracketed {
2707 span: span(0..3),
2708 negated: false,
2709 kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
2710 ast::Literal {
2711 span: span(1..2),
2712 kind: ast::LiteralKind::Verbatim,
2713 c: 'a',
2714 }
2715 )),
2716 }))
2717 );
2718 assert_eq!(
2719 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2720 TestError {
2721 span: span(1..3),
2722 kind: ast::ErrorKind::NestLimitExceeded(1),
2723 }
2724 );
2725 assert_eq!(
2726 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2727 TestError {
2728 span: span(3..7),
2729 kind: ast::ErrorKind::NestLimitExceeded(2),
2730 }
2731 );
2732 assert_eq!(
2733 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2734 TestError {
2735 span: span(4..6),
2736 kind: ast::ErrorKind::NestLimitExceeded(3),
2737 }
2738 );
2739 assert_eq!(
2740 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2741 TestError {
2742 span: span(1..5),
2743 kind: ast::ErrorKind::NestLimitExceeded(1),
2744 }
2745 );
2746 assert_eq!(
2747 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2748 TestError {
2749 span: span(4..6),
2750 kind: ast::ErrorKind::NestLimitExceeded(2),
2751 }
2752 );
2753 }
2754
2755 #[test]
2756 fn parse_comments() {
2757 let pat = "(?x)
2758# This is comment 1.
2759foo # This is comment 2.
2760 # This is comment 3.
2761bar
2762# This is comment 4.";
2763 let astc = parser(pat).parse_with_comments().unwrap();
2764 assert_eq!(
2765 astc.ast,
2766 concat_with(
2767 span_range(pat, 0..pat.len()),
2768 vec![
2769 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2770 lit_with('f', span_range(pat, 26..27)),
2771 lit_with('o', span_range(pat, 27..28)),
2772 lit_with('o', span_range(pat, 28..29)),
2773 lit_with('b', span_range(pat, 74..75)),
2774 lit_with('a', span_range(pat, 75..76)),
2775 lit_with('r', span_range(pat, 76..77)),
2776 ]
2777 )
2778 );
2779 assert_eq!(
2780 astc.comments,
2781 vec![
2782 ast::Comment {
2783 span: span_range(pat, 5..26),
2784 comment: s(" This is comment 1."),
2785 },
2786 ast::Comment {
2787 span: span_range(pat, 30..51),
2788 comment: s(" This is comment 2."),
2789 },
2790 ast::Comment {
2791 span: span_range(pat, 53..74),
2792 comment: s(" This is comment 3."),
2793 },
2794 ast::Comment {
2795 span: span_range(pat, 78..98),
2796 comment: s(" This is comment 4."),
2797 },
2798 ]
2799 );
2800 }
2801
2802 #[test]
2803 fn parse_holistic() {
2804 assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
2805 assert_eq!(
2806 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2807 Ok(concat(
2808 0..36,
2809 vec![
2810 meta_lit('\\', span(0..2)),
2811 meta_lit('.', span(2..4)),
2812 meta_lit('+', span(4..6)),
2813 meta_lit('*', span(6..8)),
2814 meta_lit('?', span(8..10)),
2815 meta_lit('(', span(10..12)),
2816 meta_lit(')', span(12..14)),
2817 meta_lit('|', span(14..16)),
2818 meta_lit('[', span(16..18)),
2819 meta_lit(']', span(18..20)),
2820 meta_lit('{', span(20..22)),
2821 meta_lit('}', span(22..24)),
2822 meta_lit('^', span(24..26)),
2823 meta_lit('$', span(26..28)),
2824 meta_lit('#', span(28..30)),
2825 meta_lit('&', span(30..32)),
2826 meta_lit('-', span(32..34)),
2827 meta_lit('~', span(34..36)),
2828 ]
2829 ))
2830 );
2831 }
2832
2833 #[test]
2834 fn parse_ignore_whitespace() {
2835 // Test that basic whitespace insensitivity works.
2836 let pat = "(?x)a b";
2837 assert_eq!(
2838 parser(pat).parse(),
2839 Ok(concat_with(
2840 nspan(npos(0, 1, 1), npos(7, 1, 8)),
2841 vec![
2842 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2843 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2844 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2845 ]
2846 ))
2847 );
2848
2849 // Test that we can toggle whitespace insensitivity.
2850 let pat = "(?x)a b(?-x)a b";
2851 assert_eq!(
2852 parser(pat).parse(),
2853 Ok(concat_with(
2854 nspan(npos(0, 1, 1), npos(15, 1, 16)),
2855 vec![
2856 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2857 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2858 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2859 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2860 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2861 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2862 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2863 ]
2864 ))
2865 );
2866
2867 // Test that nesting whitespace insensitive flags works.
2868 let pat = "a (?x:a )a ";
2869 assert_eq!(
2870 parser(pat).parse(),
2871 Ok(concat_with(
2872 span_range(pat, 0..11),
2873 vec![
2874 lit_with('a', span_range(pat, 0..1)),
2875 lit_with(' ', span_range(pat, 1..2)),
2876 Ast::group(ast::Group {
2877 span: span_range(pat, 2..9),
2878 kind: ast::GroupKind::NonCapturing(ast::Flags {
2879 span: span_range(pat, 4..5),
2880 items: vec![ast::FlagsItem {
2881 span: span_range(pat, 4..5),
2882 kind: ast::FlagsItemKind::Flag(
2883 ast::Flag::IgnoreWhitespace
2884 ),
2885 },],
2886 }),
2887 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2888 }),
2889 lit_with('a', span_range(pat, 9..10)),
2890 lit_with(' ', span_range(pat, 10..11)),
2891 ]
2892 ))
2893 );
2894
2895 // Test that whitespace after an opening paren is insignificant.
2896 let pat = "(?x)( ?P<foo> a )";
2897 assert_eq!(
2898 parser(pat).parse(),
2899 Ok(concat_with(
2900 span_range(pat, 0..pat.len()),
2901 vec![
2902 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2903 Ast::group(ast::Group {
2904 span: span_range(pat, 4..pat.len()),
2905 kind: ast::GroupKind::CaptureName {
2906 starts_with_p: true,
2907 name: ast::CaptureName {
2908 span: span_range(pat, 9..12),
2909 name: s("foo"),
2910 index: 1,
2911 }
2912 },
2913 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2914 }),
2915 ]
2916 ))
2917 );
2918 let pat = "(?x)( a )";
2919 assert_eq!(
2920 parser(pat).parse(),
2921 Ok(concat_with(
2922 span_range(pat, 0..pat.len()),
2923 vec![
2924 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2925 Ast::group(ast::Group {
2926 span: span_range(pat, 4..pat.len()),
2927 kind: ast::GroupKind::CaptureIndex(1),
2928 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2929 }),
2930 ]
2931 ))
2932 );
2933 let pat = "(?x)( ?: a )";
2934 assert_eq!(
2935 parser(pat).parse(),
2936 Ok(concat_with(
2937 span_range(pat, 0..pat.len()),
2938 vec![
2939 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2940 Ast::group(ast::Group {
2941 span: span_range(pat, 4..pat.len()),
2942 kind: ast::GroupKind::NonCapturing(ast::Flags {
2943 span: span_range(pat, 8..8),
2944 items: vec![],
2945 }),
2946 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2947 }),
2948 ]
2949 ))
2950 );
2951 let pat = r"(?x)\x { 53 }";
2952 assert_eq!(
2953 parser(pat).parse(),
2954 Ok(concat_with(
2955 span_range(pat, 0..pat.len()),
2956 vec![
2957 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2958 Ast::literal(ast::Literal {
2959 span: span(4..13),
2960 kind: ast::LiteralKind::HexBrace(
2961 ast::HexLiteralKind::X
2962 ),
2963 c: 'S',
2964 }),
2965 ]
2966 ))
2967 );
2968
2969 // Test that whitespace after an escape is OK.
2970 let pat = r"(?x)\ ";
2971 assert_eq!(
2972 parser(pat).parse(),
2973 Ok(concat_with(
2974 span_range(pat, 0..pat.len()),
2975 vec![
2976 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2977 Ast::literal(ast::Literal {
2978 span: span_range(pat, 4..6),
2979 kind: ast::LiteralKind::Superfluous,
2980 c: ' ',
2981 }),
2982 ]
2983 ))
2984 );
2985 }
2986
#[test]
fn parse_newlines() {
    // A raw line feed between two dots parses as an ordinary literal.
    let pat = ".\n.";
    let expected = concat_with(
        span_range(pat, 0..3),
        vec![
            Ast::dot(span_range(pat, 0..1)),
            lit_with('\n', span_range(pat, 1..2)),
            Ast::dot(span_range(pat, 2..3)),
        ],
    );
    assert_eq!(parser(pat).parse(), Ok(expected));

    // Each row is one character of the pattern followed by the
    // `(offset, line, column)` triples at which its span starts and ends.
    // The position right after a '\n' is on the next line, at column 1.
    let pat = "foobar\nbaz\nquux\n";
    let rows = vec![
        ('f', (0, 1, 1), (1, 1, 2)),
        ('o', (1, 1, 2), (2, 1, 3)),
        ('o', (2, 1, 3), (3, 1, 4)),
        ('b', (3, 1, 4), (4, 1, 5)),
        ('a', (4, 1, 5), (5, 1, 6)),
        ('r', (5, 1, 6), (6, 1, 7)),
        ('\n', (6, 1, 7), (7, 2, 1)),
        ('b', (7, 2, 1), (8, 2, 2)),
        ('a', (8, 2, 2), (9, 2, 3)),
        ('z', (9, 2, 3), (10, 2, 4)),
        ('\n', (10, 2, 4), (11, 3, 1)),
        ('q', (11, 3, 1), (12, 3, 2)),
        ('u', (12, 3, 2), (13, 3, 3)),
        ('u', (13, 3, 3), (14, 3, 4)),
        ('x', (14, 3, 4), (15, 3, 5)),
        ('\n', (15, 3, 5), (16, 4, 1)),
    ];
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            rows.into_iter()
                .map(|(c, (so, sl, sc), (eo, el, ec))| {
                    lit_with(c, nspan(npos(so, sl, sc), npos(eo, el, ec)))
                })
                .collect(),
        ))
    );
}
3028
#[test]
fn parse_uncounted_repetition() {
    /// Builds the repetition node expected when an operator of `kind`,
    /// located at `op_span`, is applied to `operand`. `whole` is the span
    /// of the operand together with its operator.
    fn rep(
        whole: Span,
        op_span: Span,
        kind: ast::RepetitionKind,
        greedy: bool,
        operand: Ast,
    ) -> Ast {
        Ast::repetition(ast::Repetition {
            span: whole,
            op: ast::RepetitionOp { span: op_span, kind },
            greedy,
            ast: Box::new(operand),
        })
    }

    let star = || ast::RepetitionKind::ZeroOrMore;
    let plus = || ast::RepetitionKind::OneOrMore;
    let opt = || ast::RepetitionKind::ZeroOrOne;

    // Greedy and lazy forms of every operator applied to a single
    // literal. A `?` directly following the operator is part of the
    // operator's span and turns `greedy` off.
    assert_eq!(
        parser(r"a*").parse(),
        Ok(rep(span(0..2), span(1..2), star(), true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a*?").parse(),
        Ok(rep(span(0..3), span(1..3), star(), false, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a+").parse(),
        Ok(rep(span(0..2), span(1..2), plus(), true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a+?").parse(),
        Ok(rep(span(0..3), span(1..3), plus(), false, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a?").parse(),
        Ok(rep(span(0..2), span(1..2), opt(), true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a??").parse(),
        Ok(rep(span(0..3), span(1..3), opt(), false, lit('a', 0)))
    );

    // The operator binds only to the expression immediately before it.
    assert_eq!(
        parser(r"a?b").parse(),
        Ok(concat(
            0..3,
            vec![
                rep(span(0..2), span(1..2), opt(), true, lit('a', 0)),
                lit('b', 2),
            ]
        ))
    );
    assert_eq!(
        parser(r"a??b").parse(),
        Ok(concat(
            0..4,
            vec![
                rep(span(0..3), span(1..3), opt(), false, lit('a', 0)),
                lit('b', 3),
            ]
        ))
    );
    assert_eq!(
        parser(r"ab?").parse(),
        Ok(concat(
            0..3,
            vec![
                lit('a', 0),
                rep(span(1..3), span(2..3), opt(), true, lit('b', 1)),
            ]
        ))
    );
    assert_eq!(
        parser(r"(ab)?").parse(),
        Ok(rep(
            span(0..5),
            span(4..5),
            opt(),
            true,
            group(0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2)])),
        ))
    );
    assert_eq!(
        parser(r"|a?").parse(),
        Ok(alt(
            0..3,
            vec![
                Ast::empty(span(0..0)),
                rep(span(1..3), span(2..3), opt(), true, lit('a', 1)),
            ]
        ))
    );

    // An operator with nothing to repeat (start of pattern, after a flag
    // directive, at the start of a group or of an alternation branch) is
    // rejected with an empty span at the operator's offset.
    let dangling = vec![
        (r"*", 0),
        (r"(?i)*", 4),
        (r"(*)", 1),
        (r"(?:?)", 3),
        (r"+", 0),
        (r"?", 0),
        (r"(?)", 1),
        (r"|*", 1),
        (r"|+", 1),
        (r"|?", 1),
    ];
    for (pat, at) in dangling {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::RepetitionMissing,
            },
            "pattern: {:?}",
            pat
        );
    }
}
3252
#[test]
fn parse_counted_repetition() {
    use crate::ast::RepetitionRange::{AtLeast, Bounded, Exactly};

    /// Builds the node expected when a `{...}` operator located at
    /// `op_span` and describing `range` is applied to `operand`. `whole`
    /// is the span of the operand together with its operator.
    fn counted(
        whole: Span,
        op_span: Span,
        range: ast::RepetitionRange,
        greedy: bool,
        operand: Ast,
    ) -> Ast {
        Ast::repetition(ast::Repetition {
            span: whole,
            op: ast::RepetitionOp {
                span: op_span,
                kind: ast::RepetitionKind::Range(range),
            },
            greedy,
            ast: Box::new(operand),
        })
    }

    // The three range forms, plus the lazy suffix.
    assert_eq!(
        parser(r"a{5}").parse(),
        Ok(counted(span(0..4), span(1..4), Exactly(5), true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a{5,}").parse(),
        Ok(counted(span(0..5), span(1..5), AtLeast(5), true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a{5,9}").parse(),
        Ok(counted(
            span(0..6),
            span(1..6),
            Bounded(5, 9),
            true,
            lit('a', 0),
        ))
    );
    assert_eq!(
        parser(r"a{5}?").parse(),
        Ok(counted(span(0..5), span(1..5), Exactly(5), false, lit('a', 0)))
    );

    // The operator applies only to the expression immediately before it.
    assert_eq!(
        parser(r"ab{5}").parse(),
        Ok(concat(
            0..5,
            vec![
                lit('a', 0),
                counted(
                    span(1..5),
                    span(2..5),
                    Exactly(5),
                    true,
                    lit('b', 1),
                ),
            ]
        ))
    );
    assert_eq!(
        parser(r"ab{5}c").parse(),
        Ok(concat(
            0..6,
            vec![
                lit('a', 0),
                counted(
                    span(1..5),
                    span(2..5),
                    Exactly(5),
                    true,
                    lit('b', 1),
                ),
                lit('c', 5),
            ]
        ))
    );

    // Spaces inside the braces are accepted, and in ignore-whitespace
    // mode a space may separate the operator from its lazy `?`.
    assert_eq!(
        parser(r"a{ 5 }").parse(),
        Ok(counted(span(0..6), span(1..6), Exactly(5), true, lit('a', 0)))
    );
    assert_eq!(
        parser(r"a{ 5 , 9 }").parse(),
        Ok(counted(
            span(0..10),
            span(1..10),
            Bounded(5, 9),
            true,
            lit('a', 0),
        ))
    );
    assert_eq!(
        parser_ignore_whitespace(r"a{5,9} ?").parse(),
        Ok(counted(
            span(0..8),
            span(1..8),
            Bounded(5, 9),
            false,
            lit('a', 0),
        ))
    );

    // `\b` followed by a numeric range is a repeated word boundary.
    let word_boundary = Ast::assertion(ast::Assertion {
        span: span(0..2),
        kind: ast::AssertionKind::WordBoundary,
    });
    assert_eq!(
        parser(r"\b{5,9}").parse(),
        Ok(counted(
            span(0..7),
            span(2..7),
            Bounded(5, 9),
            true,
            word_boundary,
        ))
    );

    // Malformed operators: (pattern, error span start, error span end,
    // error kind).
    let failures = vec![
        (r"(?i){0}", 4, 4, ast::ErrorKind::RepetitionMissing),
        (r"(?m){1,1}", 4, 4, ast::ErrorKind::RepetitionMissing),
        (r"a{]}", 2, 2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{1,]}", 4, 4, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{", 1, 2, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{}", 2, 2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{a", 2, 2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{9999999999}", 2, 12, ast::ErrorKind::DecimalInvalid),
        (r"a{9", 1, 3, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,a", 4, 4, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{9,9999999999}", 4, 14, ast::ErrorKind::DecimalInvalid),
        (r"a{9,", 1, 4, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,11", 1, 6, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{2,1}", 1, 6, ast::ErrorKind::RepetitionCountInvalid),
        (r"{5}", 0, 0, ast::ErrorKind::RepetitionMissing),
        (r"|{5}", 1, 1, ast::ErrorKind::RepetitionMissing),
    ];
    for (pat, start, end, kind) in failures {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: span(start..end), kind },
            "pattern: {:?}",
            pat
        );
    }
}
3526
#[test]
fn parse_alternate() {
    /// An alternation over `asts` with span `at`, built directly from the
    /// AST types rather than through the `alt` test helper.
    fn alternation(at: Span, asts: Vec<Ast>) -> Ast {
        Ast::alternation(ast::Alternation { span: at, asts })
    }

    /// The concatenation of two adjacent one-byte literals, the first of
    /// which is at offset `at`.
    fn pair(first: char, second: char, at: usize) -> Ast {
        concat(at..at + 2, vec![lit(first, at), lit(second, at + 1)])
    }

    /// The empty expression at offset `at`.
    fn empty(at: usize) -> Ast {
        Ast::empty(span(at..at))
    }

    // Single-literal branches, bare and inside a capture group.
    assert_eq!(
        parser(r"a|b").parse(),
        Ok(alternation(span(0..3), vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parser(r"(a|b)").parse(),
        Ok(group(
            0..5,
            1,
            alternation(span(1..4), vec![lit('a', 1), lit('b', 3)]),
        ))
    );

    // More than two branches, and branches that are concatenations.
    assert_eq!(
        parser(r"a|b|c").parse(),
        Ok(alternation(
            span(0..5),
            vec![lit('a', 0), lit('b', 2), lit('c', 4)],
        ))
    );
    assert_eq!(
        parser(r"ax|by|cz").parse(),
        Ok(alternation(
            span(0..8),
            vec![pair('a', 'x', 0), pair('b', 'y', 3), pair('c', 'z', 6)],
        ))
    );
    assert_eq!(
        parser(r"(ax|by|cz)").parse(),
        Ok(group(
            0..10,
            1,
            alternation(
                span(1..9),
                vec![
                    pair('a', 'x', 1),
                    pair('b', 'y', 4),
                    pair('c', 'z', 7),
                ],
            ),
        ))
    );

    // Nested groups; capture indices follow the opening parentheses.
    let innermost = group(8..12, 3, pair('c', 'z', 9));
    let middle =
        group(4..13, 2, alt(5..12, vec![pair('b', 'y', 5), innermost]));
    assert_eq!(
        parser(r"(ax|(by|(cz)))").parse(),
        Ok(group(0..14, 1, alt(1..13, vec![pair('a', 'x', 1), middle])))
    );

    // Empty branches are kept as empty expressions.
    assert_eq!(
        parser(r"|").parse(),
        Ok(alt(0..1, vec![empty(0), empty(1)]))
    );
    assert_eq!(
        parser(r"||").parse(),
        Ok(alt(0..2, vec![empty(0), empty(1), empty(2)]))
    );
    assert_eq!(
        parser(r"a|").parse(),
        Ok(alt(0..2, vec![lit('a', 0), empty(2)]))
    );
    assert_eq!(
        parser(r"|a").parse(),
        Ok(alt(0..2, vec![empty(0), lit('a', 1)]))
    );

    // The same, inside a capture group.
    assert_eq!(
        parser(r"(|)").parse(),
        Ok(group(0..3, 1, alt(1..2, vec![empty(1), empty(2)])))
    );
    assert_eq!(
        parser(r"(a|)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![lit('a', 1), empty(3)])))
    );
    assert_eq!(
        parser(r"(|a)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![empty(1), lit('a', 2)])))
    );

    // Unbalanced parentheses around an alternation.
    assert_eq!(
        parser(r"a|b)").parse().unwrap_err(),
        TestError { span: span(3..4), kind: ast::ErrorKind::GroupUnopened }
    );
    assert_eq!(
        parser(r"(a|b").parse().unwrap_err(),
        TestError { span: span(0..1), kind: ast::ErrorKind::GroupUnclosed }
    );
}
3686
#[test]
fn parse_unsupported_lookaround() {
    // Every look-around opener is rejected. The error span starts at
    // offset 0 and ends at the offset listed next to the pattern.
    let openers = vec![
        (r"(?=a)", 3),
        (r"(?!a)", 3),
        (r"(?<=a)", 4),
        (r"(?<!a)", 4),
    ];
    for (pat, end) in openers {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError {
                span: span(0..end),
                kind: ast::ErrorKind::UnsupportedLookAround,
            },
            "pattern: {:?}",
            pat
        );
    }
}
3718
#[test]
fn parse_group() {
    use crate::ast::Flag::{CaseInsensitive, SwapGreed};

    /// A flags item that names `flag`, occupying the single byte at `at`.
    fn set(at: usize, flag: ast::Flag) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Flag(flag),
        }
    }

    /// The `-` negation item occupying the single byte at `at`.
    fn negation(at: usize) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Negation,
        }
    }

    /// A flags directive spanning `whole` whose flag list spans `inner`.
    fn directive(
        whole: Span,
        inner: Span,
        items: Vec<ast::FlagsItem>,
    ) -> Ast {
        Ast::flags(ast::SetFlags {
            span: whole,
            flags: ast::Flags { span: inner, items },
        })
    }

    /// A group of the given kind spanning `whole` and wrapping `inner`.
    fn grouped(whole: Span, kind: ast::GroupKind, inner: Ast) -> Ast {
        Ast::group(ast::Group { span: whole, kind, ast: Box::new(inner) })
    }

    /// A non-capturing group kind whose flag list spans `at`.
    fn non_capturing(
        at: Span,
        items: Vec<ast::FlagsItem>,
    ) -> ast::GroupKind {
        ast::GroupKind::NonCapturing(ast::Flags { span: at, items })
    }

    // Flag directives without a sub-expression.
    assert_eq!(
        parser("(?i)").parse(),
        Ok(directive(
            span(0..4),
            span(2..3),
            vec![set(2, CaseInsensitive)],
        ))
    );
    assert_eq!(
        parser("(?iU)").parse(),
        Ok(directive(
            span(0..5),
            span(2..4),
            vec![set(2, CaseInsensitive), set(3, SwapGreed)],
        ))
    );
    assert_eq!(
        parser("(?i-U)").parse(),
        Ok(directive(
            span(0..6),
            span(2..5),
            vec![set(2, CaseInsensitive), negation(3), set(4, SwapGreed)],
        ))
    );

    // Capture groups are numbered from 1 in order of their opening
    // parenthesis.
    assert_eq!(
        parser("()").parse(),
        Ok(grouped(
            span(0..2),
            ast::GroupKind::CaptureIndex(1),
            Ast::empty(span(1..1)),
        ))
    );
    assert_eq!(
        parser("(a)").parse(),
        Ok(grouped(
            span(0..3),
            ast::GroupKind::CaptureIndex(1),
            lit('a', 1),
        ))
    );
    let inner = grouped(
        span(1..3),
        ast::GroupKind::CaptureIndex(2),
        Ast::empty(span(2..2)),
    );
    assert_eq!(
        parser("(())").parse(),
        Ok(grouped(span(0..4), ast::GroupKind::CaptureIndex(1), inner))
    );

    // Non-capturing groups, with and without flags.
    assert_eq!(
        parser("(?:a)").parse(),
        Ok(grouped(
            span(0..5),
            non_capturing(span(2..2), vec![]),
            lit('a', 3),
        ))
    );
    assert_eq!(
        parser("(?i:a)").parse(),
        Ok(grouped(
            span(0..6),
            non_capturing(span(2..3), vec![set(2, CaseInsensitive)]),
            lit('a', 4),
        ))
    );
    assert_eq!(
        parser("(?i-U:a)").parse(),
        Ok(grouped(
            span(0..8),
            non_capturing(
                span(2..5),
                vec![
                    set(2, CaseInsensitive),
                    negation(3),
                    set(4, SwapGreed),
                ],
            ),
            lit('a', 6),
        ))
    );

    // Malformed groups: (pattern, error span start, error span end,
    // error kind).
    let failures = vec![
        ("(", 0, 1, ast::ErrorKind::GroupUnclosed),
        ("(?", 0, 1, ast::ErrorKind::GroupUnclosed),
        ("(?P", 2, 3, ast::ErrorKind::FlagUnrecognized),
        ("(?P<", 4, 4, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(a", 0, 1, ast::ErrorKind::GroupUnclosed),
        ("(()", 0, 1, ast::ErrorKind::GroupUnclosed),
        (")", 0, 1, ast::ErrorKind::GroupUnopened),
        ("a)", 1, 2, ast::ErrorKind::GroupUnopened),
    ];
    for (pat, start, end, kind) in failures {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: span(start..end), kind },
            "pattern: {:?}",
            pat
        );
    }
}
3930
#[test]
fn parse_capture_name() {
    /// The first named capture group of a pattern: the group spans
    /// `whole`, the name text `name` sits at `name_span`, and the group
    /// wraps `inner`. `starts_with_p` records whether the group was
    /// written `(?P<...>` rather than `(?<...>`.
    fn named(
        whole: Span,
        starts_with_p: bool,
        name_span: Span,
        name: &str,
        inner: Ast,
    ) -> Ast {
        Ast::group(ast::Group {
            span: whole,
            kind: ast::GroupKind::CaptureName {
                starts_with_p,
                name: ast::CaptureName {
                    span: name_span,
                    name: s(name),
                    index: 1,
                },
            },
            ast: Box::new(inner),
        })
    }

    // Both spellings of the opener.
    assert_eq!(
        parser("(?<a>z)").parse(),
        Ok(named(span(0..7), false, span(3..4), "a", lit('z', 5)))
    );
    assert_eq!(
        parser("(?P<a>z)").parse(),
        Ok(named(span(0..8), true, span(4..5), "a", lit('z', 6)))
    );

    // Names may contain letters, digits, `_`, `.`, `[` and `]` after the
    // first character.
    assert_eq!(
        parser("(?P<abc>z)").parse(),
        Ok(named(span(0..10), true, span(4..7), "abc", lit('z', 8)))
    );
    assert_eq!(
        parser("(?P<a_1>z)").parse(),
        Ok(named(span(0..10), true, span(4..7), "a_1", lit('z', 8)))
    );
    assert_eq!(
        parser("(?P<a.1>z)").parse(),
        Ok(named(span(0..10), true, span(4..7), "a.1", lit('z', 8)))
    );
    assert_eq!(
        parser("(?P<a[1]>z)").parse(),
        Ok(named(span(0..11), true, span(4..8), "a[1]", lit('z', 9)))
    );

    // Non-ASCII names: offsets count bytes while columns count
    // characters, so the positions are spelled out in full.
    assert_eq!(
        parser("(?P<a¾>)").parse(),
        Ok(named(
            Span::new(Position::new(0, 1, 1), Position::new(9, 1, 9)),
            true,
            Span::new(Position::new(4, 1, 5), Position::new(7, 1, 7)),
            "a¾",
            Ast::empty(Span::new(
                Position::new(8, 1, 8),
                Position::new(8, 1, 8),
            )),
        ))
    );
    assert_eq!(
        parser("(?P<名字>)").parse(),
        Ok(named(
            Span::new(Position::new(0, 1, 1), Position::new(12, 1, 9)),
            true,
            Span::new(Position::new(4, 1, 5), Position::new(10, 1, 7)),
            "名字",
            Ast::empty(Span::new(
                Position::new(11, 1, 8),
                Position::new(11, 1, 8),
            )),
        ))
    );

    // Malformed names: (pattern, error span, error kind).
    let failures = vec![
        ("(?P<", span(4..4), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<>z)", span(4..4), ast::ErrorKind::GroupNameEmpty),
        ("(?P<a", span(5..5), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<ab", span(6..6), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<0a", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<~", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<abc~", span(7..8), ast::ErrorKind::GroupNameInvalid),
        (
            "(?P<a>y)(?P<a>z)",
            span(12..13),
            ast::ErrorKind::GroupNameDuplicate { original: span(4..5) },
        ),
        ("(?P<5>)", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<5a>)", span(4..5), ast::ErrorKind::GroupNameInvalid),
        (
            "(?P<¾>)",
            Span::new(Position::new(4, 1, 5), Position::new(6, 1, 6)),
            ast::ErrorKind::GroupNameInvalid,
        ),
        (
            "(?P<¾a>)",
            Span::new(Position::new(4, 1, 5), Position::new(6, 1, 6)),
            ast::ErrorKind::GroupNameInvalid,
        ),
        (
            "(?P<☃>)",
            Span::new(Position::new(4, 1, 5), Position::new(7, 1, 6)),
            ast::ErrorKind::GroupNameInvalid,
        ),
        (
            "(?P<a☃>)",
            Span::new(Position::new(5, 1, 6), Position::new(8, 1, 7)),
            ast::ErrorKind::GroupNameInvalid,
        ),
    ];
    for (pat, at, kind) in failures {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: at, kind },
            "pattern: {:?}",
            pat
        );
    }
}
4189
#[test]
fn parse_flags() {
    use crate::ast::Flag::{
        CaseInsensitive, DotMatchesNewLine, SwapGreed, CRLF,
    };

    /// A flags item that names `flag`, occupying the single byte at `at`.
    fn set(at: usize, flag: ast::Flag) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Flag(flag),
        }
    }

    /// The `-` negation item occupying the single byte at `at`.
    fn negation(at: usize) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Negation,
        }
    }

    /// A flag list that starts at offset 0 and ends at offset `end`.
    fn flags(end: usize, items: Vec<ast::FlagsItem>) -> ast::Flags {
        ast::Flags { span: span(0..end), items }
    }

    // Either `:` or `)` ends the list and is excluded from its span.
    assert_eq!(
        parser("i:").parse_flags(),
        Ok(flags(1, vec![set(0, CaseInsensitive)]))
    );
    assert_eq!(
        parser("i)").parse_flags(),
        Ok(flags(1, vec![set(0, CaseInsensitive)]))
    );

    assert_eq!(
        parser("isU:").parse_flags(),
        Ok(flags(
            3,
            vec![
                set(0, CaseInsensitive),
                set(1, DotMatchesNewLine),
                set(2, SwapGreed),
            ],
        ))
    );

    // A negation is recorded as an item of its own, wherever it occurs.
    assert_eq!(
        parser("-isU:").parse_flags(),
        Ok(flags(
            4,
            vec![
                negation(0),
                set(1, CaseInsensitive),
                set(2, DotMatchesNewLine),
                set(3, SwapGreed),
            ],
        ))
    );
    assert_eq!(
        parser("i-sU:").parse_flags(),
        Ok(flags(
            4,
            vec![
                set(0, CaseInsensitive),
                negation(1),
                set(2, DotMatchesNewLine),
                set(3, SwapGreed),
            ],
        ))
    );
    assert_eq!(
        parser("i-sR:").parse_flags(),
        Ok(flags(
            4,
            vec![
                set(0, CaseInsensitive),
                negation(1),
                set(2, DotMatchesNewLine),
                set(3, CRLF),
            ],
        ))
    );

    // Malformed flag lists: (pattern, error span start, error span end,
    // error kind).
    let failures = vec![
        ("isU", 3, 3, ast::ErrorKind::FlagUnexpectedEof),
        ("isUa:", 3, 4, ast::ErrorKind::FlagUnrecognized),
        (
            "isUi:",
            3,
            4,
            ast::ErrorKind::FlagDuplicate { original: span(0..1) },
        ),
        (
            "i-sU-i:",
            4,
            5,
            ast::ErrorKind::FlagRepeatedNegation { original: span(1..2) },
        ),
        ("-)", 0, 1, ast::ErrorKind::FlagDanglingNegation),
        ("i-)", 1, 2, ast::ErrorKind::FlagDanglingNegation),
        ("iU-)", 2, 3, ast::ErrorKind::FlagDanglingNegation),
    ];
    for (pat, start, end, kind) in failures {
        assert_eq!(
            parser(pat).parse_flags().unwrap_err(),
            TestError { span: span(start..end), kind },
            "pattern: {:?}",
            pat
        );
    }
}
4375
#[test]
fn parse_flag() {
    // Every recognized flag character and the flag it denotes.
    let recognized = vec![
        ("i", ast::Flag::CaseInsensitive),
        ("m", ast::Flag::MultiLine),
        ("s", ast::Flag::DotMatchesNewLine),
        ("U", ast::Flag::SwapGreed),
        ("u", ast::Flag::Unicode),
        ("R", ast::Flag::CRLF),
        ("x", ast::Flag::IgnoreWhitespace),
    ];
    for (pat, flag) in recognized {
        assert_eq!(parser(pat).parse_flag(), Ok(flag), "pattern: {:?}", pat);
    }

    // Any other character is rejected; the error span covers the whole
    // character, including a multi-byte one.
    let rejected = vec![("a", span(0..1)), ("☃", span_range("☃", 0..3))];
    for (pat, at) in rejected {
        assert_eq!(
            parser(pat).parse_flag().unwrap_err(),
            TestError { span: at, kind: ast::ErrorKind::FlagUnrecognized },
            "pattern: {:?}",
            pat
        );
    }
}
4401
#[test]
fn parse_primitive_non_escape() {
    // The any-character dot.
    let dot = Primitive::Dot(span(0..1));
    assert_eq!(parser(r".").parse_primitive(), Ok(dot));

    // Line anchors.
    let anchors = vec![
        (r"^", ast::AssertionKind::StartLine),
        (r"$", ast::AssertionKind::EndLine),
    ];
    for (pat, kind) in anchors {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(Primitive::Assertion(ast::Assertion {
                span: span(0..1),
                kind,
            })),
            "pattern: {:?}",
            pat
        );
    }

    // Everything else is a verbatim literal. That includes `|`, which
    // `parse_primitive` does not treat as an operator, and multi-byte
    // characters, whose span covers all of their bytes.
    let verbatim = vec![
        (r"a", span(0..1), 'a'),
        (r"|", span(0..1), '|'),
        (r"☃", span_range("☃", 0..3), '☃'),
    ];
    for (pat, at, c) in verbatim {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(Primitive::Literal(ast::Literal {
                span: at,
                kind: ast::LiteralKind::Verbatim,
                c,
            })),
            "pattern: {:?}",
            pat
        );
    }
}
4448
#[test]
fn parse_escape() {
    // Shorthands: run `parse_escape` on a pattern that must fail, and spell
    // out the error we expect back.
    let escape_err = |pat: &str| parser(pat).parse_escape().unwrap_err();
    let failure = |range, kind| TestError { span: span(range), kind };

    // An escaped meta character is a `Meta` literal.
    assert_eq!(
        parser(r"\|").parse_primitive(),
        Ok(Primitive::Literal(ast::Literal {
            span: span(0..2),
            kind: ast::LiteralKind::Meta,
            c: '|',
        }))
    );

    // Escapes that stand for one specific control character.
    let specials = [
        (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
        (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
        (r"\t", '\t', ast::SpecialLiteralKind::Tab),
        (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
        (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
        (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
    ];
    for (pat, c, special) in specials {
        let expected = ast::Literal {
            span: span(0..2),
            kind: ast::LiteralKind::Special(special),
            c,
        };
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Escapes that produce assertions. Each entry is the pattern, the end
    // offset of the expected span (which starts at 0) and the assertion
    // kind.
    let assertions = [
        (r"\A", 2, ast::AssertionKind::StartText),
        (r"\z", 2, ast::AssertionKind::EndText),
        (r"\b", 2, ast::AssertionKind::WordBoundary),
        (r"\b{start}", 9, ast::AssertionKind::WordBoundaryStart),
        (r"\b{end}", 7, ast::AssertionKind::WordBoundaryEnd),
        (r"\b{start-half}", 14, ast::AssertionKind::WordBoundaryStartHalf),
        (r"\b{end-half}", 12, ast::AssertionKind::WordBoundaryEndHalf),
        (r"\<", 2, ast::AssertionKind::WordBoundaryStartAngle),
        (r"\>", 2, ast::AssertionKind::WordBoundaryEndAngle),
        (r"\B", 2, ast::AssertionKind::NotWordBoundary),
    ];
    for (pat, end, kind) in assertions {
        let expected = ast::Assertion { span: span(0..end), kind };
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(Primitive::Assertion(expected))
        );
    }

    // Superfluous escapes are also accepted in most cases.
    for c in ['!', '@', '%', '"', '\'', '/', ' '] {
        let pat = format!(r"\{}", c);
        let expected = ast::Literal {
            span: span(0..2),
            kind: ast::LiteralKind::Superfluous,
            c,
        };
        assert_eq!(
            parser(&pat).parse_primitive(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This
    // leaves room for future evolution of the syntax.
    assert_eq!(
        escape_err(r"\e"),
        failure(0..2, ast::ErrorKind::EscapeUnrecognized)
    );
    assert_eq!(
        escape_err(r"\y"),
        failure(0..2, ast::ErrorKind::EscapeUnrecognized)
    );

    // Opening a special word boundary with no non-whitespace characters
    // after the brace makes it ambiguous whether the user meant a counted
    // repetition (probably not?) or a special word boundary assertion.
    assert_eq!(
        escape_err(r"\b{"),
        failure(0..3, ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof)
    );
    assert_eq!(
        parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(),
        failure(0..4, ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof)
    );
    // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char,
    // which makes the parser treat the brace as a counted repetition.
    assert_eq!(
        parser(r"\b{ ").parse().unwrap_err(),
        failure(4..4, ast::ErrorKind::RepetitionCountDecimalEmpty)
    );
    // Here the characters after the brace are valid, so it looks like the
    // user is writing one of the special word boundary assertions, but the
    // closing brace is missing.
    assert_eq!(
        escape_err(r"\b{foo"),
        failure(2..6, ast::ErrorKind::SpecialWordBoundaryUnclosed)
    );
    // Same error as above, except it is provoked by seeing a character
    // that is known to be invalid before any closing brace is seen.
    assert_eq!(
        escape_err(r"\b{foo!}"),
        failure(2..6, ast::ErrorKind::SpecialWordBoundaryUnclosed)
    );
    // Syntactically everything looks okay here, but the name is not a
    // valid spelling of a word boundary assertion.
    assert_eq!(
        escape_err(r"\b{foo}"),
        failure(3..6, ast::ErrorKind::SpecialWordBoundaryUnrecognized)
    );

    // An unfinished escape is illegal.
    assert_eq!(
        escape_err(r"\"),
        failure(0..1, ast::ErrorKind::EscapeUnexpectedEof)
    );
}
4643
#[test]
fn parse_unsupported_backreference() {
    // With the default parser, a backslash followed by a decimal digit is
    // reported as an unsupported backreference. The lowest and highest
    // digits are checked.
    for pat in [r"\0", r"\9"] {
        let err = parser(pat).parse_escape().unwrap_err();
        assert_eq!(
            err,
            TestError {
                span: span(0..2),
                kind: ast::ErrorKind::UnsupportedBackreference,
            }
        );
    }
}
4661
#[test]
fn parse_octal() {
    // Exhaustively check every value that one to three octal digits can
    // encode. The range is inclusive so that `\777` (511), the largest such
    // value, is covered by the loop as well.
    for i in 0..=511 {
        let pat = format!(r"\{:o}", i);
        assert_eq!(
            parser_octal(&pat).parse_escape(),
            Ok(Primitive::Literal(ast::Literal {
                span: span(0..pat.len()),
                kind: ast::LiteralKind::Octal,
                c: char::from_u32(i).unwrap(),
            }))
        );
    }

    // `8` is not an octal digit, so the escape ends after `\77`, which is
    // '?'.
    assert_eq!(
        parser_octal(r"\778").parse_escape(),
        Ok(Primitive::Literal(ast::Literal {
            span: span(0..3),
            kind: ast::LiteralKind::Octal,
            c: '?',
        }))
    );
    // Only three digits belong to the escape, even when a fourth octal
    // digit follows.
    assert_eq!(
        parser_octal(r"\7777").parse_escape(),
        Ok(Primitive::Literal(ast::Literal {
            span: span(0..4),
            kind: ast::LiteralKind::Octal,
            c: '\u{01FF}',
        }))
    );

    // In a full parse, whatever follows the escape becomes its own
    // verbatim literal.
    assert_eq!(
        parser_octal(r"\778").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::literal(ast::Literal {
                    span: span(0..3),
                    kind: ast::LiteralKind::Octal,
                    c: '?',
                }),
                Ast::literal(ast::Literal {
                    span: span(3..4),
                    kind: ast::LiteralKind::Verbatim,
                    c: '8',
                }),
            ],
        }))
    );
    assert_eq!(
        parser_octal(r"\7777").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..5),
            asts: vec![
                Ast::literal(ast::Literal {
                    span: span(0..4),
                    kind: ast::LiteralKind::Octal,
                    c: '\u{01FF}',
                }),
                Ast::literal(ast::Literal {
                    span: span(4..5),
                    kind: ast::LiteralKind::Verbatim,
                    c: '7',
                }),
            ],
        }))
    );

    // With the octal parser, a non-octal digit right after the backslash
    // is an unrecognized escape.
    assert_eq!(
        parser_octal(r"\8").parse_escape().unwrap_err(),
        TestError {
            span: span(0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        }
    );
}
4736
#[test]
fn parse_hex_two() {
    // Every two-digit escape from `\x00` through `\xff`.
    for code in 0..256 {
        let pat = format!(r"\x{:02x}", code);
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
            c: char::from_u32(code).unwrap(),
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Malformed escapes: the pattern, the expected error span and the
    // expected error kind.
    let failures = [
        (r"\xF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\xG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\xFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
    ];
    for (pat, range, kind) in failures {
        let err = parser(pat).parse_escape().unwrap_err();
        assert_eq!(err, TestError { span: span(range), kind });
    }
}
4773
#[test]
fn parse_hex_four() {
    // Every four-digit `\u` escape whose value `char::from_u32` accepts;
    // values it rejects are skipped.
    let scalars =
        (0..65536).filter_map(|i| char::from_u32(i).map(|c| (i, c)));
    for (i, c) in scalars {
        let pat = format!(r"\u{:04x}", i);
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(
                ast::HexLiteralKind::UnicodeShort,
            ),
            c,
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Malformed escapes: the pattern, the expected error span and the
    // expected error kind. The last entry has four valid hex digits but
    // is still rejected as an invalid value.
    let failures = [
        (r"\uF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\uG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFG", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFFG", 5..6, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uD800", 2..6, ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pat, range, kind) in failures {
        let err = parser(pat).parse_escape().unwrap_err();
        assert_eq!(err, TestError { span: span(range), kind });
    }
}
4837
#[test]
fn parse_hex_eight() {
    // Eight-digit `\U` escapes for every value below 65536 that
    // `char::from_u32` accepts; values it rejects are skipped.
    let scalars =
        (0..65536).filter_map(|i| char::from_u32(i).map(|c| (i, c)));
    for (i, c) in scalars {
        let pat = format!(r"\U{:08x}", i);
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(
                ast::HexLiteralKind::UnicodeLong,
            ),
            c,
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Running out of input after a single digit.
    assert_eq!(
        parser(r"\UF").parse_escape().unwrap_err(),
        TestError {
            span: span(3..3),
            kind: ast::ErrorKind::EscapeUnexpectedEof,
        }
    );

    // A non-hex character in each of the eight digit positions. The error
    // span is the offending character itself.
    let bad_digits = [
        (r"\UG", 2..3),
        (r"\UFG", 3..4),
        (r"\UFFG", 4..5),
        (r"\UFFFG", 5..6),
        (r"\UFFFFG", 6..7),
        (r"\UFFFFFG", 7..8),
        (r"\UFFFFFFG", 8..9),
        (r"\UFFFFFFFG", 9..10),
    ];
    for (pat, range) in bad_digits {
        let err = parser(pat).parse_escape().unwrap_err();
        assert_eq!(
            err,
            TestError {
                span: span(range),
                kind: ast::ErrorKind::EscapeHexInvalidDigit,
            }
        );
    }
}
4922
#[test]
fn parse_hex_brace() {
    // Well-formed brace escapes: the pattern, the end offset of the
    // expected span (which starts at 0), the escape letter's kind and the
    // expected character. Mixed-case hex digits are included.
    let accepted = [
        (r"\u{26c4}", 8, ast::HexLiteralKind::UnicodeShort, '⛄'),
        (r"\U{26c4}", 8, ast::HexLiteralKind::UnicodeLong, '⛄'),
        (r"\x{26c4}", 8, ast::HexLiteralKind::X, '⛄'),
        (r"\x{26C4}", 8, ast::HexLiteralKind::X, '⛄'),
        (r"\x{10fFfF}", 10, ast::HexLiteralKind::X, '\u{10FFFF}'),
    ];
    for (pat, end, hex, c) in accepted {
        let expected = ast::Literal {
            span: span(0..end),
            kind: ast::LiteralKind::HexBrace(hex),
            c,
        };
        assert_eq!(
            parser(pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // Malformed brace escapes: the pattern, the expected error span and
    // the expected error kind.
    let failures = [
        // Input ends before the escape is complete.
        (r"\x", 2..2, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{", 2..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{FF", 2..5, ast::ErrorKind::EscapeUnexpectedEof),
        // Nothing between the braces.
        (r"\x{}", 2..4, ast::ErrorKind::EscapeHexEmpty),
        // A non-hex character between the braces.
        (r"\x{FGF}", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        // Only hex digits, but the value is rejected.
        (r"\x{FFFFFF}", 3..9, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{D800}", 3..7, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{FFFFFFFFF}", 3..12, ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pat, range, kind) in failures {
        let err = parser(pat).parse_escape().unwrap_err();
        assert_eq!(err, TestError { span: span(range), kind });
    }
}
5027
#[test]
fn parse_decimal() {
    // Inputs that parse, including one with a leading zero.
    for (input, value) in [("123", 123), ("0", 0), ("01", 1)] {
        assert_eq!(parser(input).parse_decimal(), Ok(value));
    }

    // Inputs that fail: the input, the expected error span and the
    // expected error kind. A leading `-` and the empty string both report
    // an empty decimal at offset 0.
    let failures = [
        ("-1", 0..0, ast::ErrorKind::DecimalEmpty),
        ("", 0..0, ast::ErrorKind::DecimalEmpty),
        ("9999999999", 0..10, ast::ErrorKind::DecimalInvalid),
    ];
    for (input, range, kind) in failures {
        let err = parser(input).parse_decimal().unwrap_err();
        assert_eq!(err, TestError { span: span(range), kind });
    }
}
5050
#[test]
fn parse_set_class() {
    /// A non-negated bracketed class with the given contents.
    fn bracketed(span: Span, kind: ast::ClassSet) -> ast::ClassBracketed {
        ast::ClassBracketed { span, negated: false, kind }
    }

    /// A non-negated bracketed class as a complete AST.
    fn class(span: Span, kind: ast::ClassSet) -> Ast {
        Ast::class_bracketed(bracketed(span, kind))
    }

    /// A union of items, built through `ClassSet::union`.
    fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
        ast::ClassSet::union(ast::ClassSetUnion { span, items })
    }

    /// A binary set operation of the given kind.
    fn binop(
        kind: ast::ClassSetBinaryOpKind,
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
            span,
            kind,
            lhs: Box::new(lhs),
            rhs: Box::new(rhs),
        })
    }

    fn intersection(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::Intersection, span, lhs, rhs)
    }

    fn difference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::Difference, span, lhs, rhs)
    }

    fn symdifference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::SymmetricDifference, span, lhs, rhs)
    }

    /// A set consisting of exactly one item.
    fn one(item: ast::ClassSetItem) -> ast::ClassSet {
        ast::ClassSet::Item(item)
    }

    /// A non-negated ASCII class item such as `[:alnum:]`.
    fn ascii(span: Span, kind: ast::ClassAsciiKind) -> ast::ClassSetItem {
        ast::ClassSetItem::Ascii(ast::ClassAscii {
            span,
            kind,
            negated: false,
        })
    }

    /// The non-negated Perl word class item, `\w`.
    fn word(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Perl(ast::ClassPerl {
            span,
            kind: ast::ClassPerlKind::Word,
            negated: false,
        })
    }

    /// A literal item of an arbitrary literal kind.
    fn literal(
        span: Span,
        kind: ast::LiteralKind,
        c: char,
    ) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(ast::Literal { span, kind, c })
    }

    /// A verbatim literal item.
    fn lit(span: Span, c: char) -> ast::ClassSetItem {
        literal(span, ast::LiteralKind::Verbatim, c)
    }

    /// An escaped meta character item, e.g. `\]`.
    fn meta(span: Span, c: char) -> ast::ClassSetItem {
        literal(span, ast::LiteralKind::Meta, c)
    }

    fn empty(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Empty(span)
    }

    /// A range item whose two endpoints are verbatim literals. The first
    /// literal occupies the start of `span` and the second occupies its
    /// end; each is taken to be one column wide.
    fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
        // Position just past the first endpoint.
        let mut after_start = span.start;
        after_start.offset += start.len_utf8();
        after_start.column += 1;

        // Position where the second endpoint begins.
        let mut before_end = span.end;
        before_end.offset -= end.len_utf8();
        before_end.column -= 1;

        ast::ClassSetItem::Range(ast::ClassSetRange {
            span,
            start: ast::Literal {
                span: Span { end: after_start, ..span },
                kind: ast::LiteralKind::Verbatim,
                c: start,
            },
            end: ast::Literal {
                span: Span { start: before_end, ..span },
                kind: ast::LiteralKind::Verbatim,
                c: end,
            },
        })
    }

    // ASCII classes, alone and nested one level deeper.
    assert_eq!(
        parser("[[:alnum:]]").parse(),
        Ok(class(
            span(0..11),
            one(ascii(span(1..10), ast::ClassAsciiKind::Alnum)),
        ))
    );
    assert_eq!(
        parser("[[[:alnum:]]]").parse(),
        Ok(class(
            span(0..13),
            one(ast::ClassSetItem::Bracketed(Box::new(bracketed(
                span(1..12),
                one(ascii(span(2..11), ast::ClassAsciiKind::Alnum)),
            )))),
        ))
    );

    // The three binary operators applied to the same two ASCII classes.
    // Only the operator kind differs between the expected results.
    let operators = [
        (
            "[[:alnum:]&&[:lower:]]",
            ast::ClassSetBinaryOpKind::Intersection,
        ),
        (
            "[[:alnum:]--[:lower:]]",
            ast::ClassSetBinaryOpKind::Difference,
        ),
        (
            "[[:alnum:]~~[:lower:]]",
            ast::ClassSetBinaryOpKind::SymmetricDifference,
        ),
    ];
    for (pat, kind) in operators {
        assert_eq!(
            parser(pat).parse(),
            Ok(class(
                span(0..22),
                binop(
                    kind,
                    span(1..21),
                    one(ascii(span(1..10), ast::ClassAsciiKind::Alnum)),
                    one(ascii(span(12..21), ast::ClassAsciiKind::Lower)),
                ),
            ))
        );
    }

    // Plain literals, escaped meta characters and hyphens that do not form
    // a range.
    assert_eq!(
        parser("[a]").parse(),
        Ok(class(span(0..3), one(lit(span(1..2), 'a'))))
    );
    assert_eq!(
        parser(r"[a\]]").parse(),
        Ok(class(
            span(0..5),
            union(
                span(1..4),
                vec![lit(span(1..2), 'a'), meta(span(2..4), ']')],
            ),
        ))
    );
    assert_eq!(
        parser(r"[a\-z]").parse(),
        Ok(class(
            span(0..6),
            union(
                span(1..5),
                vec![
                    lit(span(1..2), 'a'),
                    meta(span(2..4), '-'),
                    lit(span(4..5), 'z'),
                ],
            ),
        ))
    );
    assert_eq!(
        parser("[ab]").parse(),
        Ok(class(
            span(0..4),
            union(
                span(1..3),
                vec![lit(span(1..2), 'a'), lit(span(2..3), 'b')],
            ),
        ))
    );
    assert_eq!(
        parser("[a-]").parse(),
        Ok(class(
            span(0..4),
            union(
                span(1..3),
                vec![lit(span(1..2), 'a'), lit(span(2..3), '-')],
            ),
        ))
    );
    assert_eq!(
        parser("[-a]").parse(),
        Ok(class(
            span(0..4),
            union(
                span(1..3),
                vec![lit(span(1..2), '-'), lit(span(2..3), 'a')],
            ),
        ))
    );

    // Unicode and Perl classes as set items.
    assert_eq!(
        parser(r"[\pL]").parse(),
        Ok(class(
            span(0..5),
            one(ast::ClassSetItem::Unicode(ast::ClassUnicode {
                span: span(1..4),
                negated: false,
                kind: ast::ClassUnicodeKind::OneLetter('L'),
            })),
        ))
    );
    assert_eq!(
        parser(r"[\w]").parse(),
        Ok(class(span(0..4), one(word(span(1..3)))))
    );
    assert_eq!(
        parser(r"[a\wz]").parse(),
        Ok(class(
            span(0..6),
            union(
                span(1..5),
                vec![
                    lit(span(1..2), 'a'),
                    word(span(2..4)),
                    lit(span(4..5), 'z'),
                ],
            ),
        ))
    );

    // Ranges, alone and combined with binary operators.
    assert_eq!(
        parser("[a-z]").parse(),
        Ok(class(span(0..5), one(range(span(1..4), 'a', 'z'))))
    );
    assert_eq!(
        parser("[a-cx-z]").parse(),
        Ok(class(
            span(0..8),
            union(
                span(1..7),
                vec![
                    range(span(1..4), 'a', 'c'),
                    range(span(4..7), 'x', 'z'),
                ],
            ),
        ))
    );
    assert_eq!(
        parser(r"[\w&&a-cx-z]").parse(),
        Ok(class(
            span(0..12),
            intersection(
                span(1..11),
                one(word(span(1..3))),
                union(
                    span(5..11),
                    vec![
                        range(span(5..8), 'a', 'c'),
                        range(span(8..11), 'x', 'z'),
                    ],
                ),
            ),
        ))
    );
    assert_eq!(
        parser(r"[a-cx-z&&\w]").parse(),
        Ok(class(
            span(0..12),
            intersection(
                span(1..11),
                union(
                    span(1..7),
                    vec![
                        range(span(1..4), 'a', 'c'),
                        range(span(4..7), 'x', 'z'),
                    ],
                ),
                one(word(span(9..11))),
            ),
        ))
    );

    // Chained operators nest to the left.
    assert_eq!(
        parser(r"[a--b--c]").parse(),
        Ok(class(
            span(0..9),
            difference(
                span(1..8),
                difference(
                    span(1..5),
                    one(lit(span(1..2), 'a')),
                    one(lit(span(4..5), 'b')),
                ),
                one(lit(span(7..8), 'c')),
            ),
        ))
    );
    assert_eq!(
        parser(r"[a~~b~~c]").parse(),
        Ok(class(
            span(0..9),
            symdifference(
                span(1..8),
                symdifference(
                    span(1..5),
                    one(lit(span(1..2), 'a')),
                    one(lit(span(4..5), 'b')),
                ),
                one(lit(span(7..8), 'c')),
            ),
        ))
    );

    // Operands that are themselves `^` or `&`, and operands that are
    // empty.
    assert_eq!(
        parser(r"[\^&&^]").parse(),
        Ok(class(
            span(0..7),
            intersection(
                span(1..6),
                one(meta(span(1..3), '^')),
                one(lit(span(5..6), '^')),
            ),
        ))
    );
    assert_eq!(
        parser(r"[\&&&&]").parse(),
        Ok(class(
            span(0..7),
            intersection(
                span(1..6),
                one(meta(span(1..3), '&')),
                one(lit(span(5..6), '&')),
            ),
        ))
    );
    assert_eq!(
        parser(r"[&&&&]").parse(),
        Ok(class(
            span(0..6),
            intersection(
                span(1..5),
                intersection(
                    span(1..3),
                    one(empty(span(1..1))),
                    one(empty(span(3..3))),
                ),
                one(empty(span(5..5))),
            ),
        ))
    );

    // A range between multi-byte characters. The spans come from
    // `span_range` rather than `span`, and the endpoint literals are
    // spelled out here instead of going through the `range` helper.
    let pat = "[☃-⛄]";
    assert_eq!(
        parser(pat).parse(),
        Ok(class(
            span_range(pat, 0..9),
            one(ast::ClassSetItem::Range(ast::ClassSetRange {
                span: span_range(pat, 1..8),
                start: ast::Literal {
                    span: span_range(pat, 1..4),
                    kind: ast::LiteralKind::Verbatim,
                    c: '☃',
                },
                end: ast::Literal {
                    span: span_range(pat, 5..8),
                    kind: ast::LiteralKind::Verbatim,
                    c: '⛄',
                },
            })),
        ))
    );

    // A `]` directly after the opening bracket is a literal.
    assert_eq!(
        parser(r"[]]").parse(),
        Ok(class(span(0..3), one(lit(span(1..2), ']'))))
    );
    assert_eq!(
        parser(r"[]\[]").parse(),
        Ok(class(
            span(0..5),
            union(
                span(1..4),
                vec![lit(span(1..2), ']'), meta(span(2..4), '[')],
            ),
        ))
    );
    // Here the first `]` closes the class, so the second one is a literal
    // that follows it.
    assert_eq!(
        parser(r"[\[]]").parse(),
        Ok(concat(
            0..5,
            vec![
                class(span(0..4), one(meta(span(1..3), '['))),
                Ast::literal(ast::Literal {
                    span: span(4..5),
                    kind: ast::LiteralKind::Verbatim,
                    c: ']',
                }),
            ]
        ))
    );

    // Patterns that must fail with the default parser: the pattern, the
    // expected error span and the expected error kind.
    let failures = [
        ("[", 0..1, ast::ErrorKind::ClassUnclosed),
        ("[[", 1..2, ast::ErrorKind::ClassUnclosed),
        ("[[-]", 0..1, ast::ErrorKind::ClassUnclosed),
        ("[[[:alnum:]", 1..2, ast::ErrorKind::ClassUnclosed),
        (r"[\b]", 1..3, ast::ErrorKind::ClassEscapeInvalid),
        (r"[\w-a]", 1..3, ast::ErrorKind::ClassRangeLiteral),
        (r"[a-\w]", 3..5, ast::ErrorKind::ClassRangeLiteral),
        (r"[z-a]", 1..4, ast::ErrorKind::ClassRangeInvalid),
    ];
    for (pat, range, kind) in failures {
        let err = parser(pat).parse().unwrap_err();
        assert_eq!(err, TestError { span: span(range), kind });
    }

    // With whitespace ignored, a class that is never closed is still
    // reported at its opening bracket.
    for pat in ["[a ", "[a- "] {
        let err = parser_ignore_whitespace(pat).parse().unwrap_err();
        assert_eq!(
            err,
            TestError {
                span: span(0..1),
                kind: ast::ErrorKind::ClassUnclosed,
            }
        );
    }
}
5633
5634 #[test]
5635 fn parse_set_class_open() {
5636 assert_eq!(parser("[a]").parse_set_class_open(), {
5637 let set = ast::ClassBracketed {
5638 span: span(0..1),
5639 negated: false,
5640 kind: ast::ClassSet::union(ast::ClassSetUnion {
5641 span: span(1..1),
5642 items: vec![],
5643 }),
5644 };
5645 let union = ast::ClassSetUnion { span: span(1..1), items: vec![] };
5646 Ok((set, union))
5647 });
5648 assert_eq!(
5649 parser_ignore_whitespace("[ a]").parse_set_class_open(),
5650 {
5651 let set = ast::ClassBracketed {
5652 span: span(0..4),
5653 negated: false,
5654 kind: ast::ClassSet::union(ast::ClassSetUnion {
5655 span: span(4..4),
5656 items: vec![],
5657 }),
5658 };
5659 let union =
5660 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5661 Ok((set, union))
5662 }
5663 );
5664 assert_eq!(parser("[^a]").parse_set_class_open(), {
5665 let set = ast::ClassBracketed {
5666 span: span(0..2),
5667 negated: true,
5668 kind: ast::ClassSet::union(ast::ClassSetUnion {
5669 span: span(2..2),
5670 items: vec![],
5671 }),
5672 };
5673 let union = ast::ClassSetUnion { span: span(2..2), items: vec![] };
5674 Ok((set, union))
5675 });
5676 assert_eq!(
5677 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
5678 {
5679 let set = ast::ClassBracketed {
5680 span: span(0..4),
5681 negated: true,
5682 kind: ast::ClassSet::union(ast::ClassSetUnion {
5683 span: span(4..4),
5684 items: vec![],
5685 }),
5686 };
5687 let union =
5688 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5689 Ok((set, union))
5690 }
5691 );
5692 assert_eq!(parser("[-a]").parse_set_class_open(), {
5693 let set = ast::ClassBracketed {
5694 span: span(0..2),
5695 negated: false,
5696 kind: ast::ClassSet::union(ast::ClassSetUnion {
5697 span: span(1..1),
5698 items: vec![],
5699 }),
5700 };
5701 let union = ast::ClassSetUnion {
5702 span: span(1..2),
5703 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5704 span: span(1..2),
5705 kind: ast::LiteralKind::Verbatim,
5706 c: '-',
5707 })],
5708 };
5709 Ok((set, union))
5710 });
5711 assert_eq!(
5712 parser_ignore_whitespace("[ - a]").parse_set_class_open(),
5713 {
5714 let set = ast::ClassBracketed {
5715 span: span(0..4),
5716 negated: false,
5717 kind: ast::ClassSet::union(ast::ClassSetUnion {
5718 span: span(2..2),
5719 items: vec![],
5720 }),
5721 };
5722 let union = ast::ClassSetUnion {
5723 span: span(2..3),
5724 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5725 span: span(2..3),
5726 kind: ast::LiteralKind::Verbatim,
5727 c: '-',
5728 })],
5729 };
5730 Ok((set, union))
5731 }
5732 );
5733 assert_eq!(parser("[^-a]").parse_set_class_open(), {
5734 let set = ast::ClassBracketed {
5735 span: span(0..3),
5736 negated: true,
5737 kind: ast::ClassSet::union(ast::ClassSetUnion {
5738 span: span(2..2),
5739 items: vec![],
5740 }),
5741 };
5742 let union = ast::ClassSetUnion {
5743 span: span(2..3),
5744 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5745 span: span(2..3),
5746 kind: ast::LiteralKind::Verbatim,
5747 c: '-',
5748 })],
5749 };
5750 Ok((set, union))
5751 });
5752 assert_eq!(parser("[--a]").parse_set_class_open(), {
5753 let set = ast::ClassBracketed {
5754 span: span(0..3),
5755 negated: false,
5756 kind: ast::ClassSet::union(ast::ClassSetUnion {
5757 span: span(1..1),
5758 items: vec![],
5759 }),
5760 };
5761 let union = ast::ClassSetUnion {
5762 span: span(1..3),
5763 items: vec![
5764 ast::ClassSetItem::Literal(ast::Literal {
5765 span: span(1..2),
5766 kind: ast::LiteralKind::Verbatim,
5767 c: '-',
5768 }),
5769 ast::ClassSetItem::Literal(ast::Literal {
5770 span: span(2..3),
5771 kind: ast::LiteralKind::Verbatim,
5772 c: '-',
5773 }),
5774 ],
5775 };
5776 Ok((set, union))
5777 });
5778 assert_eq!(parser("[]a]").parse_set_class_open(), {
5779 let set = ast::ClassBracketed {
5780 span: span(0..2),
5781 negated: false,
5782 kind: ast::ClassSet::union(ast::ClassSetUnion {
5783 span: span(1..1),
5784 items: vec![],
5785 }),
5786 };
5787 let union = ast::ClassSetUnion {
5788 span: span(1..2),
5789 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5790 span: span(1..2),
5791 kind: ast::LiteralKind::Verbatim,
5792 c: ']',
5793 })],
5794 };
5795 Ok((set, union))
5796 });
5797 assert_eq!(
5798 parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
5799 {
5800 let set = ast::ClassBracketed {
5801 span: span(0..4),
5802 negated: false,
5803 kind: ast::ClassSet::union(ast::ClassSetUnion {
5804 span: span(2..2),
5805 items: vec![],
5806 }),
5807 };
5808 let union = ast::ClassSetUnion {
5809 span: span(2..3),
5810 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5811 span: span(2..3),
5812 kind: ast::LiteralKind::Verbatim,
5813 c: ']',
5814 })],
5815 };
5816 Ok((set, union))
5817 }
5818 );
5819 assert_eq!(parser("[^]a]").parse_set_class_open(), {
5820 let set = ast::ClassBracketed {
5821 span: span(0..3),
5822 negated: true,
5823 kind: ast::ClassSet::union(ast::ClassSetUnion {
5824 span: span(2..2),
5825 items: vec![],
5826 }),
5827 };
5828 let union = ast::ClassSetUnion {
5829 span: span(2..3),
5830 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5831 span: span(2..3),
5832 kind: ast::LiteralKind::Verbatim,
5833 c: ']',
5834 })],
5835 };
5836 Ok((set, union))
5837 });
5838 assert_eq!(parser("[-]a]").parse_set_class_open(), {
5839 let set = ast::ClassBracketed {
5840 span: span(0..2),
5841 negated: false,
5842 kind: ast::ClassSet::union(ast::ClassSetUnion {
5843 span: span(1..1),
5844 items: vec![],
5845 }),
5846 };
5847 let union = ast::ClassSetUnion {
5848 span: span(1..2),
5849 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5850 span: span(1..2),
5851 kind: ast::LiteralKind::Verbatim,
5852 c: '-',
5853 })],
5854 };
5855 Ok((set, union))
5856 });
5857
5858 assert_eq!(
5859 parser("[").parse_set_class_open().unwrap_err(),
5860 TestError {
5861 span: span(0..1),
5862 kind: ast::ErrorKind::ClassUnclosed,
5863 }
5864 );
5865 assert_eq!(
5866 parser_ignore_whitespace("[ ")
5867 .parse_set_class_open()
5868 .unwrap_err(),
5869 TestError {
5870 span: span(0..5),
5871 kind: ast::ErrorKind::ClassUnclosed,
5872 }
5873 );
5874 assert_eq!(
5875 parser("[^").parse_set_class_open().unwrap_err(),
5876 TestError {
5877 span: span(0..2),
5878 kind: ast::ErrorKind::ClassUnclosed,
5879 }
5880 );
5881 assert_eq!(
5882 parser("[]").parse_set_class_open().unwrap_err(),
5883 TestError {
5884 span: span(0..2),
5885 kind: ast::ErrorKind::ClassUnclosed,
5886 }
5887 );
5888 assert_eq!(
5889 parser("[-").parse_set_class_open().unwrap_err(),
5890 TestError {
5891 span: span(0..0),
5892 kind: ast::ErrorKind::ClassUnclosed,
5893 }
5894 );
5895 assert_eq!(
5896 parser("[--").parse_set_class_open().unwrap_err(),
5897 TestError {
5898 span: span(0..0),
5899 kind: ast::ErrorKind::ClassUnclosed,
5900 }
5901 );
5902
5903 // See: https://github.com/rust-lang/regex/issues/792
5904 assert_eq!(
5905 parser("(?x)[-#]").parse_with_comments().unwrap_err(),
5906 TestError {
5907 span: span(4..4),
5908 kind: ast::ErrorKind::ClassUnclosed,
5909 }
5910 );
5911 }
5912
// Exercises `maybe_parse_ascii_class` on several spellings of `[:alnum:]`.
// Accepted inputs must produce the expected class and span; rejected inputs
// must produce `None` with the parser offset still reported as 0.
#[test]
fn maybe_parse_ascii_class() {
    // Builds the expected `alnum` class covering `0..end`.
    let alnum = |end: usize, negated: bool| {
        Some(ast::ClassAscii {
            span: span(0..end),
            kind: ast::ClassAsciiKind::Alnum,
            negated,
        })
    };
    // Attempts the parse first, then pairs the result with the offset the
    // parser reports afterwards.
    let attempt = |pattern: &str| {
        let p = parser(pattern);
        let class = p.maybe_parse_ascii_class();
        (class, p.offset())
    };

    assert_eq!(
        parser(r"[:alnum:]").maybe_parse_ascii_class(),
        alnum(9, false)
    );
    // Input following the class is not included in its span.
    assert_eq!(
        parser(r"[:alnum:]A").maybe_parse_ascii_class(),
        alnum(9, false)
    );
    assert_eq!(
        parser(r"[:^alnum:]").maybe_parse_ascii_class(),
        alnum(10, true)
    );

    // Input ends right after the opening `[:` / `[:^`.
    assert_eq!(attempt(r"[:"), (None, 0));
    assert_eq!(attempt(r"[:^"), (None, 0));
    // `^` appears before the `:` instead of after it.
    assert_eq!(attempt(r"[^:alnum:]"), (None, 0));
    // Misspelled class name.
    assert_eq!(attempt(r"[:alnnum:]"), (None, 0));
    // Closing `:` is missing.
    assert_eq!(attempt(r"[:alnum]"), (None, 0));
    // Closing `]` is missing.
    assert_eq!(attempt(r"[:alnum:"), (None, 0));
}
5964
// Exercises `\p` / `\P` Unicode class escapes: one-letter and bracketed
// names, the `name:value`, `name=value` and `name!=value` forms (including
// empty names and values), truncated inputs, and classes embedded in a
// larger pattern.
#[test]
fn parse_unicode_class() {
    // Expected class spanning `0..end`.
    let class = |end: usize, negated: bool, kind: ast::ClassUnicodeKind| {
        ast::ClassUnicode { span: span(0..end), negated, kind }
    };
    let named = |name: &str| ast::ClassUnicodeKind::Named(s(name));
    let named_value =
        |op: ast::ClassUnicodeOpKind, name: &str, value: &str| {
            ast::ClassUnicodeKind::NamedValue {
                op,
                name: s(name),
                value: s(value),
            }
        };
    // Expected error for input that ends in the middle of the escape; the
    // span is empty and sits at `at`.
    let eof_at = |at: usize| TestError {
        span: span(at..at),
        kind: ast::ErrorKind::EscapeUnexpectedEof,
    };
    // Expected error when `\p` / `\P` is followed by another escape.
    let invalid_class = || TestError {
        span: span(2..3),
        kind: ast::ErrorKind::UnicodeClassInvalid,
    };
    // A verbatim `z` literal occupying `start..start + 1`.
    let literal_z = |start: usize| {
        Ast::literal(ast::Literal {
            span: span(start..start + 1),
            kind: ast::LiteralKind::Verbatim,
            c: 'z',
        })
    };

    // One-letter and bracketed names, plain and negated.
    assert_eq!(
        parser(r"\pN").parse_escape(),
        Ok(Primitive::Unicode(class(
            3,
            false,
            ast::ClassUnicodeKind::OneLetter('N')
        )))
    );
    assert_eq!(
        parser(r"\PN").parse_escape(),
        Ok(Primitive::Unicode(class(
            3,
            true,
            ast::ClassUnicodeKind::OneLetter('N')
        )))
    );
    assert_eq!(
        parser(r"\p{N}").parse_escape(),
        Ok(Primitive::Unicode(class(5, false, named("N"))))
    );
    assert_eq!(
        parser(r"\P{N}").parse_escape(),
        Ok(Primitive::Unicode(class(5, true, named("N"))))
    );
    assert_eq!(
        parser(r"\p{Greek}").parse_escape(),
        Ok(Primitive::Unicode(class(9, false, named("Greek"))))
    );

    // `name<op>value` forms.
    assert_eq!(
        parser(r"\p{scx:Katakana}").parse_escape(),
        Ok(Primitive::Unicode(class(
            16,
            false,
            named_value(ast::ClassUnicodeOpKind::Colon, "scx", "Katakana")
        )))
    );
    assert_eq!(
        parser(r"\p{scx=Katakana}").parse_escape(),
        Ok(Primitive::Unicode(class(
            16,
            false,
            named_value(ast::ClassUnicodeOpKind::Equal, "scx", "Katakana")
        )))
    );
    assert_eq!(
        parser(r"\p{scx!=Katakana}").parse_escape(),
        Ok(Primitive::Unicode(class(
            17,
            false,
            named_value(ast::ClassUnicodeOpKind::NotEqual, "scx", "Katakana")
        )))
    );

    // The operator alone is accepted, giving an empty name and value.
    assert_eq!(
        parser(r"\p{:}").parse_escape(),
        Ok(Primitive::Unicode(class(
            5,
            false,
            named_value(ast::ClassUnicodeOpKind::Colon, "", "")
        )))
    );
    assert_eq!(
        parser(r"\p{=}").parse_escape(),
        Ok(Primitive::Unicode(class(
            5,
            false,
            named_value(ast::ClassUnicodeOpKind::Equal, "", "")
        )))
    );
    assert_eq!(
        parser(r"\p{!=}").parse_escape(),
        Ok(Primitive::Unicode(class(
            6,
            false,
            named_value(ast::ClassUnicodeOpKind::NotEqual, "", "")
        )))
    );

    // Truncated escapes report an unexpected EOF at the end of the input.
    assert_eq!(parser(r"\p").parse_escape().unwrap_err(), eof_at(2));
    assert_eq!(parser(r"\p{").parse_escape().unwrap_err(), eof_at(3));
    assert_eq!(parser(r"\p{N").parse_escape().unwrap_err(), eof_at(4));
    assert_eq!(parser(r"\p{Greek").parse_escape().unwrap_err(), eof_at(8));

    // A class followed by a literal parses as a two-element concatenation.
    assert_eq!(
        parser(r"\pNz").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::class_unicode(class(
                    3,
                    false,
                    ast::ClassUnicodeKind::OneLetter('N')
                )),
                literal_z(3),
            ],
        }))
    );
    assert_eq!(
        parser(r"\p{Greek}z").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..10),
            asts: vec![
                Ast::class_unicode(class(9, false, named("Greek"))),
                literal_z(9),
            ],
        }))
    );

    // An escaped `{` right after `\p` / `\P` is rejected.
    assert_eq!(parser(r"\p\{").parse().unwrap_err(), invalid_class());
    assert_eq!(parser(r"\P\{").parse().unwrap_err(), invalid_class());
}
6162
// Exercises the Perl class escapes `\d`, `\s`, `\w` and their negated
// upper-case forms, first through `parse_escape` and then through a full
// `parse`.
#[test]
fn parse_perl_class() {
    // Every escape under test is two bytes long, so the span is always
    // `0..2`.
    let perl = |kind: ast::ClassPerlKind, negated: bool| ast::ClassPerl {
        span: span(0..2),
        kind,
        negated,
    };

    assert_eq!(
        parser(r"\d").parse_escape(),
        Ok(Primitive::Perl(perl(ast::ClassPerlKind::Digit, false)))
    );
    assert_eq!(
        parser(r"\D").parse_escape(),
        Ok(Primitive::Perl(perl(ast::ClassPerlKind::Digit, true)))
    );
    assert_eq!(
        parser(r"\s").parse_escape(),
        Ok(Primitive::Perl(perl(ast::ClassPerlKind::Space, false)))
    );
    assert_eq!(
        parser(r"\S").parse_escape(),
        Ok(Primitive::Perl(perl(ast::ClassPerlKind::Space, true)))
    );
    assert_eq!(
        parser(r"\w").parse_escape(),
        Ok(Primitive::Perl(perl(ast::ClassPerlKind::Word, false)))
    );
    assert_eq!(
        parser(r"\W").parse_escape(),
        Ok(Primitive::Perl(perl(ast::ClassPerlKind::Word, true)))
    );

    // A lone class parses to a bare class node.
    assert_eq!(
        parser(r"\d").parse(),
        Ok(Ast::class_perl(perl(ast::ClassPerlKind::Digit, false)))
    );
    // A class followed by a literal parses to a concatenation of the two.
    assert_eq!(
        parser(r"\dz").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..3),
            asts: vec![
                Ast::class_perl(perl(ast::ClassPerlKind::Digit, false)),
                Ast::literal(ast::Literal {
                    span: span(2..3),
                    kind: ast::LiteralKind::Verbatim,
                    c: 'z',
                }),
            ],
        }))
    );
}
6241
// Regression test for a bug in which the nest limit checker did not
// decrement its depth during post-traversal, so long regexes tripped the
// default limit too aggressively. The pattern below must parse with a nest
// limit of 50.
#[test]
fn regression_454_nest_too_big() {
    let long_alternation = r#"
    2(?:
      [45]\d{3}|
      7(?:
        1[0-267]|
        2[0-289]|
        3[0-29]|
        4[01]|
        5[1-3]|
        6[013]|
        7[0178]|
        91
      )|
      8(?:
        0[125]|
        [139][1-6]|
        2[0157-9]|
        41|
        6[1-35]|
        7[1-5]|
        8[1-8]|
        90
      )|
      9(?:
        0[0-2]|
        1[0-4]|
        2[568]|
        3[3-6]|
        5[5-7]|
        6[0167]|
        7[15]|
        8[0146-9]
      )
    )\d{4}
    "#;
    let outcome = parser_nest_limit(long_alternation, 50).parse();
    assert!(outcome.is_ok());
}
6284
// Regression test: a trailing `-` in a character class is a literal `-`,
// even when whitespace mode is enabled and whitespace (or a comment)
// follows the `-`. Classes that are never closed must still be rejected.
#[test]
fn regression_455_trailing_dash_ignore_whitespace() {
    // Whether `pattern` parses successfully.
    let parses = |pattern: &str| parser(pattern).parse().is_ok();

    // Closed classes ending in `-`.
    assert!(parses("(?x)[ / - ]"));
    assert!(parses("(?x)[ a - ]"));
    assert!(parses(
        "(?x)[
        a
        - ]
    "
    ));
    assert!(parses(
        "(?x)[
        a # wat
        - ]
    "
    ));

    // The same shapes without the closing `]`.
    assert!(!parses("(?x)[ / -"));
    assert!(!parses("(?x)[ / - "));
    assert!(!parses(
        "(?x)[
        / -
    "
    ));
    assert!(!parses(
        "(?x)[
        / - # wat
    "
    ));
}
6326}
6327