1 | /*! |
2 | This module provides a regular expression parser. |
3 | */ |
4 | |
5 | use core::{ |
6 | borrow::Borrow, |
7 | cell::{Cell, RefCell}, |
8 | mem, |
9 | }; |
10 | |
11 | use alloc::{ |
12 | boxed::Box, |
13 | string::{String, ToString}, |
14 | vec, |
15 | vec::Vec, |
16 | }; |
17 | |
18 | use crate::{ |
19 | ast::{self, Ast, Position, Span}, |
20 | either::Either, |
21 | is_escapeable_character, is_meta_character, |
22 | }; |
23 | |
24 | type Result<T> = core::result::Result<T, ast::Error>; |
25 | |
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
///
/// Every variant carries (or is) a span, which `Primitive::span` exposes.
#[derive (Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal. This is the only primitive accepted as a class range
    /// endpoint (see `into_class_literal`).
    Literal(ast::Literal),
    /// An assertion. Rejected as a character class item.
    Assertion(ast::Assertion),
    /// The dot, represented only by its span. Rejected as a character class
    /// item.
    Dot(Span),
    /// A Perl character class.
    Perl(ast::ClassPerl),
    /// A Unicode character class.
    Unicode(ast::ClassUnicode),
}
40 | |
41 | impl Primitive { |
42 | /// Return the span of this primitive. |
43 | fn span(&self) -> &Span { |
44 | match *self { |
45 | Primitive::Literal(ref x) => &x.span, |
46 | Primitive::Assertion(ref x) => &x.span, |
47 | Primitive::Dot(ref span) => span, |
48 | Primitive::Perl(ref x) => &x.span, |
49 | Primitive::Unicode(ref x) => &x.span, |
50 | } |
51 | } |
52 | |
53 | /// Convert this primitive into a proper AST. |
54 | fn into_ast(self) -> Ast { |
55 | match self { |
56 | Primitive::Literal(lit) => Ast::literal(lit), |
57 | Primitive::Assertion(assert) => Ast::assertion(assert), |
58 | Primitive::Dot(span) => Ast::dot(span), |
59 | Primitive::Perl(cls) => Ast::class_perl(cls), |
60 | Primitive::Unicode(cls) => Ast::class_unicode(cls), |
61 | } |
62 | } |
63 | |
64 | /// Convert this primitive into an item in a character class. |
65 | /// |
66 | /// If this primitive is not a legal item (i.e., an assertion or a dot), |
67 | /// then return an error. |
68 | fn into_class_set_item<P: Borrow<Parser>>( |
69 | self, |
70 | p: &ParserI<'_, P>, |
71 | ) -> Result<ast::ClassSetItem> { |
72 | use self::Primitive::*; |
73 | use crate::ast::ClassSetItem; |
74 | |
75 | match self { |
76 | Literal(lit) => Ok(ClassSetItem::Literal(lit)), |
77 | Perl(cls) => Ok(ClassSetItem::Perl(cls)), |
78 | Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), |
79 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), |
80 | } |
81 | } |
82 | |
83 | /// Convert this primitive into a literal in a character class. In |
84 | /// particular, literals are the only valid items that can appear in |
85 | /// ranges. |
86 | /// |
87 | /// If this primitive is not a legal item (i.e., a class, assertion or a |
88 | /// dot), then return an error. |
89 | fn into_class_literal<P: Borrow<Parser>>( |
90 | self, |
91 | p: &ParserI<'_, P>, |
92 | ) -> Result<ast::Literal> { |
93 | use self::Primitive::*; |
94 | |
95 | match self { |
96 | Literal(lit) => Ok(lit), |
97 | x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)), |
98 | } |
99 | } |
100 | } |
101 | |
/// Returns true if and only if `c` is an ASCII hexadecimal digit, i.e., one
/// of `0-9`, `a-f` or `A-F`.
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
106 | |
/// Returns true if the given character is valid in a capture group name.
///
/// When `first` is true, `c` is judged as the leading character of the name
/// and must be an underscore or alphabetic. Otherwise it may be an
/// underscore, `.`, `[`, `]` or any alphanumeric character.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        // Punctuation that is only permitted after the first character.
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
118 | |
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive (Clone, Debug)]
pub struct ParserBuilder {
    /// Whether parsers start in verbose (`x`) mode. Defaults to `false`.
    ignore_whitespace: bool,
    /// The nesting limit handed to built parsers. Defaults to `250`.
    nest_limit: u32,
    /// Whether octal syntax is supported. Defaults to `false`.
    octal: bool,
    /// Whether `{,n}` is accepted as an equivalent to `{0,n}`. Defaults to
    /// `false`.
    empty_min_range: bool,
}
129 | |
130 | impl Default for ParserBuilder { |
131 | fn default() -> ParserBuilder { |
132 | ParserBuilder::new() |
133 | } |
134 | } |
135 | |
136 | impl ParserBuilder { |
137 | /// Create a new parser builder with a default configuration. |
138 | pub fn new() -> ParserBuilder { |
139 | ParserBuilder { |
140 | ignore_whitespace: false, |
141 | nest_limit: 250, |
142 | octal: false, |
143 | empty_min_range: false, |
144 | } |
145 | } |
146 | |
147 | /// Build a parser from this configuration with the given pattern. |
148 | pub fn build(&self) -> Parser { |
149 | Parser { |
150 | pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), |
151 | capture_index: Cell::new(0), |
152 | nest_limit: self.nest_limit, |
153 | octal: self.octal, |
154 | empty_min_range: self.empty_min_range, |
155 | initial_ignore_whitespace: self.ignore_whitespace, |
156 | ignore_whitespace: Cell::new(self.ignore_whitespace), |
157 | comments: RefCell::new(vec![]), |
158 | stack_group: RefCell::new(vec![]), |
159 | stack_class: RefCell::new(vec![]), |
160 | capture_names: RefCell::new(vec![]), |
161 | scratch: RefCell::new(String::new()), |
162 | } |
163 | } |
164 | |
165 | /// Set the nesting limit for this parser. |
166 | /// |
167 | /// The nesting limit controls how deep the abstract syntax tree is allowed |
168 | /// to be. If the AST exceeds the given limit (e.g., with too many nested |
169 | /// groups), then an error is returned by the parser. |
170 | /// |
171 | /// The purpose of this limit is to act as a heuristic to prevent stack |
172 | /// overflow for consumers that do structural induction on an `Ast` using |
173 | /// explicit recursion. While this crate never does this (instead using |
174 | /// constant stack space and moving the call stack to the heap), other |
175 | /// crates may. |
176 | /// |
177 | /// This limit is not checked until the entire AST is parsed. Therefore, |
178 | /// if callers want to put a limit on the amount of heap space used, then |
179 | /// they should impose a limit on the length, in bytes, of the concrete |
180 | /// pattern string. In particular, this is viable since this parser |
181 | /// implementation will limit itself to heap space proportional to the |
182 | /// length of the pattern string. |
183 | /// |
184 | /// Note that a nest limit of `0` will return a nest limit error for most |
185 | /// patterns but not all. For example, a nest limit of `0` permits `a` but |
186 | /// not `ab`, since `ab` requires a concatenation, which results in a nest |
187 | /// depth of `1`. In general, a nest limit is not something that manifests |
188 | /// in an obvious way in the concrete syntax, therefore, it should not be |
189 | /// used in a granular way. |
190 | pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { |
191 | self.nest_limit = limit; |
192 | self |
193 | } |
194 | |
195 | /// Whether to support octal syntax or not. |
196 | /// |
197 | /// Octal syntax is a little-known way of uttering Unicode codepoints in |
198 | /// a regular expression. For example, `a`, `\x61`, `\u0061` and |
199 | /// `\141` are all equivalent regular expressions, where the last example |
200 | /// shows octal syntax. |
201 | /// |
202 | /// While supporting octal syntax isn't in and of itself a problem, it does |
203 | /// make good error messages harder. That is, in PCRE based regex engines, |
204 | /// syntax like `\0` invokes a backreference, which is explicitly |
205 | /// unsupported in Rust's regex engine. However, many users expect it to |
206 | /// be supported. Therefore, when octal support is disabled, the error |
207 | /// message will explicitly mention that backreferences aren't supported. |
208 | /// |
209 | /// Octal syntax is disabled by default. |
210 | pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { |
211 | self.octal = yes; |
212 | self |
213 | } |
214 | |
215 | /// Enable verbose mode in the regular expression. |
216 | /// |
217 | /// When enabled, verbose mode permits insignificant whitespace in many |
218 | /// places in the regular expression, as well as comments. Comments are |
219 | /// started using `#` and continue until the end of the line. |
220 | /// |
221 | /// By default, this is disabled. It may be selectively enabled in the |
222 | /// regular expression by using the `x` flag regardless of this setting. |
223 | pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { |
224 | self.ignore_whitespace = yes; |
225 | self |
226 | } |
227 | |
228 | /// Allow using `{,n}` as an equivalent to `{0,n}`. |
229 | /// |
230 | /// When enabled, the parser accepts `{,n}` as valid syntax for `{0,n}`. |
231 | /// Most regular expression engines don't support the `{,n}` syntax, but |
232 | /// some others do it, namely Python's `re` library. |
233 | /// |
234 | /// This is disabled by default. |
235 | pub fn empty_min_range(&mut self, yes: bool) -> &mut ParserBuilder { |
236 | self.empty_min_range = yes; |
237 | self |
238 | } |
239 | } |
240 | |
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
///
/// All state that changes while parsing lives in `Cell`/`RefCell` fields, so
/// the internal parser can update it through a shared reference.
#[derive (Clone, Debug)]
pub struct Parser {
    /// The current position of the parser.
    pos: Cell<Position>,
    /// The current capture index. It starts at `0` and is incremented each
    /// time a new capture index is handed out.
    capture_index: Cell<u32>,
    /// The maximum number of open parens/brackets allowed. If the parser
    /// exceeds this number, then an error is returned.
    nest_limit: u32,
    /// Whether to support octal syntax or not. When `false`, the parser will
    /// return an error helpfully pointing out that backreferences are not
    /// supported.
    octal: bool,
    /// The initial setting for `ignore_whitespace` as provided by
    /// `ParserBuilder`. It is used when resetting the parser's state.
    initial_ignore_whitespace: bool,
    /// Whether the parser supports `{,n}` repetitions as an equivalent to
    /// `{0,n}`.
    empty_min_range: bool,
    /// Whether whitespace should be ignored. When enabled, comments are
    /// also permitted. This changes during a parse as `x` flags are
    /// encountered and groups are entered and left.
    ignore_whitespace: Cell<bool>,
    /// A list of comments, in order of appearance.
    comments: RefCell<Vec<ast::Comment>>,
    /// A stack of grouped sub-expressions, including alternations.
    stack_group: RefCell<Vec<GroupState>>,
    /// A stack of nested character classes. This is only non-empty when
    /// parsing a class.
    stack_class: RefCell<Vec<ClassState>>,
    /// A sequence of capture names, kept sorted by name. This is used to
    /// detect duplicate capture names and report an error if one is
    /// detected.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// A scratch buffer used in various places. Mostly this is used to
    /// accumulate relevant characters from parts of a pattern.
    scratch: RefCell<String>,
}
284 | |
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser` internal state is not tied to any
/// one pattern, but `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
///
/// The parsing methods are implemented for any `P: Borrow<Parser>`.
#[derive (Clone, Debug)]
struct ParserI<'s, P> {
    /// The parser state/configuration.
    parser: P,
    /// The full regular expression provided by the user.
    pattern: &'s str,
}
301 | |
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening
/// parenthesis or an alternation bar `|`.
#[derive (Clone, Debug)]
enum GroupState {
    /// This state is pushed whenever an opening group is found.
    Group {
        /// The concatenation immediately preceding the opening group.
        concat: ast::Concat,
        /// The group that has been opened. Its sub-AST is always empty.
        group: ast::Group,
        /// Whether whitespace was being ignored (the `x` flag) immediately
        /// before this group was opened. The parser restores this value
        /// when the group is closed.
        ignore_whitespace: bool,
    },
    /// This state is pushed whenever a new alternation branch is found. If
    /// an alternation branch is found and this state is at the top of the
    /// stack, then this state should be modified to include the new
    /// alternation.
    Alternation(ast::Alternation),
}
322 | |
/// ClassState represents a single stack frame while parsing character classes.
/// Each frame records the state up to an intersection, difference, symmetric
/// difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive (Clone, Debug)]
enum ClassState {
    /// This state is pushed whenever an opening bracket is found.
    Open {
        /// The union of class items immediately preceding this class.
        union: ast::ClassSetUnion,
        /// The class that has been opened. Typically this just corresponds
        /// to the `[`, but it can also include `[^` since `^` indicates
        /// negation of the class.
        set: ast::ClassBracketed,
    },
    /// This state is pushed when an operator is seen. When popped, the stored
    /// set becomes the left-hand side of the operator.
    Op {
        /// The type of the operation, i.e., `&&`, `--` or `~~`.
        kind: ast::ClassSetBinaryOpKind,
        /// The left-hand side of the operator.
        lhs: ast::ClassSet,
    },
}
349 | |
350 | impl Parser { |
351 | /// Create a new parser with a default configuration. |
352 | /// |
353 | /// The parser can be run with either the `parse` or `parse_with_comments` |
354 | /// methods. The parse methods return an abstract syntax tree. |
355 | /// |
356 | /// To set configuration options on the parser, use [`ParserBuilder`]. |
357 | pub fn new() -> Parser { |
358 | ParserBuilder::new().build() |
359 | } |
360 | |
361 | /// Parse the regular expression into an abstract syntax tree. |
362 | pub fn parse(&mut self, pattern: &str) -> Result<Ast> { |
363 | ParserI::new(self, pattern).parse() |
364 | } |
365 | |
366 | /// Parse the regular expression and return an abstract syntax tree with |
367 | /// all of the comments found in the pattern. |
368 | pub fn parse_with_comments( |
369 | &mut self, |
370 | pattern: &str, |
371 | ) -> Result<ast::WithComments> { |
372 | ParserI::new(self, pattern).parse_with_comments() |
373 | } |
374 | |
375 | /// Reset the internal state of a parser. |
376 | /// |
377 | /// This is called at the beginning of every parse. This prevents the |
378 | /// parser from running with inconsistent state (say, if a previous |
379 | /// invocation returned an error and the parser is reused). |
380 | fn reset(&self) { |
381 | // These settings should be in line with the construction |
382 | // in `ParserBuilder::build`. |
383 | self.pos.set(Position { offset: 0, line: 1, column: 1 }); |
384 | self.ignore_whitespace.set(self.initial_ignore_whitespace); |
385 | self.comments.borrow_mut().clear(); |
386 | self.stack_group.borrow_mut().clear(); |
387 | self.stack_class.borrow_mut().clear(); |
388 | } |
389 | } |
390 | |
391 | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { |
392 | /// Build an internal parser from a parser configuration and a pattern. |
393 | fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { |
394 | ParserI { parser, pattern } |
395 | } |
396 | |
397 | /// Return a reference to the parser state. |
398 | fn parser(&self) -> &Parser { |
399 | self.parser.borrow() |
400 | } |
401 | |
402 | /// Return a reference to the pattern being parsed. |
403 | fn pattern(&self) -> &str { |
404 | self.pattern |
405 | } |
406 | |
407 | /// Create a new error with the given span and error type. |
408 | fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { |
409 | ast::Error { kind, pattern: self.pattern().to_string(), span } |
410 | } |
411 | |
412 | /// Return the current offset of the parser. |
413 | /// |
414 | /// The offset starts at `0` from the beginning of the regular expression |
415 | /// pattern string. |
416 | fn offset(&self) -> usize { |
417 | self.parser().pos.get().offset |
418 | } |
419 | |
420 | /// Return the current line number of the parser. |
421 | /// |
422 | /// The line number starts at `1`. |
423 | fn line(&self) -> usize { |
424 | self.parser().pos.get().line |
425 | } |
426 | |
427 | /// Return the current column of the parser. |
428 | /// |
429 | /// The column number starts at `1` and is reset whenever a `\n` is seen. |
430 | fn column(&self) -> usize { |
431 | self.parser().pos.get().column |
432 | } |
433 | |
434 | /// Return the next capturing index. Each subsequent call increments the |
435 | /// internal index. |
436 | /// |
437 | /// The span given should correspond to the location of the opening |
438 | /// parenthesis. |
439 | /// |
440 | /// If the capture limit is exceeded, then an error is returned. |
441 | fn next_capture_index(&self, span: Span) -> Result<u32> { |
442 | let current = self.parser().capture_index.get(); |
443 | let i = current.checked_add(1).ok_or_else(|| { |
444 | self.error(span, ast::ErrorKind::CaptureLimitExceeded) |
445 | })?; |
446 | self.parser().capture_index.set(i); |
447 | Ok(i) |
448 | } |
449 | |
450 | /// Adds the given capture name to this parser. If this capture name has |
451 | /// already been used, then an error is returned. |
452 | fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { |
453 | let mut names = self.parser().capture_names.borrow_mut(); |
454 | match names |
455 | .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) |
456 | { |
457 | Err(i) => { |
458 | names.insert(i, cap.clone()); |
459 | Ok(()) |
460 | } |
461 | Ok(i) => Err(self.error( |
462 | cap.span, |
463 | ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, |
464 | )), |
465 | } |
466 | } |
467 | |
468 | /// Return whether the parser should ignore whitespace or not. |
469 | fn ignore_whitespace(&self) -> bool { |
470 | self.parser().ignore_whitespace.get() |
471 | } |
472 | |
473 | /// Return the character at the current position of the parser. |
474 | /// |
475 | /// This panics if the current position does not point to a valid char. |
476 | fn char(&self) -> char { |
477 | self.char_at(self.offset()) |
478 | } |
479 | |
480 | /// Return the character at the given position. |
481 | /// |
482 | /// This panics if the given position does not point to a valid char. |
483 | fn char_at(&self, i: usize) -> char { |
484 | self.pattern()[i..] |
485 | .chars() |
486 | .next() |
487 | .unwrap_or_else(|| panic!("expected char at offset {}" , i)) |
488 | } |
489 | |
490 | /// Bump the parser to the next Unicode scalar value. |
491 | /// |
492 | /// If the end of the input has been reached, then `false` is returned. |
493 | fn bump(&self) -> bool { |
494 | if self.is_eof() { |
495 | return false; |
496 | } |
497 | let Position { mut offset, mut line, mut column } = self.pos(); |
498 | if self.char() == ' \n' { |
499 | line = line.checked_add(1).unwrap(); |
500 | column = 1; |
501 | } else { |
502 | column = column.checked_add(1).unwrap(); |
503 | } |
504 | offset += self.char().len_utf8(); |
505 | self.parser().pos.set(Position { offset, line, column }); |
506 | self.pattern()[self.offset()..].chars().next().is_some() |
507 | } |
508 | |
509 | /// If the substring starting at the current position of the parser has |
510 | /// the given prefix, then bump the parser to the character immediately |
511 | /// following the prefix and return true. Otherwise, don't bump the parser |
512 | /// and return false. |
513 | fn bump_if(&self, prefix: &str) -> bool { |
514 | if self.pattern()[self.offset()..].starts_with(prefix) { |
515 | for _ in 0..prefix.chars().count() { |
516 | self.bump(); |
517 | } |
518 | true |
519 | } else { |
520 | false |
521 | } |
522 | } |
523 | |
524 | /// Returns true if and only if the parser is positioned at a look-around |
525 | /// prefix. The conditions under which this returns true must always |
526 | /// correspond to a regular expression that would otherwise be consider |
527 | /// invalid. |
528 | /// |
529 | /// This should only be called immediately after parsing the opening of |
530 | /// a group or a set of flags. |
531 | fn is_lookaround_prefix(&self) -> bool { |
532 | self.bump_if("?=" ) |
533 | || self.bump_if("?!" ) |
534 | || self.bump_if("?<=" ) |
535 | || self.bump_if("?<!" ) |
536 | } |
537 | |
538 | /// Bump the parser, and if the `x` flag is enabled, bump through any |
539 | /// subsequent spaces. Return true if and only if the parser is not at |
540 | /// EOF. |
541 | fn bump_and_bump_space(&self) -> bool { |
542 | if !self.bump() { |
543 | return false; |
544 | } |
545 | self.bump_space(); |
546 | !self.is_eof() |
547 | } |
548 | |
549 | /// If the `x` flag is enabled (i.e., whitespace insensitivity with |
550 | /// comments), then this will advance the parser through all whitespace |
551 | /// and comments to the next non-whitespace non-comment byte. |
552 | /// |
553 | /// If the `x` flag is disabled, then this is a no-op. |
554 | /// |
555 | /// This should be used selectively throughout the parser where |
556 | /// arbitrary whitespace is permitted when the `x` flag is enabled. For |
557 | /// example, `{ 5 , 6}` is equivalent to `{5,6}`. |
558 | fn bump_space(&self) { |
559 | if !self.ignore_whitespace() { |
560 | return; |
561 | } |
562 | while !self.is_eof() { |
563 | if self.char().is_whitespace() { |
564 | self.bump(); |
565 | } else if self.char() == '#' { |
566 | let start = self.pos(); |
567 | let mut comment_text = String::new(); |
568 | self.bump(); |
569 | while !self.is_eof() { |
570 | let c = self.char(); |
571 | self.bump(); |
572 | if c == ' \n' { |
573 | break; |
574 | } |
575 | comment_text.push(c); |
576 | } |
577 | let comment = ast::Comment { |
578 | span: Span::new(start, self.pos()), |
579 | comment: comment_text, |
580 | }; |
581 | self.parser().comments.borrow_mut().push(comment); |
582 | } else { |
583 | break; |
584 | } |
585 | } |
586 | } |
587 | |
588 | /// Peek at the next character in the input without advancing the parser. |
589 | /// |
590 | /// If the input has been exhausted, then this returns `None`. |
591 | fn peek(&self) -> Option<char> { |
592 | if self.is_eof() { |
593 | return None; |
594 | } |
595 | self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() |
596 | } |
597 | |
598 | /// Like peek, but will ignore spaces when the parser is in whitespace |
599 | /// insensitive mode. |
600 | fn peek_space(&self) -> Option<char> { |
601 | if !self.ignore_whitespace() { |
602 | return self.peek(); |
603 | } |
604 | if self.is_eof() { |
605 | return None; |
606 | } |
607 | let mut start = self.offset() + self.char().len_utf8(); |
608 | let mut in_comment = false; |
609 | for (i, c) in self.pattern()[start..].char_indices() { |
610 | if c.is_whitespace() { |
611 | continue; |
612 | } else if !in_comment && c == '#' { |
613 | in_comment = true; |
614 | } else if in_comment && c == ' \n' { |
615 | in_comment = false; |
616 | } else { |
617 | start += i; |
618 | break; |
619 | } |
620 | } |
621 | self.pattern()[start..].chars().next() |
622 | } |
623 | |
624 | /// Returns true if the next call to `bump` would return false. |
625 | fn is_eof(&self) -> bool { |
626 | self.offset() == self.pattern().len() |
627 | } |
628 | |
629 | /// Return the current position of the parser, which includes the offset, |
630 | /// line and column. |
631 | fn pos(&self) -> Position { |
632 | self.parser().pos.get() |
633 | } |
634 | |
635 | /// Create a span at the current position of the parser. Both the start |
636 | /// and end of the span are set. |
637 | fn span(&self) -> Span { |
638 | Span::splat(self.pos()) |
639 | } |
640 | |
641 | /// Create a span that covers the current character. |
642 | fn span_char(&self) -> Span { |
643 | let mut next = Position { |
644 | offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), |
645 | line: self.line(), |
646 | column: self.column().checked_add(1).unwrap(), |
647 | }; |
648 | if self.char() == ' \n' { |
649 | next.line += 1; |
650 | next.column = 1; |
651 | } |
652 | Span::new(self.pos(), next) |
653 | } |
654 | |
655 | /// Parse and push a single alternation on to the parser's internal stack. |
656 | /// If the top of the stack already has an alternation, then add to that |
657 | /// instead of pushing a new one. |
658 | /// |
659 | /// The concatenation given corresponds to a single alternation branch. |
660 | /// The concatenation returned starts the next branch and is empty. |
661 | /// |
662 | /// This assumes the parser is currently positioned at `|` and will advance |
663 | /// the parser to the character following `|`. |
664 | #[inline (never)] |
665 | fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> { |
666 | assert_eq!(self.char(), '|' ); |
667 | concat.span.end = self.pos(); |
668 | self.push_or_add_alternation(concat); |
669 | self.bump(); |
670 | Ok(ast::Concat { span: self.span(), asts: vec![] }) |
671 | } |
672 | |
673 | /// Pushes or adds the given branch of an alternation to the parser's |
674 | /// internal stack of state. |
675 | fn push_or_add_alternation(&self, concat: ast::Concat) { |
676 | use self::GroupState::*; |
677 | |
678 | let mut stack = self.parser().stack_group.borrow_mut(); |
679 | if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { |
680 | alts.asts.push(concat.into_ast()); |
681 | return; |
682 | } |
683 | stack.push(Alternation(ast::Alternation { |
684 | span: Span::new(concat.span.start, self.pos()), |
685 | asts: vec![concat.into_ast()], |
686 | })); |
687 | } |
688 | |
689 | /// Parse and push a group AST (and its parent concatenation) on to the |
690 | /// parser's internal stack. Return a fresh concatenation corresponding |
691 | /// to the group's sub-AST. |
692 | /// |
693 | /// If a set of flags was found (with no group), then the concatenation |
694 | /// is returned with that set of flags added. |
695 | /// |
696 | /// This assumes that the parser is currently positioned on the opening |
697 | /// parenthesis. It advances the parser to the character at the start |
698 | /// of the sub-expression (or adjoining expression). |
699 | /// |
700 | /// If there was a problem parsing the start of the group, then an error |
701 | /// is returned. |
702 | #[inline (never)] |
703 | fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> { |
704 | assert_eq!(self.char(), '(' ); |
705 | match self.parse_group()? { |
706 | Either::Left(set) => { |
707 | let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); |
708 | if let Some(v) = ignore { |
709 | self.parser().ignore_whitespace.set(v); |
710 | } |
711 | |
712 | concat.asts.push(Ast::flags(set)); |
713 | Ok(concat) |
714 | } |
715 | Either::Right(group) => { |
716 | let old_ignore_whitespace = self.ignore_whitespace(); |
717 | let new_ignore_whitespace = group |
718 | .flags() |
719 | .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) |
720 | .unwrap_or(old_ignore_whitespace); |
721 | self.parser().stack_group.borrow_mut().push( |
722 | GroupState::Group { |
723 | concat, |
724 | group, |
725 | ignore_whitespace: old_ignore_whitespace, |
726 | }, |
727 | ); |
728 | self.parser().ignore_whitespace.set(new_ignore_whitespace); |
729 | Ok(ast::Concat { span: self.span(), asts: vec![] }) |
730 | } |
731 | } |
732 | } |
733 | |
734 | /// Pop a group AST from the parser's internal stack and set the group's |
735 | /// AST to the given concatenation. Return the concatenation containing |
736 | /// the group. |
737 | /// |
738 | /// This assumes that the parser is currently positioned on the closing |
739 | /// parenthesis and advances the parser to the character following the `)`. |
740 | /// |
741 | /// If no such group could be popped, then an unopened group error is |
742 | /// returned. |
743 | #[inline (never)] |
744 | fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> { |
745 | use self::GroupState::*; |
746 | |
747 | assert_eq!(self.char(), ')' ); |
748 | let mut stack = self.parser().stack_group.borrow_mut(); |
749 | let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack |
750 | .pop() |
751 | { |
752 | Some(Group { concat, group, ignore_whitespace }) => { |
753 | (concat, group, ignore_whitespace, None) |
754 | } |
755 | Some(Alternation(alt)) => match stack.pop() { |
756 | Some(Group { concat, group, ignore_whitespace }) => { |
757 | (concat, group, ignore_whitespace, Some(alt)) |
758 | } |
759 | None | Some(Alternation(_)) => { |
760 | return Err(self.error( |
761 | self.span_char(), |
762 | ast::ErrorKind::GroupUnopened, |
763 | )); |
764 | } |
765 | }, |
766 | None => { |
767 | return Err(self |
768 | .error(self.span_char(), ast::ErrorKind::GroupUnopened)); |
769 | } |
770 | }; |
771 | self.parser().ignore_whitespace.set(ignore_whitespace); |
772 | group_concat.span.end = self.pos(); |
773 | self.bump(); |
774 | group.span.end = self.pos(); |
775 | match alt { |
776 | Some(mut alt) => { |
777 | alt.span.end = group_concat.span.end; |
778 | alt.asts.push(group_concat.into_ast()); |
779 | group.ast = Box::new(alt.into_ast()); |
780 | } |
781 | None => { |
782 | group.ast = Box::new(group_concat.into_ast()); |
783 | } |
784 | } |
785 | prior_concat.asts.push(Ast::group(group)); |
786 | Ok(prior_concat) |
787 | } |
788 | |
789 | /// Pop the last state from the parser's internal stack, if it exists, and |
790 | /// add the given concatenation to it. There either must be no state or a |
791 | /// single alternation item on the stack. Any other scenario produces an |
792 | /// error. |
793 | /// |
794 | /// This assumes that the parser has advanced to the end. |
795 | #[inline (never)] |
796 | fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> { |
797 | concat.span.end = self.pos(); |
798 | let mut stack = self.parser().stack_group.borrow_mut(); |
799 | let ast = match stack.pop() { |
800 | None => Ok(concat.into_ast()), |
801 | Some(GroupState::Alternation(mut alt)) => { |
802 | alt.span.end = self.pos(); |
803 | alt.asts.push(concat.into_ast()); |
804 | Ok(Ast::alternation(alt)) |
805 | } |
806 | Some(GroupState::Group { group, .. }) => { |
807 | return Err( |
808 | self.error(group.span, ast::ErrorKind::GroupUnclosed) |
809 | ); |
810 | } |
811 | }; |
812 | // If we try to pop again, there should be nothing. |
813 | match stack.pop() { |
814 | None => ast, |
815 | Some(GroupState::Alternation(_)) => { |
816 | // This unreachable is unfortunate. This case can't happen |
817 | // because the only way we can be here is if there were two |
818 | // `GroupState::Alternation`s adjacent in the parser's stack, |
819 | // which we guarantee to never happen because we never push a |
820 | // `GroupState::Alternation` if one is already at the top of |
821 | // the stack. |
822 | unreachable!() |
823 | } |
824 | Some(GroupState::Group { group, .. }) => { |
825 | Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) |
826 | } |
827 | } |
828 | } |
829 | |
830 | /// Parse the opening of a character class and push the current class |
831 | /// parsing context onto the parser's stack. This assumes that the parser |
832 | /// is positioned at an opening `[`. The given union should correspond to |
833 | /// the union of set items built up before seeing the `[`. |
834 | /// |
835 | /// If there was a problem parsing the opening of the class, then an error |
836 | /// is returned. Otherwise, a new union of set items for the class is |
837 | /// returned (which may be populated with either a `]` or a `-`). |
838 | #[inline (never)] |
839 | fn push_class_open( |
840 | &self, |
841 | parent_union: ast::ClassSetUnion, |
842 | ) -> Result<ast::ClassSetUnion> { |
843 | assert_eq!(self.char(), '[' ); |
844 | |
845 | let (nested_set, nested_union) = self.parse_set_class_open()?; |
846 | self.parser() |
847 | .stack_class |
848 | .borrow_mut() |
849 | .push(ClassState::Open { union: parent_union, set: nested_set }); |
850 | Ok(nested_union) |
851 | } |
852 | |
853 | /// Parse the end of a character class set and pop the character class |
854 | /// parser stack. The union given corresponds to the last union built |
855 | /// before seeing the closing `]`. The union returned corresponds to the |
856 | /// parent character class set with the nested class added to it. |
857 | /// |
858 | /// This assumes that the parser is positioned at a `]` and will advance |
859 | /// the parser to the byte immediately following the `]`. |
860 | /// |
861 | /// If the stack is empty after popping, then this returns the final |
862 | /// "top-level" character class AST (where a "top-level" character class |
863 | /// is one that is not nested inside any other character class). |
864 | /// |
865 | /// If there is no corresponding opening bracket on the parser's stack, |
866 | /// then an error is returned. |
867 | #[inline (never)] |
868 | fn pop_class( |
869 | &self, |
870 | nested_union: ast::ClassSetUnion, |
871 | ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> { |
872 | assert_eq!(self.char(), ']' ); |
873 | |
874 | let item = ast::ClassSet::Item(nested_union.into_item()); |
875 | let prevset = self.pop_class_op(item); |
876 | let mut stack = self.parser().stack_class.borrow_mut(); |
877 | match stack.pop() { |
878 | None => { |
879 | // We can never observe an empty stack: |
880 | // |
881 | // 1) We are guaranteed to start with a non-empty stack since |
882 | // the character class parser is only initiated when it sees |
883 | // a `[`. |
884 | // 2) If we ever observe an empty stack while popping after |
885 | // seeing a `]`, then we signal the character class parser |
886 | // to terminate. |
887 | panic!("unexpected empty character class stack" ) |
888 | } |
889 | Some(ClassState::Op { .. }) => { |
890 | // This panic is unfortunate, but this case is impossible |
891 | // since we already popped the Op state if one exists above. |
892 | // Namely, every push to the class parser stack is guarded by |
893 | // whether an existing Op is already on the top of the stack. |
894 | // If it is, the existing Op is modified. That is, the stack |
895 | // can never have consecutive Op states. |
896 | panic!("unexpected ClassState::Op" ) |
897 | } |
898 | Some(ClassState::Open { mut union, mut set }) => { |
899 | self.bump(); |
900 | set.span.end = self.pos(); |
901 | set.kind = prevset; |
902 | if stack.is_empty() { |
903 | Ok(Either::Right(set)) |
904 | } else { |
905 | union.push(ast::ClassSetItem::Bracketed(Box::new(set))); |
906 | Ok(Either::Left(union)) |
907 | } |
908 | } |
909 | } |
910 | } |
911 | |
912 | /// Return an "unclosed class" error whose span points to the most |
913 | /// recently opened class. |
914 | /// |
915 | /// This should only be called while parsing a character class. |
916 | #[inline (never)] |
917 | fn unclosed_class_error(&self) -> ast::Error { |
918 | for state in self.parser().stack_class.borrow().iter().rev() { |
919 | if let ClassState::Open { ref set, .. } = *state { |
920 | return self.error(set.span, ast::ErrorKind::ClassUnclosed); |
921 | } |
922 | } |
923 | // We are guaranteed to have a non-empty stack with at least |
924 | // one open bracket, so we should never get here. |
925 | panic!("no open character class found" ) |
926 | } |
927 | |
928 | /// Push the current set of class items on to the class parser's stack as |
929 | /// the left hand side of the given operator. |
930 | /// |
931 | /// A fresh set union is returned, which should be used to build the right |
932 | /// hand side of this operator. |
933 | #[inline (never)] |
934 | fn push_class_op( |
935 | &self, |
936 | next_kind: ast::ClassSetBinaryOpKind, |
937 | next_union: ast::ClassSetUnion, |
938 | ) -> ast::ClassSetUnion { |
939 | let item = ast::ClassSet::Item(next_union.into_item()); |
940 | let new_lhs = self.pop_class_op(item); |
941 | self.parser() |
942 | .stack_class |
943 | .borrow_mut() |
944 | .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); |
945 | ast::ClassSetUnion { span: self.span(), items: vec![] } |
946 | } |
947 | |
948 | /// Pop a character class set from the character class parser stack. If the |
949 | /// top of the stack is just an item (not an operation), then return the |
950 | /// given set unchanged. If the top of the stack is an operation, then the |
951 | /// given set will be used as the rhs of the operation on the top of the |
952 | /// stack. In that case, the binary operation is returned as a set. |
953 | #[inline (never)] |
954 | fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { |
955 | let mut stack = self.parser().stack_class.borrow_mut(); |
956 | let (kind, lhs) = match stack.pop() { |
957 | Some(ClassState::Op { kind, lhs }) => (kind, lhs), |
958 | Some(state @ ClassState::Open { .. }) => { |
959 | stack.push(state); |
960 | return rhs; |
961 | } |
962 | None => unreachable!(), |
963 | }; |
964 | let span = Span::new(lhs.span().start, rhs.span().end); |
965 | ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { |
966 | span, |
967 | kind, |
968 | lhs: Box::new(lhs), |
969 | rhs: Box::new(rhs), |
970 | }) |
971 | } |
972 | } |
973 | |
974 | impl<'s, P: Borrow<Parser>> ParserI<'s, P> { |
975 | /// Parse the regular expression into an abstract syntax tree. |
976 | fn parse(&self) -> Result<Ast> { |
977 | self.parse_with_comments().map(|astc| astc.ast) |
978 | } |
979 | |
980 | /// Parse the regular expression and return an abstract syntax tree with |
981 | /// all of the comments found in the pattern. |
982 | fn parse_with_comments(&self) -> Result<ast::WithComments> { |
983 | assert_eq!(self.offset(), 0, "parser can only be used once" ); |
984 | self.parser().reset(); |
985 | let mut concat = ast::Concat { span: self.span(), asts: vec![] }; |
986 | loop { |
987 | self.bump_space(); |
988 | if self.is_eof() { |
989 | break; |
990 | } |
991 | match self.char() { |
992 | '(' => concat = self.push_group(concat)?, |
993 | ')' => concat = self.pop_group(concat)?, |
994 | '|' => concat = self.push_alternate(concat)?, |
995 | '[' => { |
996 | let class = self.parse_set_class()?; |
997 | concat.asts.push(Ast::class_bracketed(class)); |
998 | } |
999 | '?' => { |
1000 | concat = self.parse_uncounted_repetition( |
1001 | concat, |
1002 | ast::RepetitionKind::ZeroOrOne, |
1003 | )?; |
1004 | } |
1005 | '*' => { |
1006 | concat = self.parse_uncounted_repetition( |
1007 | concat, |
1008 | ast::RepetitionKind::ZeroOrMore, |
1009 | )?; |
1010 | } |
1011 | '+' => { |
1012 | concat = self.parse_uncounted_repetition( |
1013 | concat, |
1014 | ast::RepetitionKind::OneOrMore, |
1015 | )?; |
1016 | } |
1017 | '{' => { |
1018 | concat = self.parse_counted_repetition(concat)?; |
1019 | } |
1020 | _ => concat.asts.push(self.parse_primitive()?.into_ast()), |
1021 | } |
1022 | } |
1023 | let ast = self.pop_group_end(concat)?; |
1024 | NestLimiter::new(self).check(&ast)?; |
1025 | Ok(ast::WithComments { |
1026 | ast, |
1027 | comments: mem::replace( |
1028 | &mut *self.parser().comments.borrow_mut(), |
1029 | vec![], |
1030 | ), |
1031 | }) |
1032 | } |
1033 | |
1034 | /// Parses an uncounted repetition operation. An uncounted repetition |
1035 | /// operator includes ?, * and +, but does not include the {m,n} syntax. |
1036 | /// The given `kind` should correspond to the operator observed by the |
1037 | /// caller. |
1038 | /// |
1039 | /// This assumes that the parser is currently positioned at the repetition |
1040 | /// operator and advances the parser to the first character after the |
1041 | /// operator. (Note that the operator may include a single additional `?`, |
1042 | /// which makes the operator ungreedy.) |
1043 | /// |
1044 | /// The caller should include the concatenation that is being built. The |
1045 | /// concatenation returned includes the repetition operator applied to the |
1046 | /// last expression in the given concatenation. |
1047 | #[inline (never)] |
1048 | fn parse_uncounted_repetition( |
1049 | &self, |
1050 | mut concat: ast::Concat, |
1051 | kind: ast::RepetitionKind, |
1052 | ) -> Result<ast::Concat> { |
1053 | assert!( |
1054 | self.char() == '?' || self.char() == '*' || self.char() == '+' |
1055 | ); |
1056 | let op_start = self.pos(); |
1057 | let ast = match concat.asts.pop() { |
1058 | Some(ast) => ast, |
1059 | None => { |
1060 | return Err( |
1061 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1062 | ) |
1063 | } |
1064 | }; |
1065 | match ast { |
1066 | Ast::Empty(_) | Ast::Flags(_) => { |
1067 | return Err( |
1068 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1069 | ) |
1070 | } |
1071 | _ => {} |
1072 | } |
1073 | let mut greedy = true; |
1074 | if self.bump() && self.char() == '?' { |
1075 | greedy = false; |
1076 | self.bump(); |
1077 | } |
1078 | concat.asts.push(Ast::repetition(ast::Repetition { |
1079 | span: ast.span().with_end(self.pos()), |
1080 | op: ast::RepetitionOp { |
1081 | span: Span::new(op_start, self.pos()), |
1082 | kind, |
1083 | }, |
1084 | greedy, |
1085 | ast: Box::new(ast), |
1086 | })); |
1087 | Ok(concat) |
1088 | } |
1089 | |
1090 | /// Parses a counted repetition operation. A counted repetition operator |
1091 | /// corresponds to the {m,n} syntax, and does not include the ?, * or + |
1092 | /// operators. |
1093 | /// |
1094 | /// This assumes that the parser is currently positioned at the opening `{` |
1095 | /// and advances the parser to the first character after the operator. |
1096 | /// (Note that the operator may include a single additional `?`, which |
1097 | /// makes the operator ungreedy.) |
1098 | /// |
1099 | /// The caller should include the concatenation that is being built. The |
1100 | /// concatenation returned includes the repetition operator applied to the |
1101 | /// last expression in the given concatenation. |
1102 | #[inline (never)] |
1103 | fn parse_counted_repetition( |
1104 | &self, |
1105 | mut concat: ast::Concat, |
1106 | ) -> Result<ast::Concat> { |
1107 | assert!(self.char() == '{' ); |
1108 | let start = self.pos(); |
1109 | let ast = match concat.asts.pop() { |
1110 | Some(ast) => ast, |
1111 | None => { |
1112 | return Err( |
1113 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1114 | ) |
1115 | } |
1116 | }; |
1117 | match ast { |
1118 | Ast::Empty(_) | Ast::Flags(_) => { |
1119 | return Err( |
1120 | self.error(self.span(), ast::ErrorKind::RepetitionMissing) |
1121 | ) |
1122 | } |
1123 | _ => {} |
1124 | } |
1125 | if !self.bump_and_bump_space() { |
1126 | return Err(self.error( |
1127 | Span::new(start, self.pos()), |
1128 | ast::ErrorKind::RepetitionCountUnclosed, |
1129 | )); |
1130 | } |
1131 | let count_start = specialize_err( |
1132 | self.parse_decimal(), |
1133 | ast::ErrorKind::DecimalEmpty, |
1134 | ast::ErrorKind::RepetitionCountDecimalEmpty, |
1135 | ); |
1136 | if self.is_eof() { |
1137 | return Err(self.error( |
1138 | Span::new(start, self.pos()), |
1139 | ast::ErrorKind::RepetitionCountUnclosed, |
1140 | )); |
1141 | } |
1142 | let range = if self.char() == ',' { |
1143 | if !self.bump_and_bump_space() { |
1144 | return Err(self.error( |
1145 | Span::new(start, self.pos()), |
1146 | ast::ErrorKind::RepetitionCountUnclosed, |
1147 | )); |
1148 | } |
1149 | if self.char() != '}' { |
1150 | let count_start = match count_start { |
1151 | Ok(c) => c, |
1152 | Err(err) |
1153 | if err.kind |
1154 | == ast::ErrorKind::RepetitionCountDecimalEmpty => |
1155 | { |
1156 | if self.parser().empty_min_range { |
1157 | 0 |
1158 | } else { |
1159 | return Err(err); |
1160 | } |
1161 | } |
1162 | err => err?, |
1163 | }; |
1164 | let count_end = specialize_err( |
1165 | self.parse_decimal(), |
1166 | ast::ErrorKind::DecimalEmpty, |
1167 | ast::ErrorKind::RepetitionCountDecimalEmpty, |
1168 | )?; |
1169 | ast::RepetitionRange::Bounded(count_start, count_end) |
1170 | } else { |
1171 | ast::RepetitionRange::AtLeast(count_start?) |
1172 | } |
1173 | } else { |
1174 | ast::RepetitionRange::Exactly(count_start?) |
1175 | }; |
1176 | |
1177 | if self.is_eof() || self.char() != '}' { |
1178 | return Err(self.error( |
1179 | Span::new(start, self.pos()), |
1180 | ast::ErrorKind::RepetitionCountUnclosed, |
1181 | )); |
1182 | } |
1183 | |
1184 | let mut greedy = true; |
1185 | if self.bump_and_bump_space() && self.char() == '?' { |
1186 | greedy = false; |
1187 | self.bump(); |
1188 | } |
1189 | |
1190 | let op_span = Span::new(start, self.pos()); |
1191 | if !range.is_valid() { |
1192 | return Err( |
1193 | self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) |
1194 | ); |
1195 | } |
1196 | concat.asts.push(Ast::repetition(ast::Repetition { |
1197 | span: ast.span().with_end(self.pos()), |
1198 | op: ast::RepetitionOp { |
1199 | span: op_span, |
1200 | kind: ast::RepetitionKind::Range(range), |
1201 | }, |
1202 | greedy, |
1203 | ast: Box::new(ast), |
1204 | })); |
1205 | Ok(concat) |
1206 | } |
1207 | |
1208 | /// Parse a group (which contains a sub-expression) or a set of flags. |
1209 | /// |
1210 | /// If a group was found, then it is returned with an empty AST. If a set |
1211 | /// of flags is found, then that set is returned. |
1212 | /// |
1213 | /// The parser should be positioned at the opening parenthesis. |
1214 | /// |
1215 | /// This advances the parser to the character before the start of the |
1216 | /// sub-expression (in the case of a group) or to the closing parenthesis |
1217 | /// immediately following the set of flags. |
1218 | /// |
1219 | /// # Errors |
1220 | /// |
1221 | /// If flags are given and incorrectly specified, then a corresponding |
1222 | /// error is returned. |
1223 | /// |
1224 | /// If a capture name is given and it is incorrectly specified, then a |
1225 | /// corresponding error is returned. |
1226 | #[inline (never)] |
1227 | fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> { |
1228 | assert_eq!(self.char(), '(' ); |
1229 | let open_span = self.span_char(); |
1230 | self.bump(); |
1231 | self.bump_space(); |
1232 | if self.is_lookaround_prefix() { |
1233 | return Err(self.error( |
1234 | Span::new(open_span.start, self.span().end), |
1235 | ast::ErrorKind::UnsupportedLookAround, |
1236 | )); |
1237 | } |
1238 | let inner_span = self.span(); |
1239 | let mut starts_with_p = true; |
1240 | if self.bump_if("?P<" ) || { |
1241 | starts_with_p = false; |
1242 | self.bump_if("?<" ) |
1243 | } { |
1244 | let capture_index = self.next_capture_index(open_span)?; |
1245 | let name = self.parse_capture_name(capture_index)?; |
1246 | Ok(Either::Right(ast::Group { |
1247 | span: open_span, |
1248 | kind: ast::GroupKind::CaptureName { starts_with_p, name }, |
1249 | ast: Box::new(Ast::empty(self.span())), |
1250 | })) |
1251 | } else if self.bump_if("?" ) { |
1252 | if self.is_eof() { |
1253 | return Err( |
1254 | self.error(open_span, ast::ErrorKind::GroupUnclosed) |
1255 | ); |
1256 | } |
1257 | let flags = self.parse_flags()?; |
1258 | let char_end = self.char(); |
1259 | self.bump(); |
1260 | if char_end == ')' { |
1261 | // We don't allow empty flags, e.g., `(?)`. We instead |
1262 | // interpret it as a repetition operator missing its argument. |
1263 | if flags.items.is_empty() { |
1264 | return Err(self.error( |
1265 | inner_span, |
1266 | ast::ErrorKind::RepetitionMissing, |
1267 | )); |
1268 | } |
1269 | Ok(Either::Left(ast::SetFlags { |
1270 | span: Span { end: self.pos(), ..open_span }, |
1271 | flags, |
1272 | })) |
1273 | } else { |
1274 | assert_eq!(char_end, ':' ); |
1275 | Ok(Either::Right(ast::Group { |
1276 | span: open_span, |
1277 | kind: ast::GroupKind::NonCapturing(flags), |
1278 | ast: Box::new(Ast::empty(self.span())), |
1279 | })) |
1280 | } |
1281 | } else { |
1282 | let capture_index = self.next_capture_index(open_span)?; |
1283 | Ok(Either::Right(ast::Group { |
1284 | span: open_span, |
1285 | kind: ast::GroupKind::CaptureIndex(capture_index), |
1286 | ast: Box::new(Ast::empty(self.span())), |
1287 | })) |
1288 | } |
1289 | } |
1290 | |
1291 | /// Parses a capture group name. Assumes that the parser is positioned at |
1292 | /// the first character in the name following the opening `<` (and may |
1293 | /// possibly be EOF). This advances the parser to the first character |
1294 | /// following the closing `>`. |
1295 | /// |
1296 | /// The caller must provide the capture index of the group for this name. |
1297 | #[inline (never)] |
1298 | fn parse_capture_name( |
1299 | &self, |
1300 | capture_index: u32, |
1301 | ) -> Result<ast::CaptureName> { |
1302 | if self.is_eof() { |
1303 | return Err(self |
1304 | .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); |
1305 | } |
1306 | let start = self.pos(); |
1307 | loop { |
1308 | if self.char() == '>' { |
1309 | break; |
1310 | } |
1311 | if !is_capture_char(self.char(), self.pos() == start) { |
1312 | return Err(self.error( |
1313 | self.span_char(), |
1314 | ast::ErrorKind::GroupNameInvalid, |
1315 | )); |
1316 | } |
1317 | if !self.bump() { |
1318 | break; |
1319 | } |
1320 | } |
1321 | let end = self.pos(); |
1322 | if self.is_eof() { |
1323 | return Err(self |
1324 | .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); |
1325 | } |
1326 | assert_eq!(self.char(), '>' ); |
1327 | self.bump(); |
1328 | let name = &self.pattern()[start.offset..end.offset]; |
1329 | if name.is_empty() { |
1330 | return Err(self.error( |
1331 | Span::new(start, start), |
1332 | ast::ErrorKind::GroupNameEmpty, |
1333 | )); |
1334 | } |
1335 | let capname = ast::CaptureName { |
1336 | span: Span::new(start, end), |
1337 | name: name.to_string(), |
1338 | index: capture_index, |
1339 | }; |
1340 | self.add_capture_name(&capname)?; |
1341 | Ok(capname) |
1342 | } |
1343 | |
1344 | /// Parse a sequence of flags starting at the current character. |
1345 | /// |
1346 | /// This advances the parser to the character immediately following the |
1347 | /// flags, which is guaranteed to be either `:` or `)`. |
1348 | /// |
1349 | /// # Errors |
1350 | /// |
1351 | /// If any flags are duplicated, then an error is returned. |
1352 | /// |
1353 | /// If the negation operator is used more than once, then an error is |
1354 | /// returned. |
1355 | /// |
1356 | /// If no flags could be found or if the negation operation is not followed |
1357 | /// by any flags, then an error is returned. |
1358 | #[inline (never)] |
1359 | fn parse_flags(&self) -> Result<ast::Flags> { |
1360 | let mut flags = ast::Flags { span: self.span(), items: vec![] }; |
1361 | let mut last_was_negation = None; |
1362 | while self.char() != ':' && self.char() != ')' { |
1363 | if self.char() == '-' { |
1364 | last_was_negation = Some(self.span_char()); |
1365 | let item = ast::FlagsItem { |
1366 | span: self.span_char(), |
1367 | kind: ast::FlagsItemKind::Negation, |
1368 | }; |
1369 | if let Some(i) = flags.add_item(item) { |
1370 | return Err(self.error( |
1371 | self.span_char(), |
1372 | ast::ErrorKind::FlagRepeatedNegation { |
1373 | original: flags.items[i].span, |
1374 | }, |
1375 | )); |
1376 | } |
1377 | } else { |
1378 | last_was_negation = None; |
1379 | let item = ast::FlagsItem { |
1380 | span: self.span_char(), |
1381 | kind: ast::FlagsItemKind::Flag(self.parse_flag()?), |
1382 | }; |
1383 | if let Some(i) = flags.add_item(item) { |
1384 | return Err(self.error( |
1385 | self.span_char(), |
1386 | ast::ErrorKind::FlagDuplicate { |
1387 | original: flags.items[i].span, |
1388 | }, |
1389 | )); |
1390 | } |
1391 | } |
1392 | if !self.bump() { |
1393 | return Err( |
1394 | self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof) |
1395 | ); |
1396 | } |
1397 | } |
1398 | if let Some(span) = last_was_negation { |
1399 | return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); |
1400 | } |
1401 | flags.span.end = self.pos(); |
1402 | Ok(flags) |
1403 | } |
1404 | |
1405 | /// Parse the current character as a flag. Do not advance the parser. |
1406 | /// |
1407 | /// # Errors |
1408 | /// |
1409 | /// If the flag is not recognized, then an error is returned. |
1410 | #[inline (never)] |
1411 | fn parse_flag(&self) -> Result<ast::Flag> { |
1412 | match self.char() { |
1413 | 'i' => Ok(ast::Flag::CaseInsensitive), |
1414 | 'm' => Ok(ast::Flag::MultiLine), |
1415 | 's' => Ok(ast::Flag::DotMatchesNewLine), |
1416 | 'U' => Ok(ast::Flag::SwapGreed), |
1417 | 'u' => Ok(ast::Flag::Unicode), |
1418 | 'R' => Ok(ast::Flag::CRLF), |
1419 | 'x' => Ok(ast::Flag::IgnoreWhitespace), |
1420 | _ => { |
1421 | Err(self |
1422 | .error(self.span_char(), ast::ErrorKind::FlagUnrecognized)) |
1423 | } |
1424 | } |
1425 | } |
1426 | |
1427 | /// Parse a primitive AST. e.g., A literal, non-set character class or |
1428 | /// assertion. |
1429 | /// |
1430 | /// This assumes that the parser expects a primitive at the current |
1431 | /// location. i.e., All other non-primitive cases have been handled. |
1432 | /// For example, if the parser's position is at `|`, then `|` will be |
1433 | /// treated as a literal (e.g., inside a character class). |
1434 | /// |
1435 | /// This advances the parser to the first character immediately following |
1436 | /// the primitive. |
1437 | fn parse_primitive(&self) -> Result<Primitive> { |
1438 | match self.char() { |
1439 | ' \\' => self.parse_escape(), |
1440 | '.' => { |
1441 | let ast = Primitive::Dot(self.span_char()); |
1442 | self.bump(); |
1443 | Ok(ast) |
1444 | } |
1445 | '^' => { |
1446 | let ast = Primitive::Assertion(ast::Assertion { |
1447 | span: self.span_char(), |
1448 | kind: ast::AssertionKind::StartLine, |
1449 | }); |
1450 | self.bump(); |
1451 | Ok(ast) |
1452 | } |
1453 | '$' => { |
1454 | let ast = Primitive::Assertion(ast::Assertion { |
1455 | span: self.span_char(), |
1456 | kind: ast::AssertionKind::EndLine, |
1457 | }); |
1458 | self.bump(); |
1459 | Ok(ast) |
1460 | } |
1461 | c => { |
1462 | let ast = Primitive::Literal(ast::Literal { |
1463 | span: self.span_char(), |
1464 | kind: ast::LiteralKind::Verbatim, |
1465 | c, |
1466 | }); |
1467 | self.bump(); |
1468 | Ok(ast) |
1469 | } |
1470 | } |
1471 | } |
1472 | |
1473 | /// Parse an escape sequence as a primitive AST. |
1474 | /// |
1475 | /// This assumes the parser is positioned at the start of the escape |
1476 | /// sequence, i.e., `\`. It advances the parser to the first position |
1477 | /// immediately following the escape sequence. |
1478 | #[inline (never)] |
1479 | fn parse_escape(&self) -> Result<Primitive> { |
1480 | assert_eq!(self.char(), ' \\' ); |
1481 | let start = self.pos(); |
1482 | if !self.bump() { |
1483 | return Err(self.error( |
1484 | Span::new(start, self.pos()), |
1485 | ast::ErrorKind::EscapeUnexpectedEof, |
1486 | )); |
1487 | } |
1488 | let c = self.char(); |
1489 | // Put some of the more complicated routines into helpers. |
1490 | match c { |
1491 | '0' ..='7' => { |
1492 | if !self.parser().octal { |
1493 | return Err(self.error( |
1494 | Span::new(start, self.span_char().end), |
1495 | ast::ErrorKind::UnsupportedBackreference, |
1496 | )); |
1497 | } |
1498 | let mut lit = self.parse_octal(); |
1499 | lit.span.start = start; |
1500 | return Ok(Primitive::Literal(lit)); |
1501 | } |
1502 | '8' ..='9' if !self.parser().octal => { |
1503 | return Err(self.error( |
1504 | Span::new(start, self.span_char().end), |
1505 | ast::ErrorKind::UnsupportedBackreference, |
1506 | )); |
1507 | } |
1508 | 'x' | 'u' | 'U' => { |
1509 | let mut lit = self.parse_hex()?; |
1510 | lit.span.start = start; |
1511 | return Ok(Primitive::Literal(lit)); |
1512 | } |
1513 | 'p' | 'P' => { |
1514 | let mut cls = self.parse_unicode_class()?; |
1515 | cls.span.start = start; |
1516 | return Ok(Primitive::Unicode(cls)); |
1517 | } |
1518 | 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { |
1519 | let mut cls = self.parse_perl_class(); |
1520 | cls.span.start = start; |
1521 | return Ok(Primitive::Perl(cls)); |
1522 | } |
1523 | _ => {} |
1524 | } |
1525 | |
1526 | // Handle all of the one letter sequences inline. |
1527 | self.bump(); |
1528 | let span = Span::new(start, self.pos()); |
1529 | if is_meta_character(c) { |
1530 | return Ok(Primitive::Literal(ast::Literal { |
1531 | span, |
1532 | kind: ast::LiteralKind::Meta, |
1533 | c, |
1534 | })); |
1535 | } |
1536 | if is_escapeable_character(c) { |
1537 | return Ok(Primitive::Literal(ast::Literal { |
1538 | span, |
1539 | kind: ast::LiteralKind::Superfluous, |
1540 | c, |
1541 | })); |
1542 | } |
1543 | let special = |kind, c| { |
1544 | Ok(Primitive::Literal(ast::Literal { |
1545 | span, |
1546 | kind: ast::LiteralKind::Special(kind), |
1547 | c, |
1548 | })) |
1549 | }; |
1550 | match c { |
1551 | 'a' => special(ast::SpecialLiteralKind::Bell, ' \x07' ), |
1552 | 'f' => special(ast::SpecialLiteralKind::FormFeed, ' \x0C' ), |
1553 | 't' => special(ast::SpecialLiteralKind::Tab, ' \t' ), |
1554 | 'n' => special(ast::SpecialLiteralKind::LineFeed, ' \n' ), |
1555 | 'r' => special(ast::SpecialLiteralKind::CarriageReturn, ' \r' ), |
1556 | 'v' => special(ast::SpecialLiteralKind::VerticalTab, ' \x0B' ), |
1557 | 'A' => Ok(Primitive::Assertion(ast::Assertion { |
1558 | span, |
1559 | kind: ast::AssertionKind::StartText, |
1560 | })), |
1561 | 'z' => Ok(Primitive::Assertion(ast::Assertion { |
1562 | span, |
1563 | kind: ast::AssertionKind::EndText, |
1564 | })), |
1565 | 'b' => { |
1566 | let mut wb = ast::Assertion { |
1567 | span, |
1568 | kind: ast::AssertionKind::WordBoundary, |
1569 | }; |
1570 | // After a \b, we "try" to parse things like \b{start} for |
1571 | // special word boundary assertions. |
1572 | if !self.is_eof() && self.char() == '{' { |
1573 | if let Some(kind) = |
1574 | self.maybe_parse_special_word_boundary(start)? |
1575 | { |
1576 | wb.kind = kind; |
1577 | wb.span.end = self.pos(); |
1578 | } |
1579 | } |
1580 | Ok(Primitive::Assertion(wb)) |
1581 | } |
1582 | 'B' => Ok(Primitive::Assertion(ast::Assertion { |
1583 | span, |
1584 | kind: ast::AssertionKind::NotWordBoundary, |
1585 | })), |
1586 | '<' => Ok(Primitive::Assertion(ast::Assertion { |
1587 | span, |
1588 | kind: ast::AssertionKind::WordBoundaryStartAngle, |
1589 | })), |
1590 | '>' => Ok(Primitive::Assertion(ast::Assertion { |
1591 | span, |
1592 | kind: ast::AssertionKind::WordBoundaryEndAngle, |
1593 | })), |
1594 | _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), |
1595 | } |
1596 | } |
1597 | |
1598 | /// Attempt to parse a specialty word boundary. That is, `\b{start}`, |
1599 | /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. |
1600 | /// |
1601 | /// This is similar to `maybe_parse_ascii_class` in that, in most cases, |
1602 | /// if it fails it will just return `None` with no error. This is done |
1603 | /// because `\b{5}` is a valid expression and we want to let that be parsed |
1604 | /// by the existing counted repetition parsing code. (I thought about just |
1605 | /// invoking the counted repetition code from here, but it seemed a little |
1606 | /// ham-fisted.) |
1607 | /// |
1608 | /// Unlike `maybe_parse_ascii_class` though, this can return an error. |
1609 | /// Namely, if we definitely know it isn't a counted repetition, then we |
1610 | /// return an error specific to the specialty word boundaries. |
1611 | /// |
1612 | /// This assumes the parser is positioned at a `{` immediately following |
1613 | /// a `\b`. When `None` is returned, the parser is returned to the position |
1614 | /// at which it started: pointing at a `{`. |
1615 | /// |
1616 | /// The position given should correspond to the start of the `\b`. |
1617 | fn maybe_parse_special_word_boundary( |
1618 | &self, |
1619 | wb_start: Position, |
1620 | ) -> Result<Option<ast::AssertionKind>> { |
1621 | assert_eq!(self.char(), '{' ); |
1622 | |
1623 | let is_valid_char = |c| match c { |
1624 | 'A' ..='Z' | 'a' ..='z' | '-' => true, |
1625 | _ => false, |
1626 | }; |
1627 | let start = self.pos(); |
1628 | if !self.bump_and_bump_space() { |
1629 | return Err(self.error( |
1630 | Span::new(wb_start, self.pos()), |
1631 | ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, |
1632 | )); |
1633 | } |
1634 | let start_contents = self.pos(); |
1635 | // This is one of the critical bits: if the first non-whitespace |
1636 | // character isn't in [-A-Za-z] (i.e., this can't be a special word |
1637 | // boundary), then we bail and let the counted repetition parser deal |
1638 | // with this. |
1639 | if !is_valid_char(self.char()) { |
1640 | self.parser().pos.set(start); |
1641 | return Ok(None); |
1642 | } |
1643 | |
1644 | // Now collect up our chars until we see a '}'. |
1645 | let mut scratch = self.parser().scratch.borrow_mut(); |
1646 | scratch.clear(); |
1647 | while !self.is_eof() && is_valid_char(self.char()) { |
1648 | scratch.push(self.char()); |
1649 | self.bump_and_bump_space(); |
1650 | } |
1651 | if self.is_eof() || self.char() != '}' { |
1652 | return Err(self.error( |
1653 | Span::new(start, self.pos()), |
1654 | ast::ErrorKind::SpecialWordBoundaryUnclosed, |
1655 | )); |
1656 | } |
1657 | let end = self.pos(); |
1658 | self.bump(); |
1659 | let kind = match scratch.as_str() { |
1660 | "start" => ast::AssertionKind::WordBoundaryStart, |
1661 | "end" => ast::AssertionKind::WordBoundaryEnd, |
1662 | "start-half" => ast::AssertionKind::WordBoundaryStartHalf, |
1663 | "end-half" => ast::AssertionKind::WordBoundaryEndHalf, |
1664 | _ => { |
1665 | return Err(self.error( |
1666 | Span::new(start_contents, end), |
1667 | ast::ErrorKind::SpecialWordBoundaryUnrecognized, |
1668 | )) |
1669 | } |
1670 | }; |
1671 | Ok(Some(kind)) |
1672 | } |
1673 | |
1674 | /// Parse an octal representation of a Unicode codepoint up to 3 digits |
1675 | /// long. This expects the parser to be positioned at the first octal |
1676 | /// digit and advances the parser to the first character immediately |
1677 | /// following the octal number. This also assumes that parsing octal |
1678 | /// escapes is enabled. |
1679 | /// |
1680 | /// Assuming the preconditions are met, this routine can never fail. |
1681 | #[inline (never)] |
1682 | fn parse_octal(&self) -> ast::Literal { |
1683 | assert!(self.parser().octal); |
1684 | assert!('0' <= self.char() && self.char() <= '7' ); |
1685 | let start = self.pos(); |
1686 | // Parse up to two more digits. |
1687 | while self.bump() |
1688 | && '0' <= self.char() |
1689 | && self.char() <= '7' |
1690 | && self.pos().offset - start.offset <= 2 |
1691 | {} |
1692 | let end = self.pos(); |
1693 | let octal = &self.pattern()[start.offset..end.offset]; |
1694 | // Parsing the octal should never fail since the above guarantees a |
1695 | // valid number. |
1696 | let codepoint = |
1697 | u32::from_str_radix(octal, 8).expect("valid octal number" ); |
1698 | // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no |
1699 | // invalid Unicode scalar values. |
1700 | let c = char::from_u32(codepoint).expect("Unicode scalar value" ); |
1701 | ast::Literal { |
1702 | span: Span::new(start, end), |
1703 | kind: ast::LiteralKind::Octal, |
1704 | c, |
1705 | } |
1706 | } |
1707 | |
1708 | /// Parse a hex representation of a Unicode codepoint. This handles both |
1709 | /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to |
1710 | /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to |
1711 | /// the first character immediately following the hexadecimal literal. |
1712 | #[inline (never)] |
1713 | fn parse_hex(&self) -> Result<ast::Literal> { |
1714 | assert!( |
1715 | self.char() == 'x' || self.char() == 'u' || self.char() == 'U' |
1716 | ); |
1717 | |
1718 | let hex_kind = match self.char() { |
1719 | 'x' => ast::HexLiteralKind::X, |
1720 | 'u' => ast::HexLiteralKind::UnicodeShort, |
1721 | _ => ast::HexLiteralKind::UnicodeLong, |
1722 | }; |
1723 | if !self.bump_and_bump_space() { |
1724 | return Err( |
1725 | self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) |
1726 | ); |
1727 | } |
1728 | if self.char() == '{' { |
1729 | self.parse_hex_brace(hex_kind) |
1730 | } else { |
1731 | self.parse_hex_digits(hex_kind) |
1732 | } |
1733 | } |
1734 | |
1735 | /// Parse an N-digit hex representation of a Unicode codepoint. This |
1736 | /// expects the parser to be positioned at the first digit and will advance |
1737 | /// the parser to the first character immediately following the escape |
1738 | /// sequence. |
1739 | /// |
1740 | /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) |
1741 | /// or 8 (for `\UNNNNNNNN`). |
1742 | #[inline (never)] |
1743 | fn parse_hex_digits( |
1744 | &self, |
1745 | kind: ast::HexLiteralKind, |
1746 | ) -> Result<ast::Literal> { |
1747 | let mut scratch = self.parser().scratch.borrow_mut(); |
1748 | scratch.clear(); |
1749 | |
1750 | let start = self.pos(); |
1751 | for i in 0..kind.digits() { |
1752 | if i > 0 && !self.bump_and_bump_space() { |
1753 | return Err(self |
1754 | .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); |
1755 | } |
1756 | if !is_hex(self.char()) { |
1757 | return Err(self.error( |
1758 | self.span_char(), |
1759 | ast::ErrorKind::EscapeHexInvalidDigit, |
1760 | )); |
1761 | } |
1762 | scratch.push(self.char()); |
1763 | } |
1764 | // The final bump just moves the parser past the literal, which may |
1765 | // be EOF. |
1766 | self.bump_and_bump_space(); |
1767 | let end = self.pos(); |
1768 | let hex = scratch.as_str(); |
1769 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { |
1770 | None => Err(self.error( |
1771 | Span::new(start, end), |
1772 | ast::ErrorKind::EscapeHexInvalid, |
1773 | )), |
1774 | Some(c) => Ok(ast::Literal { |
1775 | span: Span::new(start, end), |
1776 | kind: ast::LiteralKind::HexFixed(kind), |
1777 | c, |
1778 | }), |
1779 | } |
1780 | } |
1781 | |
1782 | /// Parse a hex representation of any Unicode scalar value. This expects |
1783 | /// the parser to be positioned at the opening brace `{` and will advance |
1784 | /// the parser to the first character following the closing brace `}`. |
1785 | #[inline (never)] |
1786 | fn parse_hex_brace( |
1787 | &self, |
1788 | kind: ast::HexLiteralKind, |
1789 | ) -> Result<ast::Literal> { |
1790 | let mut scratch = self.parser().scratch.borrow_mut(); |
1791 | scratch.clear(); |
1792 | |
1793 | let brace_pos = self.pos(); |
1794 | let start = self.span_char().end; |
1795 | while self.bump_and_bump_space() && self.char() != '}' { |
1796 | if !is_hex(self.char()) { |
1797 | return Err(self.error( |
1798 | self.span_char(), |
1799 | ast::ErrorKind::EscapeHexInvalidDigit, |
1800 | )); |
1801 | } |
1802 | scratch.push(self.char()); |
1803 | } |
1804 | if self.is_eof() { |
1805 | return Err(self.error( |
1806 | Span::new(brace_pos, self.pos()), |
1807 | ast::ErrorKind::EscapeUnexpectedEof, |
1808 | )); |
1809 | } |
1810 | let end = self.pos(); |
1811 | let hex = scratch.as_str(); |
1812 | assert_eq!(self.char(), '}' ); |
1813 | self.bump_and_bump_space(); |
1814 | |
1815 | if hex.is_empty() { |
1816 | return Err(self.error( |
1817 | Span::new(brace_pos, self.pos()), |
1818 | ast::ErrorKind::EscapeHexEmpty, |
1819 | )); |
1820 | } |
1821 | match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { |
1822 | None => Err(self.error( |
1823 | Span::new(start, end), |
1824 | ast::ErrorKind::EscapeHexInvalid, |
1825 | )), |
1826 | Some(c) => Ok(ast::Literal { |
1827 | span: Span::new(start, self.pos()), |
1828 | kind: ast::LiteralKind::HexBrace(kind), |
1829 | c, |
1830 | }), |
1831 | } |
1832 | } |
1833 | |
1834 | /// Parse a decimal number into a u32 while trimming leading and trailing |
1835 | /// whitespace. |
1836 | /// |
1837 | /// This expects the parser to be positioned at the first position where |
1838 | /// a decimal digit could occur. This will advance the parser to the byte |
1839 | /// immediately following the last contiguous decimal digit. |
1840 | /// |
1841 | /// If no decimal digit could be found or if there was a problem parsing |
1842 | /// the complete set of digits into a u32, then an error is returned. |
1843 | fn parse_decimal(&self) -> Result<u32> { |
1844 | let mut scratch = self.parser().scratch.borrow_mut(); |
1845 | scratch.clear(); |
1846 | |
1847 | while !self.is_eof() && self.char().is_whitespace() { |
1848 | self.bump(); |
1849 | } |
1850 | let start = self.pos(); |
1851 | while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { |
1852 | scratch.push(self.char()); |
1853 | self.bump_and_bump_space(); |
1854 | } |
1855 | let span = Span::new(start, self.pos()); |
1856 | while !self.is_eof() && self.char().is_whitespace() { |
1857 | self.bump_and_bump_space(); |
1858 | } |
1859 | let digits = scratch.as_str(); |
1860 | if digits.is_empty() { |
1861 | return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); |
1862 | } |
1863 | match u32::from_str_radix(digits, 10).ok() { |
1864 | Some(n) => Ok(n), |
1865 | None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), |
1866 | } |
1867 | } |
1868 | |
1869 | /// Parse a standard character class consisting primarily of characters or |
1870 | /// character ranges, but can also contain nested character classes of |
1871 | /// any type (sans `.`). |
1872 | /// |
1873 | /// This assumes the parser is positioned at the opening `[`. If parsing |
1874 | /// is successful, then the parser is advanced to the position immediately |
1875 | /// following the closing `]`. |
1876 | #[inline (never)] |
1877 | fn parse_set_class(&self) -> Result<ast::ClassBracketed> { |
1878 | assert_eq!(self.char(), '[' ); |
1879 | |
1880 | let mut union = |
1881 | ast::ClassSetUnion { span: self.span(), items: vec![] }; |
1882 | loop { |
1883 | self.bump_space(); |
1884 | if self.is_eof() { |
1885 | return Err(self.unclosed_class_error()); |
1886 | } |
1887 | match self.char() { |
1888 | '[' => { |
1889 | // If we've already parsed the opening bracket, then |
1890 | // attempt to treat this as the beginning of an ASCII |
1891 | // class. If ASCII class parsing fails, then the parser |
1892 | // backs up to `[`. |
1893 | if !self.parser().stack_class.borrow().is_empty() { |
1894 | if let Some(cls) = self.maybe_parse_ascii_class() { |
1895 | union.push(ast::ClassSetItem::Ascii(cls)); |
1896 | continue; |
1897 | } |
1898 | } |
1899 | union = self.push_class_open(union)?; |
1900 | } |
1901 | ']' => match self.pop_class(union)? { |
1902 | Either::Left(nested_union) => { |
1903 | union = nested_union; |
1904 | } |
1905 | Either::Right(class) => return Ok(class), |
1906 | }, |
1907 | '&' if self.peek() == Some('&' ) => { |
1908 | assert!(self.bump_if("&&" )); |
1909 | union = self.push_class_op( |
1910 | ast::ClassSetBinaryOpKind::Intersection, |
1911 | union, |
1912 | ); |
1913 | } |
1914 | '-' if self.peek() == Some('-' ) => { |
1915 | assert!(self.bump_if("--" )); |
1916 | union = self.push_class_op( |
1917 | ast::ClassSetBinaryOpKind::Difference, |
1918 | union, |
1919 | ); |
1920 | } |
1921 | '~' if self.peek() == Some('~' ) => { |
1922 | assert!(self.bump_if("~~" )); |
1923 | union = self.push_class_op( |
1924 | ast::ClassSetBinaryOpKind::SymmetricDifference, |
1925 | union, |
1926 | ); |
1927 | } |
1928 | _ => { |
1929 | union.push(self.parse_set_class_range()?); |
1930 | } |
1931 | } |
1932 | } |
1933 | } |
1934 | |
1935 | /// Parse a single primitive item in a character class set. The item to |
1936 | /// be parsed can either be one of a simple literal character, a range |
1937 | /// between two simple literal characters or a "primitive" character |
1938 | /// class like \w or \p{Greek}. |
1939 | /// |
1940 | /// If an invalid escape is found, or if a character class is found where |
1941 | /// a simple literal is expected (e.g., in a range), then an error is |
1942 | /// returned. |
1943 | #[inline (never)] |
1944 | fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> { |
1945 | let prim1 = self.parse_set_class_item()?; |
1946 | self.bump_space(); |
1947 | if self.is_eof() { |
1948 | return Err(self.unclosed_class_error()); |
1949 | } |
1950 | // If the next char isn't a `-`, then we don't have a range. |
1951 | // There are two exceptions. If the char after a `-` is a `]`, then |
1952 | // `-` is interpreted as a literal `-`. Alternatively, if the char |
1953 | // after a `-` is a `-`, then `--` corresponds to a "difference" |
1954 | // operation. |
1955 | if self.char() != '-' |
1956 | || self.peek_space() == Some(']' ) |
1957 | || self.peek_space() == Some('-' ) |
1958 | { |
1959 | return prim1.into_class_set_item(self); |
1960 | } |
1961 | // OK, now we're parsing a range, so bump past the `-` and parse the |
1962 | // second half of the range. |
1963 | if !self.bump_and_bump_space() { |
1964 | return Err(self.unclosed_class_error()); |
1965 | } |
1966 | let prim2 = self.parse_set_class_item()?; |
1967 | let range = ast::ClassSetRange { |
1968 | span: Span::new(prim1.span().start, prim2.span().end), |
1969 | start: prim1.into_class_literal(self)?, |
1970 | end: prim2.into_class_literal(self)?, |
1971 | }; |
1972 | if !range.is_valid() { |
1973 | return Err( |
1974 | self.error(range.span, ast::ErrorKind::ClassRangeInvalid) |
1975 | ); |
1976 | } |
1977 | Ok(ast::ClassSetItem::Range(range)) |
1978 | } |
1979 | |
1980 | /// Parse a single item in a character class as a primitive, where the |
1981 | /// primitive either consists of a verbatim literal or a single escape |
1982 | /// sequence. |
1983 | /// |
1984 | /// This assumes the parser is positioned at the beginning of a primitive, |
1985 | /// and advances the parser to the first position after the primitive if |
1986 | /// successful. |
1987 | /// |
1988 | /// Note that it is the caller's responsibility to report an error if an |
1989 | /// illegal primitive was parsed. |
1990 | #[inline (never)] |
1991 | fn parse_set_class_item(&self) -> Result<Primitive> { |
1992 | if self.char() == ' \\' { |
1993 | self.parse_escape() |
1994 | } else { |
1995 | let x = Primitive::Literal(ast::Literal { |
1996 | span: self.span_char(), |
1997 | kind: ast::LiteralKind::Verbatim, |
1998 | c: self.char(), |
1999 | }); |
2000 | self.bump(); |
2001 | Ok(x) |
2002 | } |
2003 | } |
2004 | |
2005 | /// Parses the opening of a character class set. This includes the opening |
2006 | /// bracket along with `^` if present to indicate negation. This also |
2007 | /// starts parsing the opening set of unioned items if applicable, since |
2008 | /// there are special rules applied to certain characters in the opening |
2009 | /// of a character class. For example, `[^]]` is the class of all |
2010 | /// characters not equal to `]`. (`]` would need to be escaped in any other |
2011 | /// position.) Similarly for `-`. |
2012 | /// |
2013 | /// In all cases, the op inside the returned `ast::ClassBracketed` is an |
2014 | /// empty union. This empty union should be replaced with the actual item |
2015 | /// when it is popped from the parser's stack. |
2016 | /// |
2017 | /// This assumes the parser is positioned at the opening `[` and advances |
2018 | /// the parser to the first non-special byte of the character class. |
2019 | /// |
2020 | /// An error is returned if EOF is found. |
2021 | #[inline (never)] |
2022 | fn parse_set_class_open( |
2023 | &self, |
2024 | ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { |
2025 | assert_eq!(self.char(), '[' ); |
2026 | let start = self.pos(); |
2027 | if !self.bump_and_bump_space() { |
2028 | return Err(self.error( |
2029 | Span::new(start, self.pos()), |
2030 | ast::ErrorKind::ClassUnclosed, |
2031 | )); |
2032 | } |
2033 | |
2034 | let negated = if self.char() != '^' { |
2035 | false |
2036 | } else { |
2037 | if !self.bump_and_bump_space() { |
2038 | return Err(self.error( |
2039 | Span::new(start, self.pos()), |
2040 | ast::ErrorKind::ClassUnclosed, |
2041 | )); |
2042 | } |
2043 | true |
2044 | }; |
2045 | // Accept any number of `-` as literal `-`. |
2046 | let mut union = |
2047 | ast::ClassSetUnion { span: self.span(), items: vec![] }; |
2048 | while self.char() == '-' { |
2049 | union.push(ast::ClassSetItem::Literal(ast::Literal { |
2050 | span: self.span_char(), |
2051 | kind: ast::LiteralKind::Verbatim, |
2052 | c: '-' , |
2053 | })); |
2054 | if !self.bump_and_bump_space() { |
2055 | return Err(self.error( |
2056 | Span::new(start, start), |
2057 | ast::ErrorKind::ClassUnclosed, |
2058 | )); |
2059 | } |
2060 | } |
2061 | // If `]` is the *first* char in a set, then interpret it as a literal |
2062 | // `]`. That is, an empty class is impossible to write. |
2063 | if union.items.is_empty() && self.char() == ']' { |
2064 | union.push(ast::ClassSetItem::Literal(ast::Literal { |
2065 | span: self.span_char(), |
2066 | kind: ast::LiteralKind::Verbatim, |
2067 | c: ']' , |
2068 | })); |
2069 | if !self.bump_and_bump_space() { |
2070 | return Err(self.error( |
2071 | Span::new(start, self.pos()), |
2072 | ast::ErrorKind::ClassUnclosed, |
2073 | )); |
2074 | } |
2075 | } |
2076 | let set = ast::ClassBracketed { |
2077 | span: Span::new(start, self.pos()), |
2078 | negated, |
2079 | kind: ast::ClassSet::union(ast::ClassSetUnion { |
2080 | span: Span::new(union.span.start, union.span.start), |
2081 | items: vec![], |
2082 | }), |
2083 | }; |
2084 | Ok((set, union)) |
2085 | } |
2086 | |
2087 | /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. |
2088 | /// |
2089 | /// This assumes the parser is positioned at the opening `[`. |
2090 | /// |
2091 | /// If no valid ASCII character class could be found, then this does not |
2092 | /// advance the parser and `None` is returned. Otherwise, the parser is |
2093 | /// advanced to the first byte following the closing `]` and the |
2094 | /// corresponding ASCII class is returned. |
2095 | #[inline (never)] |
2096 | fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> { |
2097 | // ASCII character classes are interesting from a parsing perspective |
2098 | // because parsing cannot fail with any interesting error. For example, |
2099 | // in order to use an ASCII character class, it must be enclosed in |
2100 | // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think |
2101 | // of it as "ASCII character classes have the syntax `[:NAME:]` which |
2102 | // can only appear within character brackets." This means that things |
2103 | // like `[[:lower:]A]` are legal constructs. |
2104 | // |
2105 | // However, if one types an incorrect ASCII character class, e.g., |
2106 | // `[[:loower:]]`, then we treat that as a normal nested character |
2107 | // class containing the characters `:elorw`. One might argue that we |
2108 | // should return an error instead since the repeated colons give away |
2109 | // the intent to write an ASCII class. But what if the user typed |
2110 | // `[[:lower]]` instead? How can we tell that was intended to be an |
2111 | // ASCII class and not just a normal nested class? |
2112 | // |
2113 | // Reasonable people can probably disagree over this, but for better |
2114 | // or worse, we implement semantics that never fails at the expense |
2115 | // of better failure modes. |
2116 | assert_eq!(self.char(), '[' ); |
2117 | // If parsing fails, then we back up the parser to this starting point. |
2118 | let start = self.pos(); |
2119 | let mut negated = false; |
2120 | if !self.bump() || self.char() != ':' { |
2121 | self.parser().pos.set(start); |
2122 | return None; |
2123 | } |
2124 | if !self.bump() { |
2125 | self.parser().pos.set(start); |
2126 | return None; |
2127 | } |
2128 | if self.char() == '^' { |
2129 | negated = true; |
2130 | if !self.bump() { |
2131 | self.parser().pos.set(start); |
2132 | return None; |
2133 | } |
2134 | } |
2135 | let name_start = self.offset(); |
2136 | while self.char() != ':' && self.bump() {} |
2137 | if self.is_eof() { |
2138 | self.parser().pos.set(start); |
2139 | return None; |
2140 | } |
2141 | let name = &self.pattern()[name_start..self.offset()]; |
2142 | if !self.bump_if(":]" ) { |
2143 | self.parser().pos.set(start); |
2144 | return None; |
2145 | } |
2146 | let kind = match ast::ClassAsciiKind::from_name(name) { |
2147 | Some(kind) => kind, |
2148 | None => { |
2149 | self.parser().pos.set(start); |
2150 | return None; |
2151 | } |
2152 | }; |
2153 | Some(ast::ClassAscii { |
2154 | span: Span::new(start, self.pos()), |
2155 | kind, |
2156 | negated, |
2157 | }) |
2158 | } |
2159 | |
2160 | /// Parse a Unicode class in either the single character notation, `\pN` |
2161 | /// or the multi-character bracketed notation, `\p{Greek}`. This assumes |
2162 | /// the parser is positioned at the `p` (or `P` for negation) and will |
2163 | /// advance the parser to the character immediately following the class. |
2164 | /// |
2165 | /// Note that this does not check whether the class name is valid or not. |
2166 | #[inline (never)] |
2167 | fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> { |
2168 | assert!(self.char() == 'p' || self.char() == 'P' ); |
2169 | |
2170 | let mut scratch = self.parser().scratch.borrow_mut(); |
2171 | scratch.clear(); |
2172 | |
2173 | let negated = self.char() == 'P' ; |
2174 | if !self.bump_and_bump_space() { |
2175 | return Err( |
2176 | self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) |
2177 | ); |
2178 | } |
2179 | let (start, kind) = if self.char() == '{' { |
2180 | let start = self.span_char().end; |
2181 | while self.bump_and_bump_space() && self.char() != '}' { |
2182 | scratch.push(self.char()); |
2183 | } |
2184 | if self.is_eof() { |
2185 | return Err(self |
2186 | .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); |
2187 | } |
2188 | assert_eq!(self.char(), '}' ); |
2189 | self.bump(); |
2190 | |
2191 | let name = scratch.as_str(); |
2192 | if let Some(i) = name.find("!=" ) { |
2193 | ( |
2194 | start, |
2195 | ast::ClassUnicodeKind::NamedValue { |
2196 | op: ast::ClassUnicodeOpKind::NotEqual, |
2197 | name: name[..i].to_string(), |
2198 | value: name[i + 2..].to_string(), |
2199 | }, |
2200 | ) |
2201 | } else if let Some(i) = name.find(':' ) { |
2202 | ( |
2203 | start, |
2204 | ast::ClassUnicodeKind::NamedValue { |
2205 | op: ast::ClassUnicodeOpKind::Colon, |
2206 | name: name[..i].to_string(), |
2207 | value: name[i + 1..].to_string(), |
2208 | }, |
2209 | ) |
2210 | } else if let Some(i) = name.find('=' ) { |
2211 | ( |
2212 | start, |
2213 | ast::ClassUnicodeKind::NamedValue { |
2214 | op: ast::ClassUnicodeOpKind::Equal, |
2215 | name: name[..i].to_string(), |
2216 | value: name[i + 1..].to_string(), |
2217 | }, |
2218 | ) |
2219 | } else { |
2220 | (start, ast::ClassUnicodeKind::Named(name.to_string())) |
2221 | } |
2222 | } else { |
2223 | let start = self.pos(); |
2224 | let c = self.char(); |
2225 | if c == ' \\' { |
2226 | return Err(self.error( |
2227 | self.span_char(), |
2228 | ast::ErrorKind::UnicodeClassInvalid, |
2229 | )); |
2230 | } |
2231 | self.bump_and_bump_space(); |
2232 | let kind = ast::ClassUnicodeKind::OneLetter(c); |
2233 | (start, kind) |
2234 | }; |
2235 | Ok(ast::ClassUnicode { |
2236 | span: Span::new(start, self.pos()), |
2237 | negated, |
2238 | kind, |
2239 | }) |
2240 | } |
2241 | |
2242 | /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the |
2243 | /// parser is currently at a valid character class name and will be |
2244 | /// advanced to the character immediately following the class. |
2245 | #[inline (never)] |
2246 | fn parse_perl_class(&self) -> ast::ClassPerl { |
2247 | let c = self.char(); |
2248 | let span = self.span_char(); |
2249 | self.bump(); |
2250 | let (negated, kind) = match c { |
2251 | 'd' => (false, ast::ClassPerlKind::Digit), |
2252 | 'D' => (true, ast::ClassPerlKind::Digit), |
2253 | 's' => (false, ast::ClassPerlKind::Space), |
2254 | 'S' => (true, ast::ClassPerlKind::Space), |
2255 | 'w' => (false, ast::ClassPerlKind::Word), |
2256 | 'W' => (true, ast::ClassPerlKind::Word), |
2257 | c => panic!("expected valid Perl class but got ' {}'" , c), |
2258 | }; |
2259 | ast::ClassPerl { span, kind, negated } |
2260 | } |
2261 | } |
2262 | |
2263 | /// A type that traverses a fully parsed Ast and checks whether its depth |
2264 | /// exceeds the specified nesting limit. If it does, then an error is returned. |
2265 | #[derive (Debug)] |
2266 | struct NestLimiter<'p, 's, P> { |
2267 | /// The parser that is checking the nest limit. |
2268 | p: &'p ParserI<'s, P>, |
2269 | /// The current depth while walking an Ast. |
2270 | depth: u32, |
2271 | } |
2272 | |
2273 | impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> { |
2274 | fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { |
2275 | NestLimiter { p, depth: 0 } |
2276 | } |
2277 | |
2278 | #[inline (never)] |
2279 | fn check(self, ast: &Ast) -> Result<()> { |
2280 | ast::visit(ast, self) |
2281 | } |
2282 | |
2283 | fn increment_depth(&mut self, span: &Span) -> Result<()> { |
2284 | let new = self.depth.checked_add(1).ok_or_else(|| { |
2285 | self.p.error( |
2286 | span.clone(), |
2287 | ast::ErrorKind::NestLimitExceeded(u32::MAX), |
2288 | ) |
2289 | })?; |
2290 | let limit = self.p.parser().nest_limit; |
2291 | if new > limit { |
2292 | return Err(self.p.error( |
2293 | span.clone(), |
2294 | ast::ErrorKind::NestLimitExceeded(limit), |
2295 | )); |
2296 | } |
2297 | self.depth = new; |
2298 | Ok(()) |
2299 | } |
2300 | |
2301 | fn decrement_depth(&mut self) { |
2302 | // Assuming the correctness of the visitor, this should never drop |
2303 | // below 0. |
2304 | self.depth = self.depth.checked_sub(1).unwrap(); |
2305 | } |
2306 | } |
2307 | |
2308 | impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> { |
2309 | type Output = (); |
2310 | type Err = ast::Error; |
2311 | |
2312 | fn finish(self) -> Result<()> { |
2313 | Ok(()) |
2314 | } |
2315 | |
2316 | fn visit_pre(&mut self, ast: &Ast) -> Result<()> { |
2317 | let span = match *ast { |
2318 | Ast::Empty(_) |
2319 | | Ast::Flags(_) |
2320 | | Ast::Literal(_) |
2321 | | Ast::Dot(_) |
2322 | | Ast::Assertion(_) |
2323 | | Ast::ClassUnicode(_) |
2324 | | Ast::ClassPerl(_) => { |
2325 | // These are all base cases, so we don't increment depth. |
2326 | return Ok(()); |
2327 | } |
2328 | Ast::ClassBracketed(ref x) => &x.span, |
2329 | Ast::Repetition(ref x) => &x.span, |
2330 | Ast::Group(ref x) => &x.span, |
2331 | Ast::Alternation(ref x) => &x.span, |
2332 | Ast::Concat(ref x) => &x.span, |
2333 | }; |
2334 | self.increment_depth(span) |
2335 | } |
2336 | |
2337 | fn visit_post(&mut self, ast: &Ast) -> Result<()> { |
2338 | match *ast { |
2339 | Ast::Empty(_) |
2340 | | Ast::Flags(_) |
2341 | | Ast::Literal(_) |
2342 | | Ast::Dot(_) |
2343 | | Ast::Assertion(_) |
2344 | | Ast::ClassUnicode(_) |
2345 | | Ast::ClassPerl(_) => { |
2346 | // These are all base cases, so we don't decrement depth. |
2347 | Ok(()) |
2348 | } |
2349 | Ast::ClassBracketed(_) |
2350 | | Ast::Repetition(_) |
2351 | | Ast::Group(_) |
2352 | | Ast::Alternation(_) |
2353 | | Ast::Concat(_) => { |
2354 | self.decrement_depth(); |
2355 | Ok(()) |
2356 | } |
2357 | } |
2358 | } |
2359 | |
2360 | fn visit_class_set_item_pre( |
2361 | &mut self, |
2362 | ast: &ast::ClassSetItem, |
2363 | ) -> Result<()> { |
2364 | let span = match *ast { |
2365 | ast::ClassSetItem::Empty(_) |
2366 | | ast::ClassSetItem::Literal(_) |
2367 | | ast::ClassSetItem::Range(_) |
2368 | | ast::ClassSetItem::Ascii(_) |
2369 | | ast::ClassSetItem::Unicode(_) |
2370 | | ast::ClassSetItem::Perl(_) => { |
2371 | // These are all base cases, so we don't increment depth. |
2372 | return Ok(()); |
2373 | } |
2374 | ast::ClassSetItem::Bracketed(ref x) => &x.span, |
2375 | ast::ClassSetItem::Union(ref x) => &x.span, |
2376 | }; |
2377 | self.increment_depth(span) |
2378 | } |
2379 | |
2380 | fn visit_class_set_item_post( |
2381 | &mut self, |
2382 | ast: &ast::ClassSetItem, |
2383 | ) -> Result<()> { |
2384 | match *ast { |
2385 | ast::ClassSetItem::Empty(_) |
2386 | | ast::ClassSetItem::Literal(_) |
2387 | | ast::ClassSetItem::Range(_) |
2388 | | ast::ClassSetItem::Ascii(_) |
2389 | | ast::ClassSetItem::Unicode(_) |
2390 | | ast::ClassSetItem::Perl(_) => { |
2391 | // These are all base cases, so we don't decrement depth. |
2392 | Ok(()) |
2393 | } |
2394 | ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => { |
2395 | self.decrement_depth(); |
2396 | Ok(()) |
2397 | } |
2398 | } |
2399 | } |
2400 | |
2401 | fn visit_class_set_binary_op_pre( |
2402 | &mut self, |
2403 | ast: &ast::ClassSetBinaryOp, |
2404 | ) -> Result<()> { |
2405 | self.increment_depth(&ast.span) |
2406 | } |
2407 | |
2408 | fn visit_class_set_binary_op_post( |
2409 | &mut self, |
2410 | _ast: &ast::ClassSetBinaryOp, |
2411 | ) -> Result<()> { |
2412 | self.decrement_depth(); |
2413 | Ok(()) |
2414 | } |
2415 | } |
2416 | |
2417 | /// When the result is an error, transforms the ast::ErrorKind from the source |
2418 | /// Result into another one. This function is used to return clearer error |
2419 | /// messages when possible. |
2420 | fn specialize_err<T>( |
2421 | result: Result<T>, |
2422 | from: ast::ErrorKind, |
2423 | to: ast::ErrorKind, |
2424 | ) -> Result<T> { |
2425 | if let Err(e: Error) = result { |
2426 | if e.kind == from { |
2427 | Err(ast::Error { kind: to, pattern: e.pattern, span: e.span }) |
2428 | } else { |
2429 | Err(e) |
2430 | } |
2431 | } else { |
2432 | result |
2433 | } |
2434 | } |
2435 | |
2436 | #[cfg (test)] |
2437 | mod tests { |
2438 | use core::ops::Range; |
2439 | |
2440 | use alloc::format; |
2441 | |
2442 | use super::*; |
2443 | |
// A local replacement for assert_eq with slightly better formatting of the
// failure message (but honestly still kind of crappy). Each operand is
// evaluated once and compared by reference with `==`; on mismatch the
// macro panics with the Debug output of both sides.
macro_rules! assert_eq {
    ($left:expr, $right:expr) => {{
        match (&$left, &$right) {
            (lhs, rhs) if *lhs == *rhs => {}
            (lhs, rhs) => {
                panic!(
                    "assertion failed: `(left == right)` \n\n\
                     left: `{:?}` \nright: `{:?}` \n\n",
                    lhs, rhs
                )
            }
        }
    }};
}
2461 | |
2462 | // We create these errors to compare with real ast::Errors in the tests. |
2463 | // We define equality between TestError and ast::Error to disregard the |
2464 | // pattern string in ast::Error, which is annoying to provide in tests. |
2465 | #[derive (Clone, Debug)] |
2466 | struct TestError { |
2467 | span: Span, |
2468 | kind: ast::ErrorKind, |
2469 | } |
2470 | |
2471 | impl PartialEq<ast::Error> for TestError { |
2472 | fn eq(&self, other: &ast::Error) -> bool { |
2473 | self.span == other.span && self.kind == other.kind |
2474 | } |
2475 | } |
2476 | |
2477 | impl PartialEq<TestError> for ast::Error { |
2478 | fn eq(&self, other: &TestError) -> bool { |
2479 | self.span == other.span && self.kind == other.kind |
2480 | } |
2481 | } |
2482 | |
/// Short alias for turning a string slice into an owned `String`.
fn s(str: &str) -> String {
    String::from(str)
}
2486 | |
2487 | fn parser(pattern: &str) -> ParserI<'_, Parser> { |
2488 | ParserI::new(Parser::new(), pattern) |
2489 | } |
2490 | |
2491 | fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { |
2492 | let parser = ParserBuilder::new().octal(true).build(); |
2493 | ParserI::new(parser, pattern) |
2494 | } |
2495 | |
2496 | fn parser_empty_min_range(pattern: &str) -> ParserI<'_, Parser> { |
2497 | let parser = ParserBuilder::new().empty_min_range(true).build(); |
2498 | ParserI::new(parser, pattern) |
2499 | } |
2500 | |
2501 | fn parser_nest_limit( |
2502 | pattern: &str, |
2503 | nest_limit: u32, |
2504 | ) -> ParserI<'_, Parser> { |
2505 | let p = ParserBuilder::new().nest_limit(nest_limit).build(); |
2506 | ParserI::new(p, pattern) |
2507 | } |
2508 | |
2509 | fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { |
2510 | let p = ParserBuilder::new().ignore_whitespace(true).build(); |
2511 | ParserI::new(p, pattern) |
2512 | } |
2513 | |
/// Short alias for creating a new span.
///
/// Forwards `start` and `end` unchanged to `Span::new`.
fn nspan(start: Position, end: Position) -> Span {
    Span::new(start, end)
}
2518 | |
/// Short alias for creating a new position.
///
/// Forwards `offset`, `line` and `column` unchanged, in that order, to
/// `Position::new`.
fn npos(offset: usize, line: usize, column: usize) -> Position {
    Position::new(offset, line, column)
}
2523 | |
2524 | /// Create a new span from the given offset range. This assumes a single |
2525 | /// line and sets the columns based on the offsets. i.e., This only works |
2526 | /// out of the box for ASCII, which is fine for most tests. |
2527 | fn span(range: Range<usize>) -> Span { |
2528 | let start = Position::new(range.start, 1, range.start + 1); |
2529 | let end = Position::new(range.end, 1, range.end + 1); |
2530 | Span::new(start, end) |
2531 | } |
2532 | |
2533 | /// Create a new span for the corresponding byte range in the given string. |
2534 | fn span_range(subject: &str, range: Range<usize>) -> Span { |
2535 | let start = Position { |
2536 | offset: range.start, |
2537 | line: 1 + subject[..range.start].matches(' \n' ).count(), |
2538 | column: 1 + subject[..range.start] |
2539 | .chars() |
2540 | .rev() |
2541 | .position(|c| c == ' \n' ) |
2542 | .unwrap_or(subject[..range.start].chars().count()), |
2543 | }; |
2544 | let end = Position { |
2545 | offset: range.end, |
2546 | line: 1 + subject[..range.end].matches(' \n' ).count(), |
2547 | column: 1 + subject[..range.end] |
2548 | .chars() |
2549 | .rev() |
2550 | .position(|c| c == ' \n' ) |
2551 | .unwrap_or(subject[..range.end].chars().count()), |
2552 | }; |
2553 | Span::new(start, end) |
2554 | } |
2555 | |
2556 | /// Create a verbatim literal starting at the given position. |
2557 | fn lit(c: char, start: usize) -> Ast { |
2558 | lit_with(c, span(start..start + c.len_utf8())) |
2559 | } |
2560 | |
2561 | /// Create a meta literal starting at the given position. |
2562 | fn meta_lit(c: char, span: Span) -> Ast { |
2563 | Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) |
2564 | } |
2565 | |
2566 | /// Create a verbatim literal with the given span. |
2567 | fn lit_with(c: char, span: Span) -> Ast { |
2568 | Ast::literal(ast::Literal { |
2569 | span, |
2570 | kind: ast::LiteralKind::Verbatim, |
2571 | c, |
2572 | }) |
2573 | } |
2574 | |
2575 | /// Create a concatenation with the given range. |
2576 | fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast { |
2577 | concat_with(span(range), asts) |
2578 | } |
2579 | |
2580 | /// Create a concatenation with the given span. |
2581 | fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { |
2582 | Ast::concat(ast::Concat { span, asts }) |
2583 | } |
2584 | |
2585 | /// Create an alternation with the given span. |
2586 | fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { |
2587 | Ast::alternation(ast::Alternation { span: span(range), asts }) |
2588 | } |
2589 | |
2590 | /// Create a capturing group with the given span. |
2591 | fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { |
2592 | Ast::group(ast::Group { |
2593 | span: span(range), |
2594 | kind: ast::GroupKind::CaptureIndex(index), |
2595 | ast: Box::new(ast), |
2596 | }) |
2597 | } |
2598 | |
2599 | /// Create an ast::SetFlags. |
2600 | /// |
2601 | /// The given pattern should be the full pattern string. The range given |
2602 | /// should correspond to the byte offsets where the flag set occurs. |
2603 | /// |
2604 | /// If negated is true, then the set is interpreted as beginning with a |
2605 | /// negation. |
2606 | fn flag_set( |
2607 | pat: &str, |
2608 | range: Range<usize>, |
2609 | flag: ast::Flag, |
2610 | negated: bool, |
2611 | ) -> Ast { |
2612 | let mut items = vec![ast::FlagsItem { |
2613 | span: span_range(pat, (range.end - 2)..(range.end - 1)), |
2614 | kind: ast::FlagsItemKind::Flag(flag), |
2615 | }]; |
2616 | if negated { |
2617 | items.insert( |
2618 | 0, |
2619 | ast::FlagsItem { |
2620 | span: span_range(pat, (range.start + 2)..(range.end - 2)), |
2621 | kind: ast::FlagsItemKind::Negation, |
2622 | }, |
2623 | ); |
2624 | } |
2625 | Ast::flags(ast::SetFlags { |
2626 | span: span_range(pat, range.clone()), |
2627 | flags: ast::Flags { |
2628 | span: span_range(pat, (range.start + 2)..(range.end - 1)), |
2629 | items, |
2630 | }, |
2631 | }) |
2632 | } |
2633 | |
/// Checks which patterns parse under a given nest limit and which span is
/// reported when the limit is exceeded.
#[test]
fn parse_nest_limit() {
    // Expected error when the nest limit `limit` is exceeded at `at`.
    let exceeded = |at: Range<usize>, limit: u32| TestError {
        span: span(at),
        kind: ast::ErrorKind::NestLimitExceeded(limit),
    };
    // Expected AST for a greedy repetition of `inner`.
    let greedy = |whole: Range<usize>,
                  op: Range<usize>,
                  kind: ast::RepetitionKind,
                  inner: Ast| {
        Ast::repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind },
            greedy: true,
            ast: Box::new(inner),
        })
    };

    // A nest limit of 0 still allows some types of regexes.
    let parsed = parser_nest_limit("", 0).parse();
    assert_eq!(parsed, Ok(Ast::empty(span(0..0))));
    let parsed = parser_nest_limit("a", 0).parse();
    assert_eq!(parsed, Ok(lit('a', 0)));

    // Test repetition operations, which require one level of nesting.
    assert_eq!(
        parser_nest_limit("a+", 0).parse().unwrap_err(),
        exceeded(0..2, 0)
    );
    assert_eq!(
        parser_nest_limit("a+", 1).parse(),
        Ok(greedy(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser_nest_limit("(a)+", 1).parse().unwrap_err(),
        exceeded(0..3, 1)
    );
    assert_eq!(
        parser_nest_limit("a+*", 1).parse().unwrap_err(),
        exceeded(0..2, 1)
    );
    assert_eq!(
        parser_nest_limit("a+*", 2).parse(),
        Ok(greedy(
            0..3,
            2..3,
            ast::RepetitionKind::ZeroOrMore,
            greedy(0..2, 1..2, ast::RepetitionKind::OneOrMore, lit('a', 0))
        ))
    );

    // Test concatenations. A concatenation requires one level of nesting.
    assert_eq!(
        parser_nest_limit("ab", 0).parse().unwrap_err(),
        exceeded(0..2, 0)
    );
    assert_eq!(
        parser_nest_limit("ab", 1).parse(),
        Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
    );
    assert_eq!(
        parser_nest_limit("abc", 1).parse(),
        Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
    );

    // Test alternations. An alternation requires one level of nesting.
    assert_eq!(
        parser_nest_limit("a|b", 0).parse().unwrap_err(),
        exceeded(0..3, 0)
    );
    assert_eq!(
        parser_nest_limit("a|b", 1).parse(),
        Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parser_nest_limit("a|b|c", 1).parse(),
        Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
    );

    // Test character classes. Classes form their own mini-recursive
    // syntax!
    assert_eq!(
        parser_nest_limit("[a]", 0).parse().unwrap_err(),
        exceeded(0..3, 0)
    );
    let class_item = ast::ClassSetItem::Literal(ast::Literal {
        span: span(1..2),
        kind: ast::LiteralKind::Verbatim,
        c: 'a',
    });
    assert_eq!(
        parser_nest_limit("[a]", 1).parse(),
        Ok(Ast::class_bracketed(ast::ClassBracketed {
            span: span(0..3),
            negated: false,
            kind: ast::ClassSet::Item(class_item),
        }))
    );
    assert_eq!(
        parser_nest_limit("[ab]", 1).parse().unwrap_err(),
        exceeded(1..3, 1)
    );
    assert_eq!(
        parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
        exceeded(3..7, 2)
    );
    assert_eq!(
        parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
        exceeded(4..6, 3)
    );
    assert_eq!(
        parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
        exceeded(1..5, 1)
    );
    assert_eq!(
        parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
        exceeded(4..6, 2)
    );
}
2791 | |
/// Checks that comments in whitespace-insensitive mode are collected with
/// their spans and are left out of the resulting AST.
#[test]
fn parse_comments() {
    // The pattern is spelled line by line with explicit `\n` so that the
    // whitespace the byte offsets below depend on is unambiguous. Note the
    // two-space indent before comment 3: it places that comment at offset
    // 53 and `bar` at offset 74.
    let pat = concat!(
        "(?x)\n",
        "# This is comment 1.\n",
        "foo # This is comment 2.\n",
        "  # This is comment 3.\n",
        "bar\n",
        "# This is comment 4.",
    );
    let astc = parser(pat).parse_with_comments().unwrap();
    assert_eq!(
        astc.ast,
        concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                lit_with('f', span_range(pat, 26..27)),
                lit_with('o', span_range(pat, 27..28)),
                lit_with('o', span_range(pat, 28..29)),
                lit_with('b', span_range(pat, 74..75)),
                lit_with('a', span_range(pat, 75..76)),
                lit_with('r', span_range(pat, 76..77)),
            ]
        )
    );
    // Each comment span starts at its `#`. The first three spans also take
    // in the terminating newline; the last comment ends the pattern.
    assert_eq!(
        astc.comments,
        vec![
            ast::Comment {
                span: span_range(pat, 5..26),
                comment: s(" This is comment 1."),
            },
            ast::Comment {
                span: span_range(pat, 30..51),
                comment: s(" This is comment 2."),
            },
            ast::Comment {
                span: span_range(pat, 53..74),
                comment: s(" This is comment 3."),
            },
            ast::Comment {
                span: span_range(pat, 78..98),
                comment: s(" This is comment 4."),
            },
        ]
    );
}
2838 | |
/// Checks that a lone `]` parses as a literal and that every escaped meta
/// character parses as a two-byte meta literal.
#[test]
fn parse_holistic() {
    assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
    assert_eq!(
        parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
        Ok(concat(
            0..36,
            vec![
                // The backslash must be written as the single-character
                // literal '\\'.
                meta_lit('\\', span(0..2)),
                meta_lit('.', span(2..4)),
                meta_lit('+', span(4..6)),
                meta_lit('*', span(6..8)),
                meta_lit('?', span(8..10)),
                meta_lit('(', span(10..12)),
                meta_lit(')', span(12..14)),
                meta_lit('|', span(14..16)),
                meta_lit('[', span(16..18)),
                meta_lit(']', span(18..20)),
                meta_lit('{', span(20..22)),
                meta_lit('}', span(22..24)),
                meta_lit('^', span(24..26)),
                meta_lit('$', span(26..28)),
                meta_lit('#', span(28..30)),
                meta_lit('&', span(30..32)),
                meta_lit('-', span(32..34)),
                meta_lit('~', span(34..36)),
            ]
        ))
    );
}
2869 | |
/// Checks parsing under the `x` (ignore whitespace) flag: basic use,
/// toggling, nesting inside a group, whitespace after an opening
/// parenthesis, inside `\x{...}`, and after an escape.
///
/// Several patterns contain runs of more than one space on purpose; the
/// asserted byte offsets depend on the exact amount of whitespace.
#[test]
fn parse_ignore_whitespace() {
    // Test that basic whitespace insensitivity works.
    let pat = "(?x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            nspan(npos(0, 1, 1), npos(7, 1, 8)),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
                lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
            ]
        ))
    );

    // Test that we can toggle whitespace insensitivity.
    let pat = "(?x)a b(?-x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            nspan(npos(0, 1, 1), npos(15, 1, 16)),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
                lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
                flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
                lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
                lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
                lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
            ]
        ))
    );

    // Test that nesting whitespace insensitive flags works.
    let pat = "a (?x:a )a ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..11),
            vec![
                lit_with('a', span_range(pat, 0..1)),
                lit_with(' ', span_range(pat, 1..2)),
                Ast::group(ast::Group {
                    span: span_range(pat, 2..9),
                    kind: ast::GroupKind::NonCapturing(ast::Flags {
                        span: span_range(pat, 4..5),
                        items: vec![ast::FlagsItem {
                            span: span_range(pat, 4..5),
                            kind: ast::FlagsItemKind::Flag(
                                ast::Flag::IgnoreWhitespace
                            ),
                        },],
                    }),
                    ast: Box::new(lit_with('a', span_range(pat, 6..7))),
                }),
                lit_with('a', span_range(pat, 9..10)),
                lit_with(' ', span_range(pat, 10..11)),
            ]
        ))
    );

    // Test that whitespace after an opening paren is insignificant.
    let pat = "(?x)( ?P<foo> a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                Ast::group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::CaptureName {
                        starts_with_p: true,
                        name: ast::CaptureName {
                            span: span_range(pat, 9..12),
                            name: s("foo"),
                            index: 1,
                        }
                    },
                    ast: Box::new(lit_with('a', span_range(pat, 14..15))),
                }),
            ]
        ))
    );
    // Two spaces after `(` so that `a` sits at byte offset 7.
    let pat = "(?x)(  a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                Ast::group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::CaptureIndex(1),
                    ast: Box::new(lit_with('a', span_range(pat, 7..8))),
                }),
            ]
        ))
    );
    // Two spaces after `(` and two after `:` so that the (empty) flags sit
    // at byte offset 8 and `a` at byte offset 11.
    let pat = "(?x)(  ?:  a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                Ast::group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::NonCapturing(ast::Flags {
                        span: span_range(pat, 8..8),
                        items: vec![],
                    }),
                    ast: Box::new(lit_with('a', span_range(pat, 11..12))),
                }),
            ]
        ))
    );
    let pat = r"(?x)\x { 53 }";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                Ast::literal(ast::Literal {
                    span: span(4..13),
                    kind: ast::LiteralKind::HexBrace(
                        ast::HexLiteralKind::X
                    ),
                    c: 'S',
                }),
            ]
        ))
    );

    // Test that whitespace after an escape is OK.
    let pat = r"(?x)\ ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                Ast::literal(ast::Literal {
                    span: span_range(pat, 4..6),
                    kind: ast::LiteralKind::Superfluous,
                    c: ' ',
                }),
            ]
        ))
    );
}
3023 | |
/// Checks that a literal newline in the pattern parses as a verbatim
/// literal and that line/column positions advance across it.
#[test]
fn parse_newlines() {
    // Three bytes: dot, newline, dot.
    let pat = ".\n.";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..3),
            vec![
                Ast::dot(span_range(pat, 0..1)),
                lit_with('\n', span_range(pat, 1..2)),
                Ast::dot(span_range(pat, 2..3)),
            ]
        ))
    );

    // Sixteen bytes; each newline directly follows the preceding word, so
    // the position after it is column 1 of the next line.
    let pat = "foobar\nbaz\nquux\n";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
                lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
                lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
                lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
                lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
                lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
                lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
                lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
                lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
                lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
                lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
                lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
                lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
                lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
                lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
                lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
            ]
        ))
    );
}
3065 | |
/// Checks parsing of the `*`, `+` and `?` repetition operators (greedy and
/// lazy), and the error reported when there is nothing to repeat.
#[test]
fn parse_uncounted_repetition() {
    // Expected AST for a repetition of `inner`.
    let rep = |whole: Range<usize>,
               op: Range<usize>,
               kind: ast::RepetitionKind,
               greedy: bool,
               inner: Ast| {
        Ast::repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind },
            greedy,
            ast: Box::new(inner),
        })
    };
    // Expected error for an operator at offset `at` with no operand.
    let missing = |at: usize| TestError {
        span: span(at..at),
        kind: ast::ErrorKind::RepetitionMissing,
    };

    assert_eq!(
        parser(r"a*").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::ZeroOrMore,
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a+").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            true,
            lit('a', 0)
        ))
    );

    assert_eq!(
        parser(r"a?").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::ZeroOrOne,
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a??").parse(),
        Ok(rep(
            0..3,
            1..3,
            ast::RepetitionKind::ZeroOrOne,
            false,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a?").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::ZeroOrOne,
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a?b").parse(),
        Ok(concat(
            0..3,
            vec![
                rep(
                    0..2,
                    1..2,
                    ast::RepetitionKind::ZeroOrOne,
                    true,
                    lit('a', 0)
                ),
                lit('b', 2),
            ]
        ))
    );
    assert_eq!(
        parser(r"a??b").parse(),
        Ok(concat(
            0..4,
            vec![
                rep(
                    0..3,
                    1..3,
                    ast::RepetitionKind::ZeroOrOne,
                    false,
                    lit('a', 0)
                ),
                lit('b', 3),
            ]
        ))
    );
    assert_eq!(
        parser(r"ab?").parse(),
        Ok(concat(
            0..3,
            vec![
                lit('a', 0),
                rep(
                    1..3,
                    2..3,
                    ast::RepetitionKind::ZeroOrOne,
                    true,
                    lit('b', 1)
                ),
            ]
        ))
    );
    assert_eq!(
        parser(r"(ab)?").parse(),
        Ok(rep(
            0..5,
            4..5,
            ast::RepetitionKind::ZeroOrOne,
            true,
            group(0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2)]))
        ))
    );
    assert_eq!(
        parser(r"|a?").parse(),
        Ok(alt(
            0..3,
            vec![
                Ast::empty(span(0..0)),
                rep(
                    1..3,
                    2..3,
                    ast::RepetitionKind::ZeroOrOne,
                    true,
                    lit('a', 1)
                ),
            ]
        ))
    );

    assert_eq!(parser(r"*").parse().unwrap_err(), missing(0));
    assert_eq!(parser(r"(?i)*").parse().unwrap_err(), missing(4));
    assert_eq!(parser(r"(*)").parse().unwrap_err(), missing(1));
    assert_eq!(parser(r"(?:?)").parse().unwrap_err(), missing(3));
    assert_eq!(parser(r"+").parse().unwrap_err(), missing(0));
    assert_eq!(parser(r"?").parse().unwrap_err(), missing(0));
    assert_eq!(parser(r"(?)").parse().unwrap_err(), missing(1));
    assert_eq!(parser(r"|*").parse().unwrap_err(), missing(1));
    assert_eq!(parser(r"|+").parse().unwrap_err(), missing(1));
    assert_eq!(parser(r"|?").parse().unwrap_err(), missing(1));
}
3289 | |
/// Checks parsing of counted repetitions (`{n}`, `{n,}`, `{n,m}`), with
/// and without interior whitespace, and the errors reported for malformed
/// or operand-less counted repetitions.
#[test]
fn parse_counted_repetition() {
    // Expected AST for a counted repetition of `inner`.
    let counted = |whole: Range<usize>,
                   op: Range<usize>,
                   range: ast::RepetitionRange,
                   greedy: bool,
                   inner: Ast| {
        Ast::repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp {
                span: span(op),
                kind: ast::RepetitionKind::Range(range),
            },
            greedy,
            ast: Box::new(inner),
        })
    };
    // Expected error of the given kind at `at`.
    let err = |at: Range<usize>, kind: ast::ErrorKind| TestError {
        span: span(at),
        kind,
    };

    assert_eq!(
        parser(r"a{5}").parse(),
        Ok(counted(
            0..4,
            1..4,
            ast::RepetitionRange::Exactly(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5,}").parse(),
        Ok(counted(
            0..5,
            1..5,
            ast::RepetitionRange::AtLeast(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5,9}").parse(),
        Ok(counted(
            0..6,
            1..6,
            ast::RepetitionRange::Bounded(5, 9),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5}?").parse(),
        Ok(counted(
            0..5,
            1..5,
            ast::RepetitionRange::Exactly(5),
            false,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"ab{5}").parse(),
        Ok(concat(
            0..5,
            vec![
                lit('a', 0),
                counted(
                    1..5,
                    2..5,
                    ast::RepetitionRange::Exactly(5),
                    true,
                    lit('b', 1)
                ),
            ]
        ))
    );
    assert_eq!(
        parser(r"ab{5}c").parse(),
        Ok(concat(
            0..6,
            vec![
                lit('a', 0),
                counted(
                    1..5,
                    2..5,
                    ast::RepetitionRange::Exactly(5),
                    true,
                    lit('b', 1)
                ),
                lit('c', 5),
            ]
        ))
    );

    assert_eq!(
        parser(r"a{ 5 }").parse(),
        Ok(counted(
            0..6,
            1..6,
            ast::RepetitionRange::Exactly(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{ 5 , 9 }").parse(),
        Ok(counted(
            0..10,
            1..10,
            ast::RepetitionRange::Bounded(5, 9),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser_empty_min_range(r"a{,9}").parse(),
        Ok(counted(
            0..5,
            1..5,
            ast::RepetitionRange::Bounded(0, 9),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser_ignore_whitespace(r"a{5,9} ?").parse(),
        Ok(counted(
            0..8,
            1..8,
            ast::RepetitionRange::Bounded(5, 9),
            false,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"\b{5,9}").parse(),
        Ok(counted(
            0..7,
            2..7,
            ast::RepetitionRange::Bounded(5, 9),
            true,
            Ast::assertion(ast::Assertion {
                span: span(0..2),
                kind: ast::AssertionKind::WordBoundary,
            })
        ))
    );

    assert_eq!(
        parser(r"(?i){0}").parse().unwrap_err(),
        err(4..4, ast::ErrorKind::RepetitionMissing)
    );
    assert_eq!(
        parser(r"(?m){1,1}").parse().unwrap_err(),
        err(4..4, ast::ErrorKind::RepetitionMissing)
    );
    assert_eq!(
        parser(r"a{]}").parse().unwrap_err(),
        err(2..2, ast::ErrorKind::RepetitionCountDecimalEmpty)
    );
    assert_eq!(
        parser(r"a{1,]}").parse().unwrap_err(),
        err(4..4, ast::ErrorKind::RepetitionCountDecimalEmpty)
    );
    assert_eq!(
        parser(r"a{").parse().unwrap_err(),
        err(1..2, ast::ErrorKind::RepetitionCountUnclosed)
    );
    assert_eq!(
        parser(r"a{}").parse().unwrap_err(),
        err(2..2, ast::ErrorKind::RepetitionCountDecimalEmpty)
    );
    assert_eq!(
        parser(r"a{a").parse().unwrap_err(),
        err(2..2, ast::ErrorKind::RepetitionCountDecimalEmpty)
    );
    assert_eq!(
        parser(r"a{9999999999}").parse().unwrap_err(),
        err(2..12, ast::ErrorKind::DecimalInvalid)
    );
    assert_eq!(
        parser(r"a{9").parse().unwrap_err(),
        err(1..3, ast::ErrorKind::RepetitionCountUnclosed)
    );
    assert_eq!(
        parser(r"a{9,a").parse().unwrap_err(),
        err(4..4, ast::ErrorKind::RepetitionCountDecimalEmpty)
    );
    assert_eq!(
        parser(r"a{9,9999999999}").parse().unwrap_err(),
        err(4..14, ast::ErrorKind::DecimalInvalid)
    );
    assert_eq!(
        parser(r"a{9,").parse().unwrap_err(),
        err(1..4, ast::ErrorKind::RepetitionCountUnclosed)
    );
    assert_eq!(
        parser(r"a{9,11").parse().unwrap_err(),
        err(1..6, ast::ErrorKind::RepetitionCountUnclosed)
    );
    assert_eq!(
        parser(r"a{2,1}").parse().unwrap_err(),
        err(1..6, ast::ErrorKind::RepetitionCountInvalid)
    );
    assert_eq!(
        parser(r"{5}").parse().unwrap_err(),
        err(0..0, ast::ErrorKind::RepetitionMissing)
    );
    assert_eq!(
        parser(r"|{5}").parse().unwrap_err(),
        err(1..1, ast::ErrorKind::RepetitionMissing)
    );
}
3577 | |
/// Checks parsing of alternations: flat, grouped, nested, with empty
/// branches, and with unbalanced parentheses.
#[test]
fn parse_alternate() {
    // Empty AST located at offset `at`.
    let empty = |at: usize| Ast::empty(span(at..at));
    // Concatenation of two adjacent one-byte literals starting at `start`.
    let pair = |start: usize, first: char, second: char| {
        concat(
            start..start + 2,
            vec![lit(first, start), lit(second, start + 1)],
        )
    };

    assert_eq!(
        parser(r"a|b").parse(),
        Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parser(r"(a|b)").parse(),
        Ok(group(0..5, 1, alt(1..4, vec![lit('a', 1), lit('b', 3)])))
    );

    assert_eq!(
        parser(r"a|b|c").parse(),
        Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
    );
    assert_eq!(
        parser(r"ax|by|cz").parse(),
        Ok(alt(
            0..8,
            vec![pair(0, 'a', 'x'), pair(3, 'b', 'y'), pair(6, 'c', 'z')]
        ))
    );
    assert_eq!(
        parser(r"(ax|by|cz)").parse(),
        Ok(group(
            0..10,
            1,
            alt(
                1..9,
                vec![
                    pair(1, 'a', 'x'),
                    pair(4, 'b', 'y'),
                    pair(7, 'c', 'z'),
                ]
            )
        ))
    );
    // Build the nested expectation from the innermost group outwards.
    let innermost = group(8..12, 3, pair(9, 'c', 'z'));
    let middle =
        group(4..13, 2, alt(5..12, vec![pair(5, 'b', 'y'), innermost]));
    assert_eq!(
        parser(r"(ax|(by|(cz)))").parse(),
        Ok(group(0..14, 1, alt(1..13, vec![pair(1, 'a', 'x'), middle])))
    );

    assert_eq!(
        parser(r"|").parse(),
        Ok(alt(0..1, vec![empty(0), empty(1)]))
    );
    assert_eq!(
        parser(r"||").parse(),
        Ok(alt(0..2, vec![empty(0), empty(1), empty(2)]))
    );
    assert_eq!(
        parser(r"a|").parse(),
        Ok(alt(0..2, vec![lit('a', 0), empty(2)]))
    );
    assert_eq!(
        parser(r"|a").parse(),
        Ok(alt(0..2, vec![empty(0), lit('a', 1)]))
    );

    assert_eq!(
        parser(r"(|)").parse(),
        Ok(group(0..3, 1, alt(1..2, vec![empty(1), empty(2)])))
    );
    assert_eq!(
        parser(r"(a|)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![lit('a', 1), empty(3)])))
    );
    assert_eq!(
        parser(r"(|a)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![empty(1), lit('a', 2)])))
    );

    assert_eq!(
        parser(r"a|b)").parse().unwrap_err(),
        TestError { span: span(3..4), kind: ast::ErrorKind::GroupUnopened }
    );
    assert_eq!(
        parser(r"(a|b").parse().unwrap_err(),
        TestError { span: span(0..1), kind: ast::ErrorKind::GroupUnclosed }
    );
}
3737 | |
/// Each look-around opener (`(?=`, `(?!`, `(?<=`, `(?<!`) must be rejected
/// with `UnsupportedLookAround`, and the error span must cover exactly the
/// opener.
#[test]
fn parse_unsupported_lookaround() {
    // (pattern, byte length of the look-around opener)
    let cases = [
        (r"(?=a)", 3),
        (r"(?!a)", 3),
        (r"(?<=a)", 4),
        (r"(?<!a)", 4),
    ];
    for (pattern, opener_len) in cases {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(0..opener_len),
                kind: ast::ErrorKind::UnsupportedLookAround,
            },
            "pattern: {:?}",
            pattern
        );
    }
}
3769 | |
/// Exercises group parsing: inline flag directives such as `(?i)`, capturing
/// groups, non-capturing groups with and without flags, and the errors
/// reported for unclosed, unopened or otherwise malformed groups.
#[test]
fn parse_group() {
    // Small builders so that every expectation below stays compact.
    let flag = |range, which| ast::FlagsItem {
        span: span(range),
        kind: ast::FlagsItemKind::Flag(which),
    };
    let negation = |range| ast::FlagsItem {
        span: span(range),
        kind: ast::FlagsItemKind::Negation,
    };
    let flags = |range, items| ast::Flags { span: span(range), items };
    let set_flags = |range, inner| {
        Ast::flags(ast::SetFlags { span: span(range), flags: inner })
    };
    let capture = |range, index, inner| {
        Ast::group(ast::Group {
            span: span(range),
            kind: ast::GroupKind::CaptureIndex(index),
            ast: Box::new(inner),
        })
    };
    let non_capture = |range, inner_flags, inner| {
        Ast::group(ast::Group {
            span: span(range),
            kind: ast::GroupKind::NonCapturing(inner_flags),
            ast: Box::new(inner),
        })
    };

    // Flag directives without a sub-expression.
    assert_eq!(
        parser("(?i)").parse(),
        Ok(set_flags(
            0..4,
            flags(2..3, vec![flag(2..3, ast::Flag::CaseInsensitive)]),
        ))
    );
    assert_eq!(
        parser("(?iU)").parse(),
        Ok(set_flags(
            0..5,
            flags(
                2..4,
                vec![
                    flag(2..3, ast::Flag::CaseInsensitive),
                    flag(3..4, ast::Flag::SwapGreed),
                ],
            ),
        ))
    );
    assert_eq!(
        parser("(?i-U)").parse(),
        Ok(set_flags(
            0..6,
            flags(
                2..5,
                vec![
                    flag(2..3, ast::Flag::CaseInsensitive),
                    negation(3..4),
                    flag(4..5, ast::Flag::SwapGreed),
                ],
            ),
        ))
    );

    // Capturing groups are numbered in the order their `(` appears.
    assert_eq!(
        parser("()").parse(),
        Ok(capture(0..2, 1, Ast::empty(span(1..1))))
    );
    assert_eq!(parser("(a)").parse(), Ok(capture(0..3, 1, lit('a', 1))));
    assert_eq!(
        parser("(())").parse(),
        Ok(capture(0..4, 1, capture(1..3, 2, Ast::empty(span(2..2)))))
    );

    // Non-capturing groups, with an empty or a populated flag list.
    assert_eq!(
        parser("(?:a)").parse(),
        Ok(non_capture(0..5, flags(2..2, vec![]), lit('a', 3)))
    );
    assert_eq!(
        parser("(?i:a)").parse(),
        Ok(non_capture(
            0..6,
            flags(2..3, vec![flag(2..3, ast::Flag::CaseInsensitive)]),
            lit('a', 4),
        ))
    );
    assert_eq!(
        parser("(?i-U:a)").parse(),
        Ok(non_capture(
            0..8,
            flags(
                2..5,
                vec![
                    flag(2..3, ast::Flag::CaseInsensitive),
                    negation(3..4),
                    flag(4..5, ast::Flag::SwapGreed),
                ],
            ),
            lit('a', 6),
        ))
    );

    // Malformed input: (pattern, error span, error kind).
    let rejected = [
        ("(", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(?", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(?P", 2..3, ast::ErrorKind::FlagUnrecognized),
        ("(?P<", 4..4, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(a", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(()", 0..1, ast::ErrorKind::GroupUnclosed),
        (")", 0..1, ast::ErrorKind::GroupUnopened),
        ("a)", 1..2, ast::ErrorKind::GroupUnopened),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: span(range), kind },
            "pattern: {:?}",
            pattern
        );
    }
}
3981 | |
/// Exercises named capture groups in both the `(?<name>...)` and
/// `(?P<name>...)` spellings, including non-ASCII names, and the errors
/// reported for empty, unterminated, invalid and duplicate names.
#[test]
fn parse_capture_name() {
    // Expected AST for a named group; every group here has index 1.
    let named = |whole, starts_with_p, name_span, name: &str, inner| {
        Ast::group(ast::Group {
            span: whole,
            kind: ast::GroupKind::CaptureName {
                starts_with_p,
                name: ast::CaptureName {
                    span: name_span,
                    name: s(name),
                    index: 1,
                },
            },
            ast: Box::new(inner),
        })
    };
    // Span on line 1 from explicit `(byte offset, column)` endpoints. Used
    // where multi-byte characters make offsets and columns diverge.
    let at = |(start, start_col), (end, end_col)| {
        Span::new(
            Position::new(start, 1, start_col),
            Position::new(end, 1, end_col),
        )
    };

    // (pattern, group end, uses `P`?, name span, name, offset of `z`)
    let ascii = [
        ("(?<a>z)", 7, false, 3..4, "a", 5),
        ("(?P<a>z)", 8, true, 4..5, "a", 6),
        ("(?P<abc>z)", 10, true, 4..7, "abc", 8),
        ("(?P<a_1>z)", 10, true, 4..7, "a_1", 8),
        ("(?P<a.1>z)", 10, true, 4..7, "a.1", 8),
        ("(?P<a[1]>z)", 11, true, 4..8, "a[1]", 9),
    ];
    for (pattern, end, starts_with_p, name_range, name, z_at) in ascii {
        assert_eq!(
            parser(pattern).parse(),
            Ok(named(
                span(0..end),
                starts_with_p,
                span(name_range),
                name,
                lit('z', z_at),
            )),
            "pattern: {:?}",
            pattern
        );
    }

    // Names containing multi-byte characters.
    assert_eq!(
        parser("(?P<a¾>)").parse(),
        Ok(named(
            at((0, 1), (9, 9)),
            true,
            at((4, 5), (7, 7)),
            "a¾",
            Ast::empty(at((8, 8), (8, 8))),
        ))
    );
    assert_eq!(
        parser("(?P<名字>)").parse(),
        Ok(named(
            at((0, 1), (12, 9)),
            true,
            at((4, 5), (10, 7)),
            "名字",
            Ast::empty(at((11, 8), (11, 8))),
        ))
    );

    // Rejected names: (pattern, error span, error kind).
    let rejected = [
        ("(?P<", span(4..4), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<>z)", span(4..4), ast::ErrorKind::GroupNameEmpty),
        ("(?P<a", span(5..5), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<ab", span(6..6), ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<0a", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<~", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<abc~", span(7..8), ast::ErrorKind::GroupNameInvalid),
        (
            "(?P<a>y)(?P<a>z)",
            span(12..13),
            ast::ErrorKind::GroupNameDuplicate { original: span(4..5) },
        ),
        ("(?P<5>)", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<5a>)", span(4..5), ast::ErrorKind::GroupNameInvalid),
        ("(?P<¾>)", at((4, 5), (6, 6)), ast::ErrorKind::GroupNameInvalid),
        ("(?P<¾a>)", at((4, 5), (6, 6)), ast::ErrorKind::GroupNameInvalid),
        ("(?P<☃>)", at((4, 5), (7, 6)), ast::ErrorKind::GroupNameInvalid),
        ("(?P<a☃>)", at((5, 6), (8, 7)), ast::ErrorKind::GroupNameInvalid),
    ];
    for (pattern, error_span, kind) in rejected {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: error_span, kind },
            "pattern: {:?}",
            pattern
        );
    }
}
4240 | |
/// Exercises `parse_flags` on flag lists terminated by `:` or `)`, with and
/// without a negation, and on the error cases (missing terminator, unknown
/// flag, duplicate flag, repeated negation, dangling negation).
#[test]
fn parse_flags() {
    // Builds the expected `Flags` from item kinds in source order. Every
    // flag item in these inputs is one byte wide and the list starts at
    // offset 0, so item `n` spans `n..n + 1` and the list spans `0..len`.
    let flags_of = |kinds: Vec<ast::FlagsItemKind>| ast::Flags {
        span: span(0..kinds.len()),
        items: kinds
            .into_iter()
            .enumerate()
            .map(|(at, kind)| ast::FlagsItem { span: span(at..at + 1), kind })
            .collect(),
    };
    let i = || ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive);
    let dot_nl = || ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine);
    let swap = || ast::FlagsItemKind::Flag(ast::Flag::SwapGreed);
    let crlf = || ast::FlagsItemKind::Flag(ast::Flag::CRLF);
    let neg = || ast::FlagsItemKind::Negation;

    let accepted = [
        ("i:", vec![i()]),
        ("i)", vec![i()]),
        ("isU:", vec![i(), dot_nl(), swap()]),
        ("-isU:", vec![neg(), i(), dot_nl(), swap()]),
        ("i-sU:", vec![i(), neg(), dot_nl(), swap()]),
        ("i-sR:", vec![i(), neg(), dot_nl(), crlf()]),
    ];
    for (pattern, kinds) in accepted {
        assert_eq!(
            parser(pattern).parse_flags(),
            Ok(flags_of(kinds)),
            "flags: {:?}",
            pattern
        );
    }

    // (input, error span, error kind)
    let rejected = [
        ("isU", 3..3, ast::ErrorKind::FlagUnexpectedEof),
        ("isUa:", 3..4, ast::ErrorKind::FlagUnrecognized),
        (
            "isUi:",
            3..4,
            ast::ErrorKind::FlagDuplicate { original: span(0..1) },
        ),
        (
            "i-sU-i:",
            4..5,
            ast::ErrorKind::FlagRepeatedNegation { original: span(1..2) },
        ),
        ("-)", 0..1, ast::ErrorKind::FlagDanglingNegation),
        ("i-)", 1..2, ast::ErrorKind::FlagDanglingNegation),
        ("iU-)", 2..3, ast::ErrorKind::FlagDanglingNegation),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_flags().unwrap_err(),
            TestError { span: span(range), kind },
            "flags: {:?}",
            pattern
        );
    }
}
4426 | |
/// Checks that each recognized flag letter maps to its `ast::Flag` variant
/// and that any other character yields `FlagUnrecognized` spanning that
/// character.
#[test]
fn parse_flag() {
    let known = [
        ("i", ast::Flag::CaseInsensitive),
        ("m", ast::Flag::MultiLine),
        ("s", ast::Flag::DotMatchesNewLine),
        ("U", ast::Flag::SwapGreed),
        ("u", ast::Flag::Unicode),
        ("R", ast::Flag::CRLF),
        ("x", ast::Flag::IgnoreWhitespace),
    ];
    for (letter, flag) in known {
        assert_eq!(
            parser(letter).parse_flag(),
            Ok(flag),
            "flag: {:?}",
            letter
        );
    }

    // The snowman is three bytes long, hence the explicit byte range.
    let unknown = [("a", span(0..1)), ("☃", span_range("☃", 0..3))];
    for (letter, error_span) in unknown {
        assert_eq!(
            parser(letter).parse_flag().unwrap_err(),
            TestError {
                span: error_span,
                kind: ast::ErrorKind::FlagUnrecognized,
            },
            "flag: {:?}",
            letter
        );
    }
}
4452 | |
/// Checks `parse_primitive` on single unescaped characters: the dot, the
/// line anchors, and verbatim literals (including `|` and a multi-byte
/// character).
#[test]
fn parse_primitive_non_escape() {
    let anchor = |kind| {
        Primitive::Assertion(ast::Assertion { span: span(0..1), kind })
    };
    let verbatim = |at, c| {
        Primitive::Literal(ast::Literal {
            span: at,
            kind: ast::LiteralKind::Verbatim,
            c,
        })
    };

    let cases = [
        (r".", Primitive::Dot(span(0..1))),
        (r"^", anchor(ast::AssertionKind::StartLine)),
        (r"$", anchor(ast::AssertionKind::EndLine)),
        (r"a", verbatim(span(0..1), 'a')),
        (r"|", verbatim(span(0..1), '|')),
        (r"☃", verbatim(span_range("☃", 0..3), '☃')),
    ];
    for (pattern, expected) in cases {
        assert_eq!(
            parser(pattern).parse_primitive(),
            Ok(expected),
            "pattern: {:?}",
            pattern
        );
    }
}
4499 | |
/// Exercises escape sequences: escaped meta characters, the special
/// single-character escapes (`\a`, `\f`, `\t`, `\n`, `\r`, `\v`), the
/// assertion escapes (including the `\b{...}` word-boundary forms),
/// superfluous escapes, and the error cases.
#[test]
fn parse_escape() {
    // Two-byte escaped literal such as `\|` or `\t`.
    let literal = |kind, c| {
        Primitive::Literal(ast::Literal { span: span(0..2), kind, c })
    };
    let assertion = |range, kind| {
        Primitive::Assertion(ast::Assertion { span: span(range), kind })
    };
    let failure = |range, kind| TestError { span: span(range), kind };

    assert_eq!(
        parser(r"\|").parse_primitive(),
        Ok(literal(ast::LiteralKind::Meta, '|'))
    );

    // Each expected value is a single `char`; the escape inside the quotes
    // must not be preceded by whitespace.
    let specials = [
        (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
        (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
        (r"\t", '\t', ast::SpecialLiteralKind::Tab),
        (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
        (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
        (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
    ];
    for (pat, c, kind) in specials {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(literal(ast::LiteralKind::Special(kind), c)),
            "pattern: {:?}",
            pat
        );
    }

    // (pattern, end of the assertion's span, assertion kind)
    let assertions = [
        (r"\A", 2, ast::AssertionKind::StartText),
        (r"\z", 2, ast::AssertionKind::EndText),
        (r"\b", 2, ast::AssertionKind::WordBoundary),
        (r"\b{start}", 9, ast::AssertionKind::WordBoundaryStart),
        (r"\b{end}", 7, ast::AssertionKind::WordBoundaryEnd),
        (r"\b{start-half}", 14, ast::AssertionKind::WordBoundaryStartHalf),
        (r"\b{end-half}", 12, ast::AssertionKind::WordBoundaryEndHalf),
        (r"\<", 2, ast::AssertionKind::WordBoundaryStartAngle),
        (r"\>", 2, ast::AssertionKind::WordBoundaryEndAngle),
        (r"\B", 2, ast::AssertionKind::NotWordBoundary),
    ];
    for (pat, end, kind) in assertions {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(assertion(0..end, kind)),
            "pattern: {:?}",
            pat
        );
    }

    // We also support superfluous escapes in most cases now too.
    for c in ['!', '@', '%', '"', '\'', '/', ' '] {
        let pat = format!(r"\{}", c);
        assert_eq!(
            parser(&pat).parse_primitive(),
            Ok(literal(ast::LiteralKind::Superfluous, c)),
            "pattern: {:?}",
            pat
        );
    }

    // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This
    // gives flexibility for future evolution.
    for pat in [r"\e", r"\y"] {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            failure(0..2, ast::ErrorKind::EscapeUnrecognized),
            "pattern: {:?}",
            pat
        );
    }

    // Starting a special word boundary without any non-whitespace chars
    // after the brace makes it ambiguous whether the user meant to write
    // a counted repetition (probably not?) or an actual special word
    // boundary assertion.
    assert_eq!(
        parser(r"\b{").parse_escape().unwrap_err(),
        failure(
            0..3,
            ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof
        )
    );
    assert_eq!(
        parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(),
        failure(
            0..4,
            ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof
        )
    );
    // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char,
    // and thus causes the parser to treat it as a counted repetition.
    assert_eq!(
        parser(r"\b{ ").parse().unwrap_err(),
        failure(2..4, ast::ErrorKind::RepetitionCountUnclosed)
    );
    // In this case, we got some valid chars that make it look like the
    // user is writing one of the special word boundary assertions, but
    // the brace is never closed.
    assert_eq!(
        parser(r"\b{foo").parse_escape().unwrap_err(),
        failure(2..6, ast::ErrorKind::SpecialWordBoundaryUnclosed)
    );
    // We get the same error as above, except it is provoked by seeing a
    // char that we know is invalid before seeing a closing brace.
    assert_eq!(
        parser(r"\b{foo!}").parse_escape().unwrap_err(),
        failure(2..6, ast::ErrorKind::SpecialWordBoundaryUnclosed)
    );
    // And this one occurs when, syntactically, everything looks okay, but
    // we don't use a valid spelling of a word boundary assertion.
    assert_eq!(
        parser(r"\b{foo}").parse_escape().unwrap_err(),
        failure(3..6, ast::ErrorKind::SpecialWordBoundaryUnrecognized)
    );

    // An unfinished escape is illegal.
    assert_eq!(
        parser(r"\").parse_escape().unwrap_err(),
        failure(0..1, ast::ErrorKind::EscapeUnexpectedEof)
    );
}
4694 | |
/// A backslash followed by a decimal digit is rejected by `parse_escape`
/// with `UnsupportedBackreference`; the span covers both characters.
#[test]
fn parse_unsupported_backreference() {
    for pattern in [r"\0", r"\9"] {
        let expected = TestError {
            span: span(0..2),
            kind: ast::ErrorKind::UnsupportedBackreference,
        };
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            expected,
            "pattern: {:?}",
            pattern
        );
    }
}
4712 | |
/// Exercises octal escapes with a parser built by `parser_octal`: every
/// value from `\0` through `\777`, inputs where trailing characters must
/// not be consumed by the escape, and the rejection of `\8`.
#[test]
fn parse_octal() {
    let octal = |range, c| ast::Literal {
        span: span(range),
        kind: ast::LiteralKind::Octal,
        c,
    };
    let verbatim = |range, c| ast::Literal {
        span: span(range),
        kind: ast::LiteralKind::Verbatim,
        c,
    };

    // The range is inclusive so that `\777`, the largest three-digit
    // escape, is covered as well.
    for i in 0..=0o777u32 {
        let pat = format!(r"\{:o}", i);
        let c = char::from_u32(i)
            .expect("values up to 0o777 are below the surrogate range");
        assert_eq!(
            parser_octal(&pat).parse_escape(),
            Ok(Primitive::Literal(octal(0..pat.len(), c))),
            "pattern: {:?}",
            pat
        );
    }

    // `8` is not an octal digit, so the escape ends after `\77`.
    assert_eq!(
        parser_octal(r"\778").parse_escape(),
        Ok(Primitive::Literal(octal(0..3, '?')))
    );
    // At most three digits are consumed; the fourth `7` is left over.
    assert_eq!(
        parser_octal(r"\7777").parse_escape(),
        Ok(Primitive::Literal(octal(0..4, '\u{01FF}')))
    );

    // In a full parse the leftover character becomes a verbatim literal.
    assert_eq!(
        parser_octal(r"\778").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::literal(octal(0..3, '?')),
                Ast::literal(verbatim(3..4, '8')),
            ],
        }))
    );
    assert_eq!(
        parser_octal(r"\7777").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..5),
            asts: vec![
                Ast::literal(octal(0..4, '\u{01FF}')),
                Ast::literal(verbatim(4..5, '7')),
            ],
        }))
    );

    assert_eq!(
        parser_octal(r"\8").parse_escape().unwrap_err(),
        TestError {
            span: span(0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        }
    );
}
4787 | |
/// Exercises the two-digit `\xNN` escape: all 256 values, followed by the
/// errors for a missing or invalid hex digit.
#[test]
fn parse_hex_two() {
    for byte in 0..=u8::MAX {
        let pat = format!(r"\x{:02x}", byte);
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
            c: char::from(byte),
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // (pattern, error span, error kind)
    let rejected = [
        (r"\xF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\xG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\xFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind },
            "pattern: {:?}",
            pattern
        );
    }
}
4824 | |
/// Exercises the four-digit `\uNNNN` escape: every value below 65536 that
/// is a valid `char`, followed by the errors for truncated input, invalid
/// hex digits, and a value that is not a valid `char` (`\uD800`).
#[test]
fn parse_hex_four() {
    // `char::from_u32` returns `None` for values that are not valid
    // chars, so `filter_map` skips them.
    for c in (0..65536).filter_map(char::from_u32) {
        let pat = format!(r"\u{:04x}", u32::from(c));
        let expected = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(
                ast::HexLiteralKind::UnicodeShort,
            ),
            c,
        };
        assert_eq!(
            parser(&pat).parse_escape(),
            Ok(Primitive::Literal(expected))
        );
    }

    // (pattern, error span, error kind)
    let rejected = [
        (r"\uF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\uG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFG", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uFFFG", 5..6, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\uD800", 2..6, ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pattern, range, kind) in rejected {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind },
            "pattern: {:?}",
            pattern
        );
    }
}
4888 | |
#[test]
fn parse_hex_eight() {
    // Each valid `char` below 0x10000 must round-trip through a fixed-width
    // `\UNNNNNNNN` escape (zero-padded to eight digits).
    for cp in 0..65536 {
        // Values for which `char::from_u32` returns `None` are skipped.
        if let Some(expected) = char::from_u32(cp) {
            let pattern = format!(r"\U{:08x}", cp);
            let want = Primitive::Literal(ast::Literal {
                span: span(0..pattern.len()),
                kind: ast::LiteralKind::HexFixed(
                    ast::HexLiteralKind::UnicodeLong,
                ),
                c: expected,
            });
            assert_eq!(parser(&pattern).parse_escape(), Ok(want));
        }
    }

    // (pattern, error span, error kind). The invalid digit `G` is moved one
    // position to the right in each successive case.
    let failures = [
        (r"\UF", 3..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\UG", 2..3, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFG", 3..4, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFG", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFG", 5..6, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFG", 6..7, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFFG", 7..8, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFFFG", 8..9, ast::ErrorKind::EscapeHexInvalidDigit),
        (r"\UFFFFFFFG", 9..10, ast::ErrorKind::EscapeHexInvalidDigit),
    ];
    for (pattern, range, kind) in failures {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }
}
4973 | |
#[test]
fn parse_hex_brace() {
    // Builds the literal expected for a braced hex escape covering the first
    // `len` bytes of the pattern.
    let braced = |len: usize, kind: ast::HexLiteralKind, c: char| {
        Primitive::Literal(ast::Literal {
            span: span(0..len),
            kind: ast::LiteralKind::HexBrace(kind),
            c,
        })
    };

    // The brace form is accepted after `\u`, `\U` and `\x`, and the hex
    // digits may be written in either case.
    assert_eq!(
        parser(r"\u{26c4}").parse_escape(),
        Ok(braced(8, ast::HexLiteralKind::UnicodeShort, '⛄'))
    );
    assert_eq!(
        parser(r"\U{26c4}").parse_escape(),
        Ok(braced(8, ast::HexLiteralKind::UnicodeLong, '⛄'))
    );
    assert_eq!(
        parser(r"\x{26c4}").parse_escape(),
        Ok(braced(8, ast::HexLiteralKind::X, '⛄'))
    );
    assert_eq!(
        parser(r"\x{26C4}").parse_escape(),
        Ok(braced(8, ast::HexLiteralKind::X, '⛄'))
    );
    // U+10FFFF written with mixed-case digits. The expected value must be
    // exactly that one scalar: a `char` literal cannot hold any extra
    // character (such as a space) next to the escape.
    assert_eq!(
        parser(r"\x{10fFfF}").parse_escape(),
        Ok(braced(10, ast::HexLiteralKind::X, '\u{10FFFF}'))
    );

    // (pattern, error span, error kind)
    let failures = [
        // Input ends before the escape is complete.
        (r"\x", 2..2, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{", 2..3, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\x{FF", 2..5, ast::ErrorKind::EscapeUnexpectedEof),
        // Braces with no digits, or with a non-hex digit.
        (r"\x{}", 2..4, ast::ErrorKind::EscapeHexEmpty),
        (r"\x{FGF}", 4..5, ast::ErrorKind::EscapeHexInvalidDigit),
        // Well-formed digits that do not name a Unicode scalar value:
        // above U+10FFFF, a surrogate, and a value too large for `u32`.
        (r"\x{FFFFFF}", 3..9, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{D800}", 3..7, ast::ErrorKind::EscapeHexInvalid),
        (r"\x{FFFFFFFFF}", 3..12, ast::ErrorKind::EscapeHexInvalid),
    ];
    for (pattern, range, kind) in failures {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }
}
5078 | |
#[test]
fn parse_decimal() {
    // Plain digit runs parse to their numeric value; a leading zero is
    // accepted.
    for (pattern, want) in [("123", 123), ("0", 0), ("01", 1)] {
        assert_eq!(parser(pattern).parse_decimal(), Ok(want));
    }

    // (pattern, error span, error kind)
    let failures = [
        // No digit at the current position.
        ("-1", 0..0, ast::ErrorKind::DecimalEmpty),
        ("", 0..0, ast::ErrorKind::DecimalEmpty),
        // Digits only, but rejected as an invalid decimal.
        ("9999999999", 0..10, ast::ErrorKind::DecimalInvalid),
    ];
    for (pattern, range, kind) in failures {
        assert_eq!(
            parser(pattern).parse_decimal().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }
}
5101 | |
#[test]
fn parse_set_class() {
    // ---- Builders for the expected AST -------------------------------

    // A non-negated bracketed class.
    fn bracketed(span: Span, kind: ast::ClassSet) -> ast::ClassBracketed {
        ast::ClassBracketed { span, negated: false, kind }
    }

    // A non-negated bracketed class as a top-level AST node.
    fn class(span: Span, kind: ast::ClassSet) -> Ast {
        Ast::class_bracketed(bracketed(span, kind))
    }

    // A bracketed class nested as an item of another class.
    fn nested(cls: ast::ClassBracketed) -> ast::ClassSetItem {
        ast::ClassSetItem::Bracketed(Box::new(cls))
    }

    fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
        ast::ClassSet::union(ast::ClassSetUnion { span, items })
    }

    fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
        ast::ClassSet::Item(item)
    }

    // Shared constructor for the three binary set operations below.
    fn binop(
        kind: ast::ClassSetBinaryOpKind,
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
            span,
            kind,
            lhs: Box::new(lhs),
            rhs: Box::new(rhs),
        })
    }

    fn intersection(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::Intersection, span, lhs, rhs)
    }

    fn difference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::Difference, span, lhs, rhs)
    }

    fn symdifference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::SymmetricDifference, span, lhs, rhs)
    }

    // A literal written verbatim, e.g. `a`.
    fn lit(span: Span, c: char) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(ast::Literal {
            span,
            kind: ast::LiteralKind::Verbatim,
            c,
        })
    }

    // A literal produced by escaping a meta character, e.g. `\]`.
    fn meta(span: Span, c: char) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(ast::Literal {
            span,
            kind: ast::LiteralKind::Meta,
            c,
        })
    }

    fn empty(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Empty(span)
    }

    // A non-negated `\w`.
    fn word(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Perl(ast::ClassPerl {
            span,
            kind: ast::ClassPerlKind::Word,
            negated: false,
        })
    }

    // A non-negated ASCII class such as `[:alnum:]`.
    fn ascii(span: Span, kind: ast::ClassAsciiKind) -> ast::ClassSetItem {
        ast::ClassSetItem::Ascii(ast::ClassAscii {
            span,
            kind,
            negated: false,
        })
    }

    fn alnum(span: Span) -> ast::ClassSetItem {
        ascii(span, ast::ClassAsciiKind::Alnum)
    }

    fn lower(span: Span) -> ast::ClassSetItem {
        ascii(span, ast::ClassAsciiKind::Lower)
    }

    // A range `start-end` between two verbatim literals. The start literal
    // occupies the first character of `span` and the end literal the last
    // one, so their inner boundaries are derived from the UTF-8 width of
    // each character.
    fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
        let after_start = Position {
            offset: span.start.offset + start.len_utf8(),
            column: span.start.column + 1,
            ..span.start
        };
        let before_end = Position {
            offset: span.end.offset - end.len_utf8(),
            column: span.end.column - 1,
            ..span.end
        };
        let start = ast::Literal {
            span: Span { end: after_start, ..span },
            kind: ast::LiteralKind::Verbatim,
            c: start,
        };
        let end = ast::Literal {
            span: Span { start: before_end, ..span },
            kind: ast::LiteralKind::Verbatim,
            c: end,
        };
        ast::ClassSetItem::Range(ast::ClassSetRange { span, start, end })
    }

    // ---- ASCII classes, nesting and binary operators -----------------

    assert_eq!(
        parser("[[:alnum:]]").parse(),
        Ok(class(span(0..11), itemset(alnum(span(1..10)))))
    );
    assert_eq!(
        parser("[[[:alnum:]]]").parse(),
        Ok(class(
            span(0..13),
            itemset(nested(bracketed(
                span(1..12),
                itemset(alnum(span(2..11))),
            ))),
        ))
    );
    assert_eq!(
        parser("[[:alnum:]&&[:lower:]]").parse(),
        Ok(class(
            span(0..22),
            intersection(
                span(1..21),
                itemset(alnum(span(1..10))),
                itemset(lower(span(12..21))),
            ),
        ))
    );
    assert_eq!(
        parser("[[:alnum:]--[:lower:]]").parse(),
        Ok(class(
            span(0..22),
            difference(
                span(1..21),
                itemset(alnum(span(1..10))),
                itemset(lower(span(12..21))),
            ),
        ))
    );
    assert_eq!(
        parser("[[:alnum:]~~[:lower:]]").parse(),
        Ok(class(
            span(0..22),
            symdifference(
                span(1..21),
                itemset(alnum(span(1..10))),
                itemset(lower(span(12..21))),
            ),
        ))
    );

    // ---- Literals, escapes and Perl/Unicode classes ------------------

    assert_eq!(
        parser("[a]").parse(),
        Ok(class(span(0..3), itemset(lit(span(1..2), 'a'))))
    );
    assert_eq!(
        parser(r"[a\]]").parse(),
        Ok(class(
            span(0..5),
            union(
                span(1..4),
                vec![lit(span(1..2), 'a'), meta(span(2..4), ']')],
            ),
        ))
    );
    assert_eq!(
        parser(r"[a\-z]").parse(),
        Ok(class(
            span(0..6),
            union(
                span(1..5),
                vec![
                    lit(span(1..2), 'a'),
                    meta(span(2..4), '-'),
                    lit(span(4..5), 'z'),
                ],
            ),
        ))
    );
    assert_eq!(
        parser("[ab]").parse(),
        Ok(class(
            span(0..4),
            union(
                span(1..3),
                vec![lit(span(1..2), 'a'), lit(span(2..3), 'b')],
            ),
        ))
    );
    // A `-` next to either bracket is an ordinary literal.
    assert_eq!(
        parser("[a-]").parse(),
        Ok(class(
            span(0..4),
            union(
                span(1..3),
                vec![lit(span(1..2), 'a'), lit(span(2..3), '-')],
            ),
        ))
    );
    assert_eq!(
        parser("[-a]").parse(),
        Ok(class(
            span(0..4),
            union(
                span(1..3),
                vec![lit(span(1..2), '-'), lit(span(2..3), 'a')],
            ),
        ))
    );
    assert_eq!(
        parser(r"[\pL]").parse(),
        Ok(class(
            span(0..5),
            itemset(ast::ClassSetItem::Unicode(ast::ClassUnicode {
                span: span(1..4),
                negated: false,
                kind: ast::ClassUnicodeKind::OneLetter('L'),
            })),
        ))
    );
    assert_eq!(
        parser(r"[\w]").parse(),
        Ok(class(span(0..4), itemset(word(span(1..3)))))
    );
    assert_eq!(
        parser(r"[a\wz]").parse(),
        Ok(class(
            span(0..6),
            union(
                span(1..5),
                vec![
                    lit(span(1..2), 'a'),
                    word(span(2..4)),
                    lit(span(4..5), 'z'),
                ],
            ),
        ))
    );

    // ---- Ranges and chained operators --------------------------------

    assert_eq!(
        parser("[a-z]").parse(),
        Ok(class(span(0..5), itemset(range(span(1..4), 'a', 'z'))))
    );
    assert_eq!(
        parser("[a-cx-z]").parse(),
        Ok(class(
            span(0..8),
            union(
                span(1..7),
                vec![
                    range(span(1..4), 'a', 'c'),
                    range(span(4..7), 'x', 'z'),
                ],
            ),
        ))
    );
    assert_eq!(
        parser(r"[\w&&a-cx-z]").parse(),
        Ok(class(
            span(0..12),
            intersection(
                span(1..11),
                itemset(word(span(1..3))),
                union(
                    span(5..11),
                    vec![
                        range(span(5..8), 'a', 'c'),
                        range(span(8..11), 'x', 'z'),
                    ],
                ),
            ),
        ))
    );
    assert_eq!(
        parser(r"[a-cx-z&&\w]").parse(),
        Ok(class(
            span(0..12),
            intersection(
                span(1..11),
                union(
                    span(1..7),
                    vec![
                        range(span(1..4), 'a', 'c'),
                        range(span(4..7), 'x', 'z'),
                    ],
                ),
                itemset(word(span(9..11))),
            ),
        ))
    );
    // Repeated operators nest to the left.
    assert_eq!(
        parser(r"[a--b--c]").parse(),
        Ok(class(
            span(0..9),
            difference(
                span(1..8),
                difference(
                    span(1..5),
                    itemset(lit(span(1..2), 'a')),
                    itemset(lit(span(4..5), 'b')),
                ),
                itemset(lit(span(7..8), 'c')),
            ),
        ))
    );
    assert_eq!(
        parser(r"[a~~b~~c]").parse(),
        Ok(class(
            span(0..9),
            symdifference(
                span(1..8),
                symdifference(
                    span(1..5),
                    itemset(lit(span(1..2), 'a')),
                    itemset(lit(span(4..5), 'b')),
                ),
                itemset(lit(span(7..8), 'c')),
            ),
        ))
    );
    assert_eq!(
        parser(r"[\^&&^]").parse(),
        Ok(class(
            span(0..7),
            intersection(
                span(1..6),
                itemset(meta(span(1..3), '^')),
                itemset(lit(span(5..6), '^')),
            ),
        ))
    );
    assert_eq!(
        parser(r"[\&&&&]").parse(),
        Ok(class(
            span(0..7),
            intersection(
                span(1..6),
                itemset(meta(span(1..3), '&')),
                itemset(lit(span(5..6), '&')),
            ),
        ))
    );
    // Operators with nothing around them get empty operands.
    assert_eq!(
        parser(r"[&&&&]").parse(),
        Ok(class(
            span(0..6),
            intersection(
                span(1..5),
                intersection(
                    span(1..3),
                    itemset(empty(span(1..1))),
                    itemset(empty(span(3..3))),
                ),
                itemset(empty(span(5..5))),
            ),
        ))
    );

    // Multi-byte range endpoints: the spans are byte offsets into `pat`.
    let pat = "[☃-⛄]";
    assert_eq!(
        parser(pat).parse(),
        Ok(class(
            span_range(pat, 0..9),
            itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
                span: span_range(pat, 1..8),
                start: ast::Literal {
                    span: span_range(pat, 1..4),
                    kind: ast::LiteralKind::Verbatim,
                    c: '☃',
                },
                end: ast::Literal {
                    span: span_range(pat, 5..8),
                    kind: ast::LiteralKind::Verbatim,
                    c: '⛄',
                },
            })),
        ))
    );

    // ---- `]` and `[` as class members --------------------------------

    // A `]` directly after the opening bracket is a literal.
    assert_eq!(
        parser(r"[]]").parse(),
        Ok(class(span(0..3), itemset(lit(span(1..2), ']'))))
    );
    assert_eq!(
        parser(r"[]\[]").parse(),
        Ok(class(
            span(0..5),
            union(
                span(1..4),
                vec![lit(span(1..2), ']'), meta(span(2..4), '[')],
            ),
        ))
    );
    // Here the first `]` closes the class and the second one is a plain
    // literal following it.
    assert_eq!(
        parser(r"[\[]]").parse(),
        Ok(concat(
            0..5,
            vec![
                class(span(0..4), itemset(meta(span(1..3), '['))),
                Ast::literal(ast::Literal {
                    span: span(4..5),
                    kind: ast::LiteralKind::Verbatim,
                    c: ']',
                }),
            ]
        ))
    );

    // ---- Errors ------------------------------------------------------

    // (pattern, error span, error kind)
    let failures = [
        ("[", 0..1, ast::ErrorKind::ClassUnclosed),
        ("[[", 1..2, ast::ErrorKind::ClassUnclosed),
        ("[[-]", 0..1, ast::ErrorKind::ClassUnclosed),
        ("[[[:alnum:]", 1..2, ast::ErrorKind::ClassUnclosed),
        (r"[\b]", 1..3, ast::ErrorKind::ClassEscapeInvalid),
        (r"[\w-a]", 1..3, ast::ErrorKind::ClassRangeLiteral),
        (r"[a-\w]", 3..5, ast::ErrorKind::ClassRangeLiteral),
        (r"[z-a]", 1..4, ast::ErrorKind::ClassRangeInvalid),
    ];
    for (pattern, range, kind) in failures {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError { span: span(range), kind }
        );
    }

    // With whitespace ignored, trailing blanks still leave the class
    // unclosed, and the error points at the opening bracket.
    for pattern in ["[a ", "[a- "] {
        assert_eq!(
            parser_ignore_whitespace(pattern).parse().unwrap_err(),
            TestError {
                span: span(0..1),
                kind: ast::ErrorKind::ClassUnclosed,
            }
        );
    }
}
5684 | |
#[test]
fn parse_set_class_open() {
    // A literal written verbatim inside the class.
    fn lit(span: Span, c: char) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(ast::Literal {
            span,
            kind: ast::LiteralKind::Verbatim,
            c,
        })
    }

    // Builds the `(set, union)` pair expected from `parse_set_class_open`.
    //
    // `set_span` is the span of the opened class, `union_span` the span of
    // the returned union and `items` its contents. In every case below the
    // opened class itself carries an empty union anchored at the start of
    // `union_span`.
    fn opened(
        set_span: Span,
        negated: bool,
        union_span: Span,
        items: Vec<ast::ClassSetItem>,
    ) -> (ast::ClassBracketed, ast::ClassSetUnion) {
        let set = ast::ClassBracketed {
            span: set_span,
            negated,
            kind: ast::ClassSet::union(ast::ClassSetUnion {
                span: Span { end: union_span.start, ..union_span },
                items: vec![],
            }),
        };
        (set, ast::ClassSetUnion { span: union_span, items })
    }

    // The error expected whenever the class is never closed.
    let unclosed = |range: core::ops::Range<usize>| TestError {
        span: span(range),
        kind: ast::ErrorKind::ClassUnclosed,
    };

    // ---- Plain and negated openings ----------------------------------

    assert_eq!(
        parser("[a]").parse_set_class_open(),
        Ok(opened(span(0..1), false, span(1..1), vec![]))
    );
    // Three blanks follow `[` so that `a` sits at offset 4, which is what
    // the expected spans below require.
    assert_eq!(
        parser_ignore_whitespace("[   a]").parse_set_class_open(),
        Ok(opened(span(0..4), false, span(4..4), vec![]))
    );
    assert_eq!(
        parser("[^a]").parse_set_class_open(),
        Ok(opened(span(0..2), true, span(2..2), vec![]))
    );
    assert_eq!(
        parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
        Ok(opened(span(0..4), true, span(4..4), vec![]))
    );

    // ---- Leading `-` is taken as a literal ---------------------------

    assert_eq!(
        parser("[-a]").parse_set_class_open(),
        Ok(opened(
            span(0..2),
            false,
            span(1..2),
            vec![lit(span(1..2), '-')],
        ))
    );
    assert_eq!(
        parser_ignore_whitespace("[ - a]").parse_set_class_open(),
        Ok(opened(
            span(0..4),
            false,
            span(2..3),
            vec![lit(span(2..3), '-')],
        ))
    );
    assert_eq!(
        parser("[^-a]").parse_set_class_open(),
        Ok(opened(
            span(0..3),
            true,
            span(2..3),
            vec![lit(span(2..3), '-')],
        ))
    );
    assert_eq!(
        parser("[--a]").parse_set_class_open(),
        Ok(opened(
            span(0..3),
            false,
            span(1..3),
            vec![lit(span(1..2), '-'), lit(span(2..3), '-')],
        ))
    );

    // ---- Leading `]` is taken as a literal ---------------------------

    assert_eq!(
        parser("[]a]").parse_set_class_open(),
        Ok(opened(
            span(0..2),
            false,
            span(1..2),
            vec![lit(span(1..2), ']')],
        ))
    );
    assert_eq!(
        parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
        Ok(opened(
            span(0..4),
            false,
            span(2..3),
            vec![lit(span(2..3), ']')],
        ))
    );
    assert_eq!(
        parser("[^]a]").parse_set_class_open(),
        Ok(opened(
            span(0..3),
            true,
            span(2..3),
            vec![lit(span(2..3), ']')],
        ))
    );
    // After a leading `-`, a following `]` is not consumed by the opener:
    // only the `-` ends up in the union.
    assert_eq!(
        parser("[-]a]").parse_set_class_open(),
        Ok(opened(
            span(0..2),
            false,
            span(1..2),
            vec![lit(span(1..2), '-')],
        ))
    );

    // ---- Input that ends while the class is being opened -------------

    assert_eq!(
        parser("[").parse_set_class_open().unwrap_err(),
        unclosed(0..1)
    );
    // `[` followed by four blanks: five bytes in total, matching the 0..5
    // error span.
    assert_eq!(
        parser_ignore_whitespace("[    ")
            .parse_set_class_open()
            .unwrap_err(),
        unclosed(0..5)
    );
    assert_eq!(
        parser("[^").parse_set_class_open().unwrap_err(),
        unclosed(0..2)
    );
    assert_eq!(
        parser("[]").parse_set_class_open().unwrap_err(),
        unclosed(0..2)
    );
    assert_eq!(
        parser("[-").parse_set_class_open().unwrap_err(),
        unclosed(0..0)
    );
    assert_eq!(
        parser("[--").parse_set_class_open().unwrap_err(),
        unclosed(0..0)
    );

    // See: https://github.com/rust-lang/regex/issues/792
    assert_eq!(
        parser("(?x)[-#]").parse_with_comments().unwrap_err(),
        unclosed(4..4)
    );
}
5963 | |
#[test]
fn maybe_parse_ascii_class() {
    // (pattern, end of the class span, negated). Input after the closing
    // `:]` is not part of the class.
    let recognized = [
        (r"[:alnum:]", 9, false),
        (r"[:alnum:]A", 9, false),
        (r"[:^alnum:]", 10, true),
    ];
    for (pattern, end, negated) in recognized {
        assert_eq!(
            parser(pattern).maybe_parse_ascii_class(),
            Some(ast::ClassAscii {
                span: span(0..end),
                kind: ast::ClassAsciiKind::Alnum,
                negated,
            })
        );
    }

    // Anything that is not a complete, known ASCII class yields `None` and
    // must leave the parser at offset 0.
    let rejected = [
        r"[:",
        r"[:^",
        r"[^:alnum:]",
        r"[:alnnum:]",
        r"[:alnum]",
        r"[:alnum:",
    ];
    for pattern in rejected {
        let p = parser(pattern);
        assert_eq!(p.maybe_parse_ascii_class(), None);
        assert_eq!(p.offset(), 0);
    }
}
6015 | |
// Exercises parsing of `\p`/`\P` Unicode class escapes: the one-letter,
// named and name/value forms, truncated input, classes followed by a
// literal, and an escaped brace after `\p`/`\P`.
#[test]
fn parse_unicode_class() {
    // Builds the expected primitive for a class whose span starts at 0.
    let class = |end: usize, negated: bool, kind: ast::ClassUnicodeKind| {
        Primitive::Unicode(ast::ClassUnicode {
            span: span(0..end),
            negated,
            kind,
        })
    };
    let named = |name: &str| ast::ClassUnicodeKind::Named(s(name));
    let pair = |op: ast::ClassUnicodeOpKind, name: &str, value: &str| {
        ast::ClassUnicodeKind::NamedValue {
            op,
            name: s(name),
            value: s(value),
        }
    };

    // Escapes that must parse, paired with the primitive they produce.
    let accepted = vec![
        (r"\pN", class(3, false, ast::ClassUnicodeKind::OneLetter('N'))),
        (r"\PN", class(3, true, ast::ClassUnicodeKind::OneLetter('N'))),
        (r"\p{N}", class(5, false, named("N"))),
        (r"\P{N}", class(5, true, named("N"))),
        (r"\p{Greek}", class(9, false, named("Greek"))),
        (
            r"\p{scx:Katakana}",
            class(
                16,
                false,
                pair(ast::ClassUnicodeOpKind::Colon, "scx", "Katakana"),
            ),
        ),
        (
            r"\p{scx=Katakana}",
            class(
                16,
                false,
                pair(ast::ClassUnicodeOpKind::Equal, "scx", "Katakana"),
            ),
        ),
        (
            r"\p{scx!=Katakana}",
            class(
                17,
                false,
                pair(ast::ClassUnicodeOpKind::NotEqual, "scx", "Katakana"),
            ),
        ),
        // An operator with an empty name and value is still accepted by
        // the parser.
        (
            r"\p{:}",
            class(5, false, pair(ast::ClassUnicodeOpKind::Colon, "", "")),
        ),
        (
            r"\p{=}",
            class(5, false, pair(ast::ClassUnicodeOpKind::Equal, "", "")),
        ),
        (
            r"\p{!=}",
            class(5 + 1, false, pair(ast::ClassUnicodeOpKind::NotEqual, "", "")),
        ),
    ];
    for (pattern, expected) in accepted {
        assert_eq!(parser(pattern).parse_escape(), Ok(expected));
    }

    // Truncated escapes: the error is an empty span at the given offset.
    let truncated: &[(&str, usize)] =
        &[(r"\p", 2), (r"\p{", 3), (r"\p{N", 4), (r"\p{Greek", 8)];
    for &(pattern, at) in truncated {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::EscapeUnexpectedEof,
            }
        );
    }

    // A class immediately followed by the literal `z` yields a two-element
    // concatenation. Each entry is (pattern, end of class span, class kind).
    let followed_by_z = vec![
        (r"\pNz", 3, ast::ClassUnicodeKind::OneLetter('N')),
        (r"\p{Greek}z", 9, named("Greek")),
    ];
    for (pattern, class_end, kind) in followed_by_z {
        let total = class_end + 1;
        let pieces = vec![
            Ast::class_unicode(ast::ClassUnicode {
                span: span(0..class_end),
                negated: false,
                kind,
            }),
            Ast::literal(ast::Literal {
                span: span(class_end..total),
                kind: ast::LiteralKind::Verbatim,
                c: 'z',
            }),
        ];
        assert_eq!(
            parser(pattern).parse(),
            Ok(Ast::concat(ast::Concat {
                span: span(0..total),
                asts: pieces,
            }))
        );
    }

    // A backslash directly after `\p` or `\P` is not a valid class name.
    for &pattern in &[r"\p\{", r"\P\{"] {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(2..3),
                kind: ast::ErrorKind::UnicodeClassInvalid,
            }
        );
    }
}
6213 | |
// Exercises parsing of the Perl class escapes `\d`, `\s`, `\w` and their
// negated upper-case forms, both as bare escapes and as full patterns.
#[test]
fn parse_perl_class() {
    // Every Perl class in this test spans the two bytes of its escape.
    let perl = |kind: ast::ClassPerlKind, negated: bool| ast::ClassPerl {
        span: span(0..2),
        kind,
        negated,
    };

    // Each entry is (escape, expected kind, expected negation flag).
    let escapes = vec![
        (r"\d", ast::ClassPerlKind::Digit, false),
        (r"\D", ast::ClassPerlKind::Digit, true),
        (r"\s", ast::ClassPerlKind::Space, false),
        (r"\S", ast::ClassPerlKind::Space, true),
        (r"\w", ast::ClassPerlKind::Word, false),
        (r"\W", ast::ClassPerlKind::Word, true),
    ];
    for (pattern, kind, negated) in escapes {
        assert_eq!(
            parser(pattern).parse_escape(),
            Ok(Primitive::Perl(perl(kind, negated)))
        );
    }

    // A lone class parses to a class node, not to a concatenation.
    let lone_digit = Ast::class_perl(perl(ast::ClassPerlKind::Digit, false));
    assert_eq!(parser(r"\d").parse(), Ok(lone_digit));

    // A class followed by a literal parses to a two-element concatenation.
    let digit_then_z = vec![
        Ast::class_perl(perl(ast::ClassPerlKind::Digit, false)),
        Ast::literal(ast::Literal {
            span: span(2..3),
            kind: ast::LiteralKind::Verbatim,
            c: 'z',
        }),
    ];
    assert_eq!(
        parser(r"\dz").parse(),
        Ok(Ast::concat(ast::Concat {
            span: span(0..3),
            asts: digit_then_z,
        }))
    );
}
6292 | |
// Regression test for a bug in which the nest limit checker did not
// decrement its depth during post-traversal, so that long regexes tripped
// the default limit too aggressively. The pattern below is long but only
// shallowly nested, so it must parse under a nest limit of 50.
#[test]
fn regression_454_nest_too_big() {
    let limit = 50;
    let long_but_shallow = r#"
2(?:
[45]\d{3}|
7(?:
1[0-267]|
2[0-289]|
3[0-29]|
4[01]|
5[1-3]|
6[013]|
7[0178]|
91
)|
8(?:
0[125]|
[139][1-6]|
2[0157-9]|
41|
6[1-35]|
7[1-5]|
8[1-8]|
90
)|
9(?:
0[0-2]|
1[0-4]|
2[568]|
3[3-6]|
5[5-7]|
6[0167]|
7[15]|
8[0146-9]
)
)\d{4}
"#;
    let outcome = parser_nest_limit(long_but_shallow, limit).parse();
    assert!(outcome.is_ok());
}
6335 | |
// Regression test: a trailing `-` in a character class is a literal `-`,
// even when whitespace mode is enabled and the trailing `-` is followed by
// whitespace.
#[test]
fn regression_455_trailing_dash_ignore_whitespace() {
    // Classes that are closed after the trailing `-`: all must parse.
    let accepted: &[&str] = &[
        "(?x)[ / - ]",
        "(?x)[ a - ]",
        "(?x)[
a
- ]
",
        "(?x)[
a # wat
- ]
",
    ];
    for &pattern in accepted {
        assert!(parser(pattern).parse().is_ok());
    }

    // Classes that are never closed: all must fail to parse.
    let rejected: &[&str] = &[
        "(?x)[ / -",
        "(?x)[ / - ",
        "(?x)[
/ -
",
        "(?x)[
/ - # wat
",
    ];
    for &pattern in rejected {
        assert!(parser(pattern).parse().is_err());
    }
}
6377 | } |
6378 | |