1/*!
2Defines an abstract syntax for regular expressions.
3*/
4
5use std::cmp::Ordering;
6use std::error;
7use std::fmt;
8
9pub use crate::ast::visitor::{visit, Visitor};
10
11pub mod parse;
12pub mod print;
13mod visitor;
14
/// An error that occurred while parsing a regular expression into an abstract
/// syntax tree.
///
/// Note that not all ASTs represent a valid regular expression. For example,
/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
/// valid Unicode property name. That particular error is reported when
/// translating an AST to the high-level intermediate representation (`HIR`).
///
/// The fields are private; use the `kind`, `pattern`, `span` and
/// `auxiliary_span` accessors to inspect an error.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Error {
    /// The kind of error.
    kind: ErrorKind,
    /// The original pattern that the parser generated the error from. Every
    /// span in an error is a valid range into this string.
    pattern: String,
    /// The span of this error.
    span: Span,
}
32
33impl Error {
34 /// Return the type of this error.
35 pub fn kind(&self) -> &ErrorKind {
36 &self.kind
37 }
38
39 /// The original pattern string in which this error occurred.
40 ///
41 /// Every span reported by this error is reported in terms of this string.
42 pub fn pattern(&self) -> &str {
43 &self.pattern
44 }
45
46 /// Return the span at which this error occurred.
47 pub fn span(&self) -> &Span {
48 &self.span
49 }
50
51 /// Return an auxiliary span. This span exists only for some errors that
52 /// benefit from being able to point to two locations in the original
53 /// regular expression. For example, "duplicate" errors will have the
54 /// main error position set to the duplicate occurrence while its
55 /// auxiliary span will be set to the initial occurrence.
56 pub fn auxiliary_span(&self) -> Option<&Span> {
57 use self::ErrorKind::*;
58 match self.kind {
59 FlagDuplicate { ref original } => Some(original),
60 FlagRepeatedNegation { ref original, .. } => Some(original),
61 GroupNameDuplicate { ref original, .. } => Some(original),
62 _ => None,
63 }
64 }
65}
66
/// The type of an error that occurred while building an AST.
///
/// Variants that carry an `original` span describe "duplicate" style errors:
/// the error's own span points at the offending occurrence, while `original`
/// points at the earlier one.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ErrorKind {
    /// The capturing group limit was exceeded.
    ///
    /// Note that this represents a limit on the total number of capturing
    /// groups in a regex and not necessarily the number of nested capturing
    /// groups. That is, the nest limit can be low and it is still possible for
    /// this error to occur.
    CaptureLimitExceeded,
    /// An invalid escape sequence was found in a character class set.
    ClassEscapeInvalid,
    /// An invalid character class range was found. An invalid range is any
    /// range where the start is greater than the end.
    ClassRangeInvalid,
    /// An invalid range boundary was found in a character class. Range
    /// boundaries must be a single literal codepoint, but this error indicates
    /// that something else was found, such as a nested class.
    ClassRangeLiteral,
    /// An opening `[` was found with no corresponding closing `]`.
    ClassUnclosed,
    /// An empty decimal number was given where one was expected.
    ///
    /// Note that this error variant is no longer used. Namely, a decimal
    /// number can only appear as a repetition quantifier. When the number
    /// in a repetition quantifier is empty, then it gets its own specialized
    /// error, `RepetitionCountDecimalEmpty`.
    DecimalEmpty,
    /// An invalid decimal number was given where one was expected.
    DecimalInvalid,
    /// A bracketed hex literal was empty.
    EscapeHexEmpty,
    /// A bracketed hex literal did not correspond to a Unicode scalar value.
    EscapeHexInvalid,
    /// An invalid hexadecimal digit was found.
    EscapeHexInvalidDigit,
    /// EOF was found before an escape sequence was completed.
    EscapeUnexpectedEof,
    /// An unrecognized escape sequence.
    EscapeUnrecognized,
    /// A dangling negation was used when setting flags, e.g., `i-`.
    FlagDanglingNegation,
    /// A flag was used twice, e.g., `i-i`.
    FlagDuplicate {
        /// The position of the original flag. The error position
        /// points to the duplicate flag.
        original: Span,
    },
    /// The negation operator was used twice, e.g., `-i-s`.
    FlagRepeatedNegation {
        /// The position of the original negation operator. The error position
        /// points to the duplicate negation operator.
        original: Span,
    },
    /// Expected a flag but got EOF, e.g., `(?`.
    FlagUnexpectedEof,
    /// Unrecognized flag, e.g., `a`.
    FlagUnrecognized,
    /// A duplicate capture name was found.
    GroupNameDuplicate {
        /// The position of the initial occurrence of the capture name. The
        /// error position itself points to the duplicate occurrence.
        original: Span,
    },
    /// A capture group name is empty, e.g., `(?P<>abc)`.
    GroupNameEmpty,
    /// An invalid character was seen for a capture group name. This includes
    /// errors where the first character is a digit (even though subsequent
    /// characters are allowed to be digits).
    GroupNameInvalid,
    /// A closing `>` could not be found for a capture group name.
    GroupNameUnexpectedEof,
    /// An unclosed group, e.g., `(ab`.
    ///
    /// The span of this error corresponds to the unclosed parenthesis.
    GroupUnclosed,
    /// An unopened group, e.g., `ab)`.
    GroupUnopened,
    /// The nest limit was exceeded. The limit stored here is the limit
    /// configured in the parser.
    NestLimitExceeded(u32),
    /// The range provided in a counted repetition operator is invalid. The
    /// range is invalid if the start is greater than the end.
    RepetitionCountInvalid,
    /// An opening `{` was not followed by a valid decimal value.
    /// For example, `x{}` or `x{]}` would fail.
    RepetitionCountDecimalEmpty,
    /// An opening `{` was found with no corresponding closing `}`.
    RepetitionCountUnclosed,
    /// A repetition operator was applied to a missing sub-expression. This
    /// occurs, for example, in the regex consisting of just a `*` or even
    /// `(?i)*`. It is, however, possible to create a repetition operating on
    /// an empty sub-expression. For example, `()*` is still considered valid.
    RepetitionMissing,
    /// The Unicode class is not valid. This typically occurs when a `\p` is
    /// followed by something other than a `{`.
    UnicodeClassInvalid,
    /// When octal support is disabled, this error is produced when an octal
    /// escape is used. The octal escape is assumed to be an invocation of
    /// a backreference, which is the common case.
    UnsupportedBackreference,
    /// When syntax similar to PCRE's look-around is used, this error is
    /// returned. Some example syntaxes that are rejected include, but are
    /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
    /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
    /// error is used to improve the user experience.
    UnsupportedLookAround,
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
}
180
181impl error::Error for Error {
182 // TODO: Remove this method entirely on the next breaking semver release.
183 #[allow(deprecated)]
184 fn description(&self) -> &str {
185 use self::ErrorKind::*;
186 match self.kind {
187 CaptureLimitExceeded => "capture group limit exceeded",
188 ClassEscapeInvalid => "invalid escape sequence in character class",
189 ClassRangeInvalid => "invalid character class range",
190 ClassRangeLiteral => "invalid range boundary, must be a literal",
191 ClassUnclosed => "unclosed character class",
192 DecimalEmpty => "empty decimal literal",
193 DecimalInvalid => "invalid decimal literal",
194 EscapeHexEmpty => "empty hexadecimal literal",
195 EscapeHexInvalid => "invalid hexadecimal literal",
196 EscapeHexInvalidDigit => "invalid hexadecimal digit",
197 EscapeUnexpectedEof => "unexpected eof (escape sequence)",
198 EscapeUnrecognized => "unrecognized escape sequence",
199 FlagDanglingNegation => "dangling flag negation operator",
200 FlagDuplicate { .. } => "duplicate flag",
201 FlagRepeatedNegation { .. } => "repeated negation",
202 FlagUnexpectedEof => "unexpected eof (flag)",
203 FlagUnrecognized => "unrecognized flag",
204 GroupNameDuplicate { .. } => "duplicate capture group name",
205 GroupNameEmpty => "empty capture group name",
206 GroupNameInvalid => "invalid capture group name",
207 GroupNameUnexpectedEof => "unclosed capture group name",
208 GroupUnclosed => "unclosed group",
209 GroupUnopened => "unopened group",
210 NestLimitExceeded(_) => "nest limit exceeded",
211 RepetitionCountInvalid => "invalid repetition count range",
212 RepetitionCountUnclosed => "unclosed counted repetition",
213 RepetitionMissing => "repetition operator missing expression",
214 UnicodeClassInvalid => "invalid Unicode character class",
215 UnsupportedBackreference => "backreferences are not supported",
216 UnsupportedLookAround => "look-around is not supported",
217 _ => unreachable!(),
218 }
219 }
220}
221
222impl fmt::Display for Error {
223 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224 crate::error::Formatter::from(self).fmt(f)
225 }
226}
227
228impl fmt::Display for ErrorKind {
229 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
230 use self::ErrorKind::*;
231 match *self {
232 CaptureLimitExceeded => write!(
233 f,
234 "exceeded the maximum number of \
235 capturing groups ({})",
236 ::std::u32::MAX
237 ),
238 ClassEscapeInvalid => {
239 write!(f, "invalid escape sequence found in character class")
240 }
241 ClassRangeInvalid => write!(
242 f,
243 "invalid character class range, \
244 the start must be <= the end"
245 ),
246 ClassRangeLiteral => {
247 write!(f, "invalid range boundary, must be a literal")
248 }
249 ClassUnclosed => write!(f, "unclosed character class"),
250 DecimalEmpty => write!(f, "decimal literal empty"),
251 DecimalInvalid => write!(f, "decimal literal invalid"),
252 EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
253 EscapeHexInvalid => {
254 write!(f, "hexadecimal literal is not a Unicode scalar value")
255 }
256 EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
257 EscapeUnexpectedEof => write!(
258 f,
259 "incomplete escape sequence, \
260 reached end of pattern prematurely"
261 ),
262 EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
263 FlagDanglingNegation => {
264 write!(f, "dangling flag negation operator")
265 }
266 FlagDuplicate { .. } => write!(f, "duplicate flag"),
267 FlagRepeatedNegation { .. } => {
268 write!(f, "flag negation operator repeated")
269 }
270 FlagUnexpectedEof => {
271 write!(f, "expected flag but got end of regex")
272 }
273 FlagUnrecognized => write!(f, "unrecognized flag"),
274 GroupNameDuplicate { .. } => {
275 write!(f, "duplicate capture group name")
276 }
277 GroupNameEmpty => write!(f, "empty capture group name"),
278 GroupNameInvalid => write!(f, "invalid capture group character"),
279 GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
280 GroupUnclosed => write!(f, "unclosed group"),
281 GroupUnopened => write!(f, "unopened group"),
282 NestLimitExceeded(limit) => write!(
283 f,
284 "exceed the maximum number of \
285 nested parentheses/brackets ({})",
286 limit
287 ),
288 RepetitionCountInvalid => write!(
289 f,
290 "invalid repetition count range, \
291 the start must be <= the end"
292 ),
293 RepetitionCountDecimalEmpty => {
294 write!(f, "repetition quantifier expects a valid decimal")
295 }
296 RepetitionCountUnclosed => {
297 write!(f, "unclosed counted repetition")
298 }
299 RepetitionMissing => {
300 write!(f, "repetition operator missing expression")
301 }
302 UnicodeClassInvalid => {
303 write!(f, "invalid Unicode character class")
304 }
305 UnsupportedBackreference => {
306 write!(f, "backreferences are not supported")
307 }
308 UnsupportedLookAround => write!(
309 f,
310 "look-around, including look-ahead and look-behind, \
311 is not supported"
312 ),
313 _ => unreachable!(),
314 }
315 }
316}
317
/// The position information of a single AST item.
///
/// Both endpoints are absolute byte offsets that can be used on the original
/// regular expression that was parsed.
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct Span {
    /// The start byte offset.
    pub start: Position,
    /// The end byte offset.
    pub end: Position,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Span { start, end } = *self;
        write!(f, "Span({:?}, {:?})", start, end)
    }
}

impl Ord for Span {
    /// Orders spans by their start position, using the end position to
    /// break ties.
    fn cmp(&self, other: &Span) -> Ordering {
        self.start
            .cmp(&other.start)
            .then_with(|| self.end.cmp(&other.end))
    }
}

impl PartialOrd for Span {
    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

/// A single position in a regular expression.
///
/// A position encodes one half of a span, and includes the byte offset, line
/// number and column number.
#[derive(Clone, Copy, Eq, PartialEq)]
pub struct Position {
    /// The absolute offset of this position, starting at `0` from the
    /// beginning of the regular expression pattern string.
    pub offset: usize,
    /// The line number, starting at `1`.
    pub line: usize,
    /// The approximate column number, starting at `1`.
    pub column: usize,
}

impl fmt::Debug for Position {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Position { offset, line, column } = *self;
        write!(f, "Position(o: {:?}, l: {:?}, c: {:?})", offset, line, column)
    }
}

impl Ord for Position {
    /// Orders positions by byte offset only; the line and column are not
    /// consulted.
    fn cmp(&self, other: &Position) -> Ordering {
        Ord::cmp(&self.offset, &other.offset)
    }
}

impl PartialOrd for Position {
    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Span {
    /// Builds a span from the given start and end positions.
    pub fn new(start: Position, end: Position) -> Span {
        Span { start, end }
    }

    /// Builds a span whose start and end are both the given position.
    pub fn splat(pos: Position) -> Span {
        Span { start: pos, end: pos }
    }

    /// Returns a copy of this span with its start position replaced by the
    /// one given.
    pub fn with_start(self, pos: Position) -> Span {
        Span { start: pos, end: self.end }
    }

    /// Returns a copy of this span with its end position replaced by the one
    /// given.
    pub fn with_end(self, pos: Position) -> Span {
        Span { start: self.start, end: pos }
    }

    /// Returns true if and only if this span starts and ends on the same
    /// line.
    pub fn is_one_line(&self) -> bool {
        self.end.line == self.start.line
    }

    /// Returns true if and only if this span is empty. That is, it points to
    /// a single position in the concrete syntax of a regular expression.
    pub fn is_empty(&self) -> bool {
        self.end.offset == self.start.offset
    }
}

impl Position {
    /// Builds a position from its three components.
    ///
    /// `offset` is the absolute offset of the position, starting at `0` from
    /// the beginning of the regular expression pattern string.
    ///
    /// `line` is the line number, starting at `1`.
    ///
    /// `column` is the approximate column number, starting at `1`.
    pub fn new(offset: usize, line: usize, column: usize) -> Position {
        Position { offset, line, column }
    }
}
433
/// An abstract syntax tree for a single expression, paired with the comments
/// found while parsing it.
///
/// Comments are not stored in the tree itself to avoid complexity. Each
/// comment contains a span of precisely where it occurred in the original
/// regular expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct WithComments {
    /// The actual ast.
    pub ast: Ast,
    /// All comments found in the original regular expression.
    pub comments: Vec<Comment>,
}
447
/// A comment from a regular expression with an associated span.
///
/// A regular expression can only contain comments when the `x` flag is
/// enabled.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Comment {
    /// The span of this comment, including the beginning `#` and ending `\n`.
    pub span: Span,
    /// The comment text, starting with the first character following the `#`
    /// and ending with the last character preceding the `\n`. Neither
    /// delimiter is part of the text.
    pub comment: String,
}
460
461/// An abstract syntax tree for a single regular expression.
462///
463/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
464/// space proportional to the size of the `Ast`.
465///
466/// This type defines its own destructor that uses constant stack space and
467/// heap space proportional to the size of the `Ast`.
468#[derive(Clone, Debug, Eq, PartialEq)]
469pub enum Ast {
470 /// An empty regex that matches everything.
471 Empty(Span),
472 /// A set of flags, e.g., `(?is)`.
473 Flags(SetFlags),
474 /// A single character literal, which includes escape sequences.
475 Literal(Literal),
476 /// The "any character" class.
477 Dot(Span),
478 /// A single zero-width assertion.
479 Assertion(Assertion),
480 /// A single character class. This includes all forms of character classes
481 /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
482 Class(Class),
483 /// A repetition operator applied to an arbitrary regular expression.
484 Repetition(Repetition),
485 /// A grouped regular expression.
486 Group(Group),
487 /// An alternation of regular expressions.
488 Alternation(Alternation),
489 /// A concatenation of regular expressions.
490 Concat(Concat),
491}
492
493impl Ast {
494 /// Return the span of this abstract syntax tree.
495 pub fn span(&self) -> &Span {
496 match *self {
497 Ast::Empty(ref span) => span,
498 Ast::Flags(ref x) => &x.span,
499 Ast::Literal(ref x) => &x.span,
500 Ast::Dot(ref span) => span,
501 Ast::Assertion(ref x) => &x.span,
502 Ast::Class(ref x) => x.span(),
503 Ast::Repetition(ref x) => &x.span,
504 Ast::Group(ref x) => &x.span,
505 Ast::Alternation(ref x) => &x.span,
506 Ast::Concat(ref x) => &x.span,
507 }
508 }
509
510 /// Return true if and only if this Ast is empty.
511 pub fn is_empty(&self) -> bool {
512 match *self {
513 Ast::Empty(_) => true,
514 _ => false,
515 }
516 }
517
518 /// Returns true if and only if this AST has any (including possibly empty)
519 /// subexpressions.
520 fn has_subexprs(&self) -> bool {
521 match *self {
522 Ast::Empty(_)
523 | Ast::Flags(_)
524 | Ast::Literal(_)
525 | Ast::Dot(_)
526 | Ast::Assertion(_) => false,
527 Ast::Class(_)
528 | Ast::Repetition(_)
529 | Ast::Group(_)
530 | Ast::Alternation(_)
531 | Ast::Concat(_) => true,
532 }
533 }
534}
535
536/// Print a display representation of this Ast.
537///
538/// This does not preserve any of the original whitespace formatting that may
539/// have originally been present in the concrete syntax from which this Ast
540/// was generated.
541///
542/// This implementation uses constant stack space and heap space proportional
543/// to the size of the `Ast`.
544impl fmt::Display for Ast {
545 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
546 use crate::ast::print::Printer;
547 Printer::new().print(self, wtr:f)
548 }
549}
550
551/// An alternation of regular expressions.
552#[derive(Clone, Debug, Eq, PartialEq)]
553pub struct Alternation {
554 /// The span of this alternation.
555 pub span: Span,
556 /// The alternate regular expressions.
557 pub asts: Vec<Ast>,
558}
559
560impl Alternation {
561 /// Return this alternation as an AST.
562 ///
563 /// If this alternation contains zero ASTs, then Ast::Empty is
564 /// returned. If this alternation contains exactly 1 AST, then the
565 /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
566 pub fn into_ast(mut self) -> Ast {
567 match self.asts.len() {
568 0 => Ast::Empty(self.span),
569 1 => self.asts.pop().unwrap(),
570 _ => Ast::Alternation(self),
571 }
572 }
573}
574
575/// A concatenation of regular expressions.
576#[derive(Clone, Debug, Eq, PartialEq)]
577pub struct Concat {
578 /// The span of this concatenation.
579 pub span: Span,
580 /// The concatenation regular expressions.
581 pub asts: Vec<Ast>,
582}
583
584impl Concat {
585 /// Return this concatenation as an AST.
586 ///
587 /// If this concatenation contains zero ASTs, then Ast::Empty is
588 /// returned. If this concatenation contains exactly 1 AST, then the
589 /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
590 pub fn into_ast(mut self) -> Ast {
591 match self.asts.len() {
592 0 => Ast::Empty(self.span),
593 1 => self.asts.pop().unwrap(),
594 _ => Ast::Concat(self),
595 }
596 }
597}
598
599/// A single literal expression.
600///
601/// A literal corresponds to a single Unicode scalar value. Literals may be
602/// represented in their literal form, e.g., `a` or in their escaped form,
603/// e.g., `\x61`.
604#[derive(Clone, Debug, Eq, PartialEq)]
605pub struct Literal {
606 /// The span of this literal.
607 pub span: Span,
608 /// The kind of this literal.
609 pub kind: LiteralKind,
610 /// The Unicode scalar value corresponding to this literal.
611 pub c: char,
612}
613
614impl Literal {
615 /// If this literal was written as a `\x` hex escape, then this returns
616 /// the corresponding byte value. Otherwise, this returns `None`.
617 pub fn byte(&self) -> Option<u8> {
618 let short_hex: LiteralKind = LiteralKind::HexFixed(HexLiteralKind::X);
619 if self.c as u32 <= 255 && self.kind == short_hex {
620 Some(self.c as u8)
621 } else {
622 None
623 }
624 }
625}
626
/// The kind of a single literal expression.
///
/// The kind records how the literal was spelled in the concrete syntax.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum LiteralKind {
    /// The literal is written verbatim, e.g., `a` or `☃`.
    Verbatim,
    /// The literal is written as an escape because it is punctuation, e.g.,
    /// `\*` or `\[`.
    Punctuation,
    /// The literal is written as an octal escape, e.g., `\141`.
    Octal,
    /// The literal is written as a hex code with a fixed number of digits
    /// depending on the type of the escape, e.g., `\x61` or `\u0061` or
    /// `\U00000061`.
    HexFixed(HexLiteralKind),
    /// The literal is written as a hex code with a bracketed number of
    /// digits. The only restriction is that the bracketed hex code must refer
    /// to a valid Unicode scalar value.
    HexBrace(HexLiteralKind),
    /// The literal is written as a specially recognized escape, e.g., `\f`
    /// or `\n`.
    Special(SpecialLiteralKind),
}
649
/// The type of a special literal.
///
/// A special literal is a special escape sequence recognized by the regex
/// parser, e.g., `\f` or `\n`. Each variant documents its spelling followed
/// by the equivalent `\x` escape in parentheses.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum SpecialLiteralKind {
    /// Bell, spelled `\a` (`\x07`).
    Bell,
    /// Form feed, spelled `\f` (`\x0C`).
    FormFeed,
    /// Tab, spelled `\t` (`\x09`).
    Tab,
    /// Line feed, spelled `\n` (`\x0A`).
    LineFeed,
    /// Carriage return, spelled `\r` (`\x0D`).
    CarriageReturn,
    /// Vertical tab, spelled `\v` (`\x0B`).
    VerticalTab,
    /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
    /// parsing in verbose mode.
    Space,
}
672
/// The type of a Unicode hex literal.
///
/// Note that all variants behave the same when used with brackets. They only
/// differ when used without brackets in the number of hex digits that must
/// follow.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum HexLiteralKind {
    /// A `\x` prefix. When used without brackets, this form is limited to
    /// two digits.
    X,
    /// A `\u` prefix. When used without brackets, this form is limited to
    /// four digits.
    UnicodeShort,
    /// A `\U` prefix. When used without brackets, this form is limited to
    /// eight digits.
    UnicodeLong,
}

impl HexLiteralKind {
    /// Returns how many hex digits this literal form requires when it is
    /// written without brackets. The bracketed form places no restriction on
    /// the number of digits.
    pub fn digits(&self) -> u32 {
        use self::HexLiteralKind::{UnicodeLong, UnicodeShort, X};
        match *self {
            X => 2,
            UnicodeShort => 4,
            UnicodeLong => 8,
        }
    }
}
703
704/// A single character class expression.
705#[derive(Clone, Debug, Eq, PartialEq)]
706pub enum Class {
707 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
708 Unicode(ClassUnicode),
709 /// A perl character class, e.g., `\d` or `\W`.
710 Perl(ClassPerl),
711 /// A bracketed character class set, which may contain zero or more
712 /// character ranges and/or zero or more nested classes. e.g.,
713 /// `[a-zA-Z\pL]`.
714 Bracketed(ClassBracketed),
715}
716
717impl Class {
718 /// Return the span of this character class.
719 pub fn span(&self) -> &Span {
720 match *self {
721 Class::Perl(ref x: &ClassPerl) => &x.span,
722 Class::Unicode(ref x: &ClassUnicode) => &x.span,
723 Class::Bracketed(ref x: &ClassBracketed) => &x.span,
724 }
725 }
726}
727
/// A Perl character class, e.g., `\d` or `\D`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassPerl {
    /// The span of this class.
    pub span: Span,
    /// The kind of Perl class.
    pub kind: ClassPerlKind,
    /// Whether the class is negated or not. e.g., `\d` is not negated but
    /// `\D` is.
    pub negated: bool,
}
739
/// The available Perl character classes.
///
/// Negation is not part of the kind; it is tracked by `ClassPerl::negated`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassPerlKind {
    /// Decimal numbers.
    Digit,
    /// Whitespace.
    Space,
    /// Word characters.
    Word,
}
750
/// An ASCII character class, e.g., `[:alpha:]` when written inside a
/// bracketed class as `[[:alpha:]]`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassAscii {
    /// The span of this class.
    pub span: Span,
    /// The kind of ASCII class.
    pub kind: ClassAsciiKind,
    /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
    /// but `[[:^alpha:]]` is.
    pub negated: bool,
}
762
/// The available ASCII character classes.
///
/// Each variant is documented with an equivalent bracketed class.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassAsciiKind {
    /// `[0-9A-Za-z]`
    Alnum,
    /// `[A-Za-z]`
    Alpha,
    /// `[\x00-\x7F]`
    Ascii,
    /// `[ \t]`
    Blank,
    /// `[\x00-\x1F\x7F]`
    Cntrl,
    /// `[0-9]`
    Digit,
    /// `[!-~]`
    Graph,
    /// `[a-z]`
    Lower,
    /// `[ -~]`
    Print,
    /// ``[!-/:-@\[-`{-~]``
    Punct,
    /// `[\t\n\v\f\r ]`
    Space,
    /// `[A-Z]`
    Upper,
    /// `[0-9A-Za-z_]`
    Word,
    /// `[0-9A-Fa-f]`
    Xdigit,
}

impl ClassAsciiKind {
    /// Looks up the `ClassAsciiKind` variant with the given name.
    ///
    /// The name must be the lowercase spelling of the variant name, e.g.,
    /// `cntrl` names `ClassAsciiKind::Cntrl`. Matching is exact and
    /// case-sensitive.
    ///
    /// Returns `None` when no variant has the given name.
    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
        let kind = match name {
            "alnum" => ClassAsciiKind::Alnum,
            "alpha" => ClassAsciiKind::Alpha,
            "ascii" => ClassAsciiKind::Ascii,
            "blank" => ClassAsciiKind::Blank,
            "cntrl" => ClassAsciiKind::Cntrl,
            "digit" => ClassAsciiKind::Digit,
            "graph" => ClassAsciiKind::Graph,
            "lower" => ClassAsciiKind::Lower,
            "print" => ClassAsciiKind::Print,
            "punct" => ClassAsciiKind::Punct,
            "space" => ClassAsciiKind::Space,
            "upper" => ClassAsciiKind::Upper,
            "word" => ClassAsciiKind::Word,
            "xdigit" => ClassAsciiKind::Xdigit,
            _ => return None,
        };
        Some(kind)
    }
}
825
826/// A Unicode character class.
827#[derive(Clone, Debug, Eq, PartialEq)]
828pub struct ClassUnicode {
829 /// The span of this class.
830 pub span: Span,
831 /// Whether this class is negated or not.
832 ///
833 /// Note: be careful when using this attribute. This specifically refers
834 /// to whether the class is written as `\p` or `\P`, where the latter
835 /// is `negated = true`. However, it also possible to write something like
836 /// `\P{scx!=Katakana}` which is actually equivalent to
837 /// `\p{scx=Katakana}` and is therefore not actually negated even though
838 /// `negated = true` here. To test whether this class is truly negated
839 /// or not, use the `is_negated` method.
840 pub negated: bool,
841 /// The kind of Unicode class.
842 pub kind: ClassUnicodeKind,
843}
844
845impl ClassUnicode {
846 /// Returns true if this class has been negated.
847 ///
848 /// Note that this takes the Unicode op into account, if it's present.
849 /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
850 pub fn is_negated(&self) -> bool {
851 match self.kind {
852 ClassUnicodeKind::NamedValue {
853 op: ClassUnicodeOpKind::NotEqual,
854 ..
855 } => !self.negated,
856 _ => self.negated,
857 }
858 }
859}
860
/// The available forms of Unicode character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassUnicodeKind {
    /// A one letter abbreviated class, e.g., `\pN`.
    OneLetter(char),
    /// A binary property, general category or script. The string may be
    /// empty.
    Named(String),
    /// A property name and an associated value, joined by an operator, e.g.,
    /// `\p{scx=Katakana}`.
    NamedValue {
        /// The type of Unicode op used to associate `name` with `value`.
        op: ClassUnicodeOpKind,
        /// The property name (which may be empty).
        name: String,
        /// The property value (which may be empty).
        value: String,
    },
}
879
/// The type of op used in a Unicode character class.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ClassUnicodeOpKind {
    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
    Equal,
    /// A property set to a specific value using a colon, e.g.,
    /// `\p{scx:Katakana}`.
    Colon,
    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
    NotEqual,
}

impl ClassUnicodeOpKind {
    /// Returns true for the two equality spellings (`=` and `:`) and false
    /// for `!=`.
    pub fn is_equal(&self) -> bool {
        match *self {
            ClassUnicodeOpKind::NotEqual => false,
            ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
        }
    }
}
901
/// A bracketed character class, e.g., `[a-z0-9]`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassBracketed {
    /// The span of this class.
    pub span: Span,
    /// Whether this class is negated or not. e.g., `[a]` is not negated but
    /// `[^a]` is.
    pub negated: bool,
    /// The type of this set. A set is either a normal union of things, e.g.,
    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
    pub kind: ClassSet,
}
914
915/// A character class set.
916///
917/// This type corresponds to the internal structure of a bracketed character
918/// class. That is, every bracketed character is one of two types: a union of
919/// items (literals, ranges, other bracketed classes) or a tree of binary set
920/// operations.
921#[derive(Clone, Debug, Eq, PartialEq)]
922pub enum ClassSet {
923 /// An item, which can be a single literal, range, nested character class
924 /// or a union of items.
925 Item(ClassSetItem),
926 /// A single binary operation (i.e., &&, -- or ~~).
927 BinaryOp(ClassSetBinaryOp),
928}
929
930impl ClassSet {
931 /// Build a set from a union.
932 pub fn union(ast: ClassSetUnion) -> ClassSet {
933 ClassSet::Item(ClassSetItem::Union(ast))
934 }
935
936 /// Return the span of this character class set.
937 pub fn span(&self) -> &Span {
938 match *self {
939 ClassSet::Item(ref x: &ClassSetItem) => x.span(),
940 ClassSet::BinaryOp(ref x: &ClassSetBinaryOp) => &x.span,
941 }
942 }
943
944 /// Return true if and only if this class set is empty.
945 fn is_empty(&self) -> bool {
946 match *self {
947 ClassSet::Item(ClassSetItem::Empty(_)) => true,
948 _ => false,
949 }
950 }
951}
952
953/// A single component of a character class set.
954#[derive(Clone, Debug, Eq, PartialEq)]
955pub enum ClassSetItem {
956 /// An empty item.
957 ///
958 /// Note that a bracketed character class cannot contain a single empty
959 /// item. Empty items can appear when using one of the binary operators.
960 /// For example, `[&&]` is the intersection of two empty classes.
961 Empty(Span),
962 /// A single literal.
963 Literal(Literal),
964 /// A range between two literals.
965 Range(ClassSetRange),
966 /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
967 Ascii(ClassAscii),
968 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
969 Unicode(ClassUnicode),
970 /// A perl character class, e.g., `\d` or `\W`.
971 Perl(ClassPerl),
972 /// A bracketed character class set, which may contain zero or more
973 /// character ranges and/or zero or more nested classes. e.g.,
974 /// `[a-zA-Z\pL]`.
975 Bracketed(Box<ClassBracketed>),
976 /// A union of items.
977 Union(ClassSetUnion),
978}
979
980impl ClassSetItem {
981 /// Return the span of this character class set item.
982 pub fn span(&self) -> &Span {
983 match *self {
984 ClassSetItem::Empty(ref span: &Span) => span,
985 ClassSetItem::Literal(ref x: &Literal) => &x.span,
986 ClassSetItem::Range(ref x: &ClassSetRange) => &x.span,
987 ClassSetItem::Ascii(ref x: &ClassAscii) => &x.span,
988 ClassSetItem::Perl(ref x: &ClassPerl) => &x.span,
989 ClassSetItem::Unicode(ref x: &ClassUnicode) => &x.span,
990 ClassSetItem::Bracketed(ref x: &Box) => &x.span,
991 ClassSetItem::Union(ref x: &ClassSetUnion) => &x.span,
992 }
993 }
994}
995
996/// A single character class range in a set.
997#[derive(Clone, Debug, Eq, PartialEq)]
998pub struct ClassSetRange {
999 /// The span of this range.
1000 pub span: Span,
1001 /// The start of this range.
1002 pub start: Literal,
1003 /// The end of this range.
1004 pub end: Literal,
1005}
1006
1007impl ClassSetRange {
1008 /// Returns true if and only if this character class range is valid.
1009 ///
1010 /// The only case where a range is invalid is if its start is greater than
1011 /// its end.
1012 pub fn is_valid(&self) -> bool {
1013 self.start.c <= self.end.c
1014 }
1015}
1016
1017/// A union of items inside a character class set.
1018#[derive(Clone, Debug, Eq, PartialEq)]
1019pub struct ClassSetUnion {
1020 /// The span of the items in this operation. e.g., the `a-z0-9` in
1021 /// `[^a-z0-9]`
1022 pub span: Span,
1023 /// The sequence of items that make up this union.
1024 pub items: Vec<ClassSetItem>,
1025}
1026
1027impl ClassSetUnion {
1028 /// Push a new item in this union.
1029 ///
1030 /// The ending position of this union's span is updated to the ending
1031 /// position of the span of the item given. If the union is empty, then
1032 /// the starting position of this union is set to the starting position
1033 /// of this item.
1034 ///
1035 /// In other words, if you only use this method to add items to a union
1036 /// and you set the spans on each item correctly, then you should never
1037 /// need to adjust the span of the union directly.
1038 pub fn push(&mut self, item: ClassSetItem) {
1039 if self.items.is_empty() {
1040 self.span.start = item.span().start;
1041 }
1042 self.span.end = item.span().end;
1043 self.items.push(item);
1044 }
1045
1046 /// Return this union as a character class set item.
1047 ///
1048 /// If this union contains zero items, then an empty union is
1049 /// returned. If this concatenation contains exactly 1 item, then the
1050 /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1051 /// returned.
1052 pub fn into_item(mut self) -> ClassSetItem {
1053 match self.items.len() {
1054 0 => ClassSetItem::Empty(self.span),
1055 1 => self.items.pop().unwrap(),
1056 _ => ClassSetItem::Union(self),
1057 }
1058 }
1059}
1060
1061/// A Unicode character class set operation.
1062#[derive(Clone, Debug, Eq, PartialEq)]
1063pub struct ClassSetBinaryOp {
1064 /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1065 pub span: Span,
1066 /// The type of this set operation.
1067 pub kind: ClassSetBinaryOpKind,
1068 /// The left hand side of the operation.
1069 pub lhs: Box<ClassSet>,
1070 /// The right hand side of the operation.
1071 pub rhs: Box<ClassSet>,
1072}
1073
/// The kind of a Unicode character class set operation.
///
/// Union is deliberately absent here because there is no explicit union
/// operator: writing items next to each other inside a character class is
/// what expresses a union.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ClassSetBinaryOpKind {
    /// Set intersection, e.g., `\pN&&[a-z]`.
    Intersection,
    /// Set difference, e.g., `\pN--[0-9]`.
    Difference,
    /// Symmetric set difference, i.e., the elements that belong to exactly
    /// one of the two sets, e.g., `[\pL~~[:ascii:]]`.
    SymmetricDifference,
}
1090
1091/// A single zero-width assertion.
1092#[derive(Clone, Debug, Eq, PartialEq)]
1093pub struct Assertion {
1094 /// The span of this assertion.
1095 pub span: Span,
1096 /// The assertion kind, e.g., `\b` or `^`.
1097 pub kind: AssertionKind,
1098}
1099
/// The kind of a zero-width assertion.
///
/// Each variant is documented with the syntax that denotes it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum AssertionKind {
    /// `^`
    StartLine,
    /// `$`
    EndLine,
    /// `\A`
    StartText,
    /// `\z`
    EndText,
    /// `\b`
    WordBoundary,
    /// `\B`
    NotWordBoundary,
}
1116
1117/// A repetition operation applied to a regular expression.
1118#[derive(Clone, Debug, Eq, PartialEq)]
1119pub struct Repetition {
1120 /// The span of this operation.
1121 pub span: Span,
1122 /// The actual operation.
1123 pub op: RepetitionOp,
1124 /// Whether this operation was applied greedily or not.
1125 pub greedy: bool,
1126 /// The regular expression under repetition.
1127 pub ast: Box<Ast>,
1128}
1129
1130/// The repetition operator itself.
1131#[derive(Clone, Debug, Eq, PartialEq)]
1132pub struct RepetitionOp {
1133 /// The span of this operator. This includes things like `+`, `*?` and
1134 /// `{m,n}`.
1135 pub span: Span,
1136 /// The type of operation.
1137 pub kind: RepetitionKind,
1138}
1139
1140/// The kind of a repetition operator.
1141#[derive(Clone, Debug, Eq, PartialEq)]
1142pub enum RepetitionKind {
1143 /// `?`
1144 ZeroOrOne,
1145 /// `*`
1146 ZeroOrMore,
1147 /// `+`
1148 OneOrMore,
1149 /// `{m,n}`
1150 Range(RepetitionRange),
1151}
1152
/// A counted (range) repetition operator.
///
/// Each variant is documented with the syntax that denotes it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionRange {
    /// `{m}`
    Exactly(u32),
    /// `{m,}`
    AtLeast(u32),
    /// `{m,n}`
    Bounded(u32, u32),
}
1163
1164impl RepetitionRange {
1165 /// Returns true if and only if this repetition range is valid.
1166 ///
1167 /// The only case where a repetition range is invalid is if it is bounded
1168 /// and its start is greater than its end.
1169 pub fn is_valid(&self) -> bool {
1170 match *self {
1171 RepetitionRange::Bounded(s: u32, e: u32) if s > e => false,
1172 _ => true,
1173 }
1174 }
1175}
1176
1177/// A grouped regular expression.
1178///
1179/// This includes both capturing and non-capturing groups. This does **not**
1180/// include flag-only groups like `(?is)`, but does contain any group that
1181/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1182/// `(?is:a)`.
1183#[derive(Clone, Debug, Eq, PartialEq)]
1184pub struct Group {
1185 /// The span of this group.
1186 pub span: Span,
1187 /// The kind of this group.
1188 pub kind: GroupKind,
1189 /// The regular expression in this group.
1190 pub ast: Box<Ast>,
1191}
1192
1193impl Group {
1194 /// If this group is non-capturing, then this returns the (possibly empty)
1195 /// set of flags. Otherwise, `None` is returned.
1196 pub fn flags(&self) -> Option<&Flags> {
1197 match self.kind {
1198 GroupKind::NonCapturing(ref flags) => Some(flags),
1199 _ => None,
1200 }
1201 }
1202
1203 /// Returns true if and only if this group is capturing.
1204 pub fn is_capturing(&self) -> bool {
1205 match self.kind {
1206 GroupKind::CaptureIndex(_) | GroupKind::CaptureName(_) => true,
1207 GroupKind::NonCapturing(_) => false,
1208 }
1209 }
1210
1211 /// Returns the capture index of this group, if this is a capturing group.
1212 ///
1213 /// This returns a capture index precisely when `is_capturing` is `true`.
1214 pub fn capture_index(&self) -> Option<u32> {
1215 match self.kind {
1216 GroupKind::CaptureIndex(i) => Some(i),
1217 GroupKind::CaptureName(ref x) => Some(x.index),
1218 GroupKind::NonCapturing(_) => None,
1219 }
1220 }
1221}
1222
1223/// The kind of a group.
1224#[derive(Clone, Debug, Eq, PartialEq)]
1225pub enum GroupKind {
1226 /// `(a)`
1227 CaptureIndex(u32),
1228 /// `(?P<name>a)`
1229 CaptureName(CaptureName),
1230 /// `(?:a)` and `(?i:a)`
1231 NonCapturing(Flags),
1232}
1233
1234/// A capture name.
1235///
1236/// This corresponds to the name itself between the angle brackets in, e.g.,
1237/// `(?P<foo>expr)`.
1238#[derive(Clone, Debug, Eq, PartialEq)]
1239pub struct CaptureName {
1240 /// The span of this capture name.
1241 pub span: Span,
1242 /// The capture name.
1243 pub name: String,
1244 /// The capture index.
1245 pub index: u32,
1246}
1247
1248/// A group of flags that is not applied to a particular regular expression.
1249#[derive(Clone, Debug, Eq, PartialEq)]
1250pub struct SetFlags {
1251 /// The span of these flags, including the grouping parentheses.
1252 pub span: Span,
1253 /// The actual sequence of flags.
1254 pub flags: Flags,
1255}
1256
1257/// A group of flags.
1258///
1259/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1260#[derive(Clone, Debug, Eq, PartialEq)]
1261pub struct Flags {
1262 /// The span of this group of flags.
1263 pub span: Span,
1264 /// A sequence of flag items. Each item is either a flag or a negation
1265 /// operator.
1266 pub items: Vec<FlagsItem>,
1267}
1268
1269impl Flags {
1270 /// Add the given item to this sequence of flags.
1271 ///
1272 /// If the item was added successfully, then `None` is returned. If the
1273 /// given item is a duplicate, then `Some(i)` is returned, where
1274 /// `items[i].kind == item.kind`.
1275 pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1276 for (i, x) in self.items.iter().enumerate() {
1277 if x.kind == item.kind {
1278 return Some(i);
1279 }
1280 }
1281 self.items.push(item);
1282 None
1283 }
1284
1285 /// Returns the state of the given flag in this set.
1286 ///
1287 /// If the given flag is in the set but is negated, then `Some(false)` is
1288 /// returned.
1289 ///
1290 /// If the given flag is in the set and is not negated, then `Some(true)`
1291 /// is returned.
1292 ///
1293 /// Otherwise, `None` is returned.
1294 pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1295 let mut negated = false;
1296 for x in &self.items {
1297 match x.kind {
1298 FlagsItemKind::Negation => {
1299 negated = true;
1300 }
1301 FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1302 return Some(!negated);
1303 }
1304 _ => {}
1305 }
1306 }
1307 None
1308 }
1309}
1310
1311/// A single item in a group of flags.
1312#[derive(Clone, Debug, Eq, PartialEq)]
1313pub struct FlagsItem {
1314 /// The span of this item.
1315 pub span: Span,
1316 /// The kind of this item.
1317 pub kind: FlagsItemKind,
1318}
1319
1320/// The kind of an item in a group of flags.
1321#[derive(Clone, Debug, Eq, PartialEq)]
1322pub enum FlagsItemKind {
1323 /// A negation operator applied to all subsequent flags in the enclosing
1324 /// group.
1325 Negation,
1326 /// A single flag in a group.
1327 Flag(Flag),
1328}
1329
1330impl FlagsItemKind {
1331 /// Returns true if and only if this item is a negation operator.
1332 pub fn is_negation(&self) -> bool {
1333 match *self {
1334 FlagsItemKind::Negation => true,
1335 _ => false,
1336 }
1337 }
1338}
1339
/// One flag.
///
/// Each variant is documented with the letter that denotes it.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Flag {
    /// `i`
    CaseInsensitive,
    /// `m`
    MultiLine,
    /// `s`
    DotMatchesNewLine,
    /// `U`
    SwapGreed,
    /// `u`
    Unicode,
    /// `x`
    IgnoreWhitespace,
}
1356
1357/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1358/// space but heap space proportional to the depth of the `Ast`.
1359impl Drop for Ast {
1360 fn drop(&mut self) {
1361 use std::mem;
1362
1363 match *self {
1364 Ast::Empty(_)
1365 | Ast::Flags(_)
1366 | Ast::Literal(_)
1367 | Ast::Dot(_)
1368 | Ast::Assertion(_)
1369 // Classes are recursive, so they get their own Drop impl.
1370 | Ast::Class(_) => return,
1371 Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1372 Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1373 Ast::Alternation(ref x) if x.asts.is_empty() => return,
1374 Ast::Concat(ref x) if x.asts.is_empty() => return,
1375 _ => {}
1376 }
1377
1378 let empty_span = || Span::splat(Position::new(0, 0, 0));
1379 let empty_ast = || Ast::Empty(empty_span());
1380 let mut stack = vec![mem::replace(self, empty_ast())];
1381 while let Some(mut ast) = stack.pop() {
1382 match ast {
1383 Ast::Empty(_)
1384 | Ast::Flags(_)
1385 | Ast::Literal(_)
1386 | Ast::Dot(_)
1387 | Ast::Assertion(_)
1388 // Classes are recursive, so they get their own Drop impl.
1389 | Ast::Class(_) => {}
1390 Ast::Repetition(ref mut x) => {
1391 stack.push(mem::replace(&mut x.ast, empty_ast()));
1392 }
1393 Ast::Group(ref mut x) => {
1394 stack.push(mem::replace(&mut x.ast, empty_ast()));
1395 }
1396 Ast::Alternation(ref mut x) => {
1397 stack.extend(x.asts.drain(..));
1398 }
1399 Ast::Concat(ref mut x) => {
1400 stack.extend(x.asts.drain(..));
1401 }
1402 }
1403 }
1404 }
1405}
1406
1407/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1408/// stack space but heap space proportional to the depth of the `ClassSet`.
1409impl Drop for ClassSet {
1410 fn drop(&mut self) {
1411 use std::mem;
1412
1413 match *self {
1414 ClassSet::Item(ref item) => match *item {
1415 ClassSetItem::Empty(_)
1416 | ClassSetItem::Literal(_)
1417 | ClassSetItem::Range(_)
1418 | ClassSetItem::Ascii(_)
1419 | ClassSetItem::Unicode(_)
1420 | ClassSetItem::Perl(_) => return,
1421 ClassSetItem::Bracketed(ref x) => {
1422 if x.kind.is_empty() {
1423 return;
1424 }
1425 }
1426 ClassSetItem::Union(ref x) => {
1427 if x.items.is_empty() {
1428 return;
1429 }
1430 }
1431 },
1432 ClassSet::BinaryOp(ref op) => {
1433 if op.lhs.is_empty() && op.rhs.is_empty() {
1434 return;
1435 }
1436 }
1437 }
1438
1439 let empty_span = || Span::splat(Position::new(0, 0, 0));
1440 let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
1441 let mut stack = vec![mem::replace(self, empty_set())];
1442 while let Some(mut set) = stack.pop() {
1443 match set {
1444 ClassSet::Item(ref mut item) => match *item {
1445 ClassSetItem::Empty(_)
1446 | ClassSetItem::Literal(_)
1447 | ClassSetItem::Range(_)
1448 | ClassSetItem::Ascii(_)
1449 | ClassSetItem::Unicode(_)
1450 | ClassSetItem::Perl(_) => {}
1451 ClassSetItem::Bracketed(ref mut x) => {
1452 stack.push(mem::replace(&mut x.kind, empty_set()));
1453 }
1454 ClassSetItem::Union(ref mut x) => {
1455 stack.extend(x.items.drain(..).map(ClassSet::Item));
1456 }
1457 },
1458 ClassSet::BinaryOp(ref mut op) => {
1459 stack.push(mem::replace(&mut op.lhs, empty_set()));
1460 stack.push(mem::replace(&mut op.rhs, empty_set()));
1461 }
1462 }
1463 }
1464 }
1465}
1466
#[cfg(test)]
mod tests {
    use super::*;

    // Checks that the destructor for Ast copes with deeply nested
    // expressions in constant stack space by running on a thread with an
    // explicit stack size. Platforms without threads (WASM?) are avoided by
    // limiting this test to Windows/Unix.
    #[test]
    #[cfg(any(unix, windows))]
    fn no_stack_overflow_on_drop() {
        use std::thread;

        let build_and_drop = || {
            let zero_span = || Span::splat(Position::new(0, 0, 0));
            // Wrap an empty expression in 200 nested capture groups.
            let nested =
                (0..200u32).fold(Ast::Empty(zero_span()), |inner, index| {
                    Ast::Group(Group {
                        span: zero_span(),
                        kind: GroupKind::CaptureIndex(index),
                        ast: Box::new(inner),
                    })
                });
            assert!(!nested.is_empty());
        };

        // The test runs on a thread with a small stack size so that the
        // issue is easier to force.
        //
        // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
        // will just barf with very small stack sizes. So this was bumped up
        // a bit to give more room to breathe. When this was done, it was
        // confirmed that removing the custom `Drop` impl for `Ast` still
        // makes this test fail with a stack overflow. (At the time of
        // writing, the size had to go all the way up to 32K before the test
        // would pass even without the custom `Drop` impl. So 16K seems like
        // a safe number here.)
        //
        // See: https://github.com/rust-lang/regex/issues/967
        let stack_bytes = 16 * 1024;
        let handle = thread::Builder::new()
            .stack_size(stack_bytes)
            .spawn(build_and_drop)
            .unwrap();
        handle.join().unwrap();
    }
}
1514