1/*!
2Defines an abstract syntax for regular expressions.
3*/
4
5use core::cmp::Ordering;
6
7use alloc::{boxed::Box, string::String, vec, vec::Vec};
8
9pub use crate::ast::visitor::{visit, Visitor};
10
11pub mod parse;
12pub mod print;
13mod visitor;
14
15/// An error that occurred while parsing a regular expression into an abstract
16/// syntax tree.
17///
18/// Note that not all ASTs represents a valid regular expression. For example,
19/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20/// valid Unicode property name. That particular error is reported when
21/// translating an AST to the high-level intermediate representation (`HIR`).
22#[derive(Clone, Debug, Eq, PartialEq)]
23pub struct Error {
24 /// The kind of error.
25 kind: ErrorKind,
26 /// The original pattern that the parser generated the error from. Every
27 /// span in an error is a valid range into this string.
28 pattern: String,
29 /// The span of this error.
30 span: Span,
31}
32
33impl Error {
34 /// Return the type of this error.
35 pub fn kind(&self) -> &ErrorKind {
36 &self.kind
37 }
38
39 /// The original pattern string in which this error occurred.
40 ///
41 /// Every span reported by this error is reported in terms of this string.
42 pub fn pattern(&self) -> &str {
43 &self.pattern
44 }
45
46 /// Return the span at which this error occurred.
47 pub fn span(&self) -> &Span {
48 &self.span
49 }
50
51 /// Return an auxiliary span. This span exists only for some errors that
52 /// benefit from being able to point to two locations in the original
53 /// regular expression. For example, "duplicate" errors will have the
54 /// main error position set to the duplicate occurrence while its
55 /// auxiliary span will be set to the initial occurrence.
56 pub fn auxiliary_span(&self) -> Option<&Span> {
57 use self::ErrorKind::*;
58 match self.kind {
59 FlagDuplicate { ref original } => Some(original),
60 FlagRepeatedNegation { ref original, .. } => Some(original),
61 GroupNameDuplicate { ref original, .. } => Some(original),
62 _ => None,
63 }
64 }
65}
66
67/// The type of an error that occurred while building an AST.
68///
69/// This error type is marked as `non_exhaustive`. This means that adding a
70/// new variant is not considered a breaking change.
71#[non_exhaustive]
72#[derive(Clone, Debug, Eq, PartialEq)]
73pub enum ErrorKind {
74 /// The capturing group limit was exceeded.
75 ///
76 /// Note that this represents a limit on the total number of capturing
77 /// groups in a regex and not necessarily the number of nested capturing
78 /// groups. That is, the nest limit can be low and it is still possible for
79 /// this error to occur.
80 CaptureLimitExceeded,
81 /// An invalid escape sequence was found in a character class set.
82 ClassEscapeInvalid,
83 /// An invalid character class range was found. An invalid range is any
84 /// range where the start is greater than the end.
85 ClassRangeInvalid,
86 /// An invalid range boundary was found in a character class. Range
87 /// boundaries must be a single literal codepoint, but this error indicates
88 /// that something else was found, such as a nested class.
89 ClassRangeLiteral,
90 /// An opening `[` was found with no corresponding closing `]`.
91 ClassUnclosed,
92 /// Note that this error variant is no longer used. Namely, a decimal
93 /// number can only appear as a repetition quantifier. When the number
94 /// in a repetition quantifier is empty, then it gets its own specialized
95 /// error, `RepetitionCountDecimalEmpty`.
96 DecimalEmpty,
97 /// An invalid decimal number was given where one was expected.
98 DecimalInvalid,
99 /// A bracketed hex literal was empty.
100 EscapeHexEmpty,
101 /// A bracketed hex literal did not correspond to a Unicode scalar value.
102 EscapeHexInvalid,
103 /// An invalid hexadecimal digit was found.
104 EscapeHexInvalidDigit,
105 /// EOF was found before an escape sequence was completed.
106 EscapeUnexpectedEof,
107 /// An unrecognized escape sequence.
108 EscapeUnrecognized,
109 /// A dangling negation was used when setting flags, e.g., `i-`.
110 FlagDanglingNegation,
111 /// A flag was used twice, e.g., `i-i`.
112 FlagDuplicate {
113 /// The position of the original flag. The error position
114 /// points to the duplicate flag.
115 original: Span,
116 },
117 /// The negation operator was used twice, e.g., `-i-s`.
118 FlagRepeatedNegation {
119 /// The position of the original negation operator. The error position
120 /// points to the duplicate negation operator.
121 original: Span,
122 },
123 /// Expected a flag but got EOF, e.g., `(?`.
124 FlagUnexpectedEof,
125 /// Unrecognized flag, e.g., `a`.
126 FlagUnrecognized,
127 /// A duplicate capture name was found.
128 GroupNameDuplicate {
129 /// The position of the initial occurrence of the capture name. The
130 /// error position itself points to the duplicate occurrence.
131 original: Span,
132 },
133 /// A capture group name is empty, e.g., `(?P<>abc)`.
134 GroupNameEmpty,
135 /// An invalid character was seen for a capture group name. This includes
136 /// errors where the first character is a digit (even though subsequent
137 /// characters are allowed to be digits).
138 GroupNameInvalid,
139 /// A closing `>` could not be found for a capture group name.
140 GroupNameUnexpectedEof,
141 /// An unclosed group, e.g., `(ab`.
142 ///
143 /// The span of this error corresponds to the unclosed parenthesis.
144 GroupUnclosed,
145 /// An unopened group, e.g., `ab)`.
146 GroupUnopened,
147 /// The nest limit was exceeded. The limit stored here is the limit
148 /// configured in the parser.
149 NestLimitExceeded(u32),
150 /// The range provided in a counted repetition operator is invalid. The
151 /// range is invalid if the start is greater than the end.
152 RepetitionCountInvalid,
153 /// An opening `{` was not followed by a valid decimal value.
154 /// For example, `x{}` or `x{]}` would fail.
155 RepetitionCountDecimalEmpty,
156 /// An opening `{` was found with no corresponding closing `}`.
157 RepetitionCountUnclosed,
158 /// A repetition operator was applied to a missing sub-expression. This
159 /// occurs, for example, in the regex consisting of just a `*` or even
160 /// `(?i)*`. It is, however, possible to create a repetition operating on
161 /// an empty sub-expression. For example, `()*` is still considered valid.
162 RepetitionMissing,
163 /// The Unicode class is not valid. This typically occurs when a `\p` is
164 /// followed by something other than a `{`.
165 UnicodeClassInvalid,
166 /// When octal support is disabled, this error is produced when an octal
167 /// escape is used. The octal escape is assumed to be an invocation of
168 /// a backreference, which is the common case.
169 UnsupportedBackreference,
170 /// When syntax similar to PCRE's look-around is used, this error is
171 /// returned. Some example syntaxes that are rejected include, but are
172 /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
173 /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
174 /// error is used to improve the user experience.
175 UnsupportedLookAround,
176}
177
178#[cfg(feature = "std")]
179impl std::error::Error for Error {}
180
181impl core::fmt::Display for Error {
182 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
183 crate::error::Formatter::from(self).fmt(f)
184 }
185}
186
187impl core::fmt::Display for ErrorKind {
188 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
189 use self::ErrorKind::*;
190 match *self {
191 CaptureLimitExceeded => write!(
192 f,
193 "exceeded the maximum number of \
194 capturing groups ({})",
195 u32::MAX
196 ),
197 ClassEscapeInvalid => {
198 write!(f, "invalid escape sequence found in character class")
199 }
200 ClassRangeInvalid => write!(
201 f,
202 "invalid character class range, \
203 the start must be <= the end"
204 ),
205 ClassRangeLiteral => {
206 write!(f, "invalid range boundary, must be a literal")
207 }
208 ClassUnclosed => write!(f, "unclosed character class"),
209 DecimalEmpty => write!(f, "decimal literal empty"),
210 DecimalInvalid => write!(f, "decimal literal invalid"),
211 EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
212 EscapeHexInvalid => {
213 write!(f, "hexadecimal literal is not a Unicode scalar value")
214 }
215 EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
216 EscapeUnexpectedEof => write!(
217 f,
218 "incomplete escape sequence, \
219 reached end of pattern prematurely"
220 ),
221 EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
222 FlagDanglingNegation => {
223 write!(f, "dangling flag negation operator")
224 }
225 FlagDuplicate { .. } => write!(f, "duplicate flag"),
226 FlagRepeatedNegation { .. } => {
227 write!(f, "flag negation operator repeated")
228 }
229 FlagUnexpectedEof => {
230 write!(f, "expected flag but got end of regex")
231 }
232 FlagUnrecognized => write!(f, "unrecognized flag"),
233 GroupNameDuplicate { .. } => {
234 write!(f, "duplicate capture group name")
235 }
236 GroupNameEmpty => write!(f, "empty capture group name"),
237 GroupNameInvalid => write!(f, "invalid capture group character"),
238 GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
239 GroupUnclosed => write!(f, "unclosed group"),
240 GroupUnopened => write!(f, "unopened group"),
241 NestLimitExceeded(limit) => write!(
242 f,
243 "exceed the maximum number of \
244 nested parentheses/brackets ({})",
245 limit
246 ),
247 RepetitionCountInvalid => write!(
248 f,
249 "invalid repetition count range, \
250 the start must be <= the end"
251 ),
252 RepetitionCountDecimalEmpty => {
253 write!(f, "repetition quantifier expects a valid decimal")
254 }
255 RepetitionCountUnclosed => {
256 write!(f, "unclosed counted repetition")
257 }
258 RepetitionMissing => {
259 write!(f, "repetition operator missing expression")
260 }
261 UnicodeClassInvalid => {
262 write!(f, "invalid Unicode character class")
263 }
264 UnsupportedBackreference => {
265 write!(f, "backreferences are not supported")
266 }
267 UnsupportedLookAround => write!(
268 f,
269 "look-around, including look-ahead and look-behind, \
270 is not supported"
271 ),
272 }
273 }
274}
275
276/// Span represents the position information of a single AST item.
277///
278/// All span positions are absolute byte offsets that can be used on the
279/// original regular expression that was parsed.
280#[derive(Clone, Copy, Eq, PartialEq)]
281pub struct Span {
282 /// The start byte offset.
283 pub start: Position,
284 /// The end byte offset.
285 pub end: Position,
286}
287
288impl core::fmt::Debug for Span {
289 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
290 write!(f, "Span({:?}, {:?})", self.start, self.end)
291 }
292}
293
294impl Ord for Span {
295 fn cmp(&self, other: &Span) -> Ordering {
296 (&self.start, &self.end).cmp(&(&other.start, &other.end))
297 }
298}
299
300impl PartialOrd for Span {
301 fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
302 Some(self.cmp(other))
303 }
304}
305
306/// A single position in a regular expression.
307///
308/// A position encodes one half of a span, and include the byte offset, line
309/// number and column number.
310#[derive(Clone, Copy, Eq, PartialEq)]
311pub struct Position {
312 /// The absolute offset of this position, starting at `0` from the
313 /// beginning of the regular expression pattern string.
314 pub offset: usize,
315 /// The line number, starting at `1`.
316 pub line: usize,
317 /// The approximate column number, starting at `1`.
318 pub column: usize,
319}
320
321impl core::fmt::Debug for Position {
322 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
323 write!(
324 f,
325 "Position(o: {:?}, l: {:?}, c: {:?})",
326 self.offset, self.line, self.column
327 )
328 }
329}
330
331impl Ord for Position {
332 fn cmp(&self, other: &Position) -> Ordering {
333 self.offset.cmp(&other.offset)
334 }
335}
336
337impl PartialOrd for Position {
338 fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
339 Some(self.cmp(other))
340 }
341}
342
343impl Span {
344 /// Create a new span with the given positions.
345 pub fn new(start: Position, end: Position) -> Span {
346 Span { start, end }
347 }
348
349 /// Create a new span using the given position as the start and end.
350 pub fn splat(pos: Position) -> Span {
351 Span::new(pos, pos)
352 }
353
354 /// Create a new span by replacing the starting the position with the one
355 /// given.
356 pub fn with_start(self, pos: Position) -> Span {
357 Span { start: pos, ..self }
358 }
359
360 /// Create a new span by replacing the ending the position with the one
361 /// given.
362 pub fn with_end(self, pos: Position) -> Span {
363 Span { end: pos, ..self }
364 }
365
366 /// Returns true if and only if this span occurs on a single line.
367 pub fn is_one_line(&self) -> bool {
368 self.start.line == self.end.line
369 }
370
371 /// Returns true if and only if this span is empty. That is, it points to
372 /// a single position in the concrete syntax of a regular expression.
373 pub fn is_empty(&self) -> bool {
374 self.start.offset == self.end.offset
375 }
376}
377
378impl Position {
379 /// Create a new position with the given information.
380 ///
381 /// `offset` is the absolute offset of the position, starting at `0` from
382 /// the beginning of the regular expression pattern string.
383 ///
384 /// `line` is the line number, starting at `1`.
385 ///
386 /// `column` is the approximate column number, starting at `1`.
387 pub fn new(offset: usize, line: usize, column: usize) -> Position {
388 Position { offset, line, column }
389 }
390}
391
392/// An abstract syntax tree for a singular expression along with comments
393/// found.
394///
395/// Comments are not stored in the tree itself to avoid complexity. Each
396/// comment contains a span of precisely where it occurred in the original
397/// regular expression.
398#[derive(Clone, Debug, Eq, PartialEq)]
399pub struct WithComments {
400 /// The actual ast.
401 pub ast: Ast,
402 /// All comments found in the original regular expression.
403 pub comments: Vec<Comment>,
404}
405
406/// A comment from a regular expression with an associated span.
407///
408/// A regular expression can only contain comments when the `x` flag is
409/// enabled.
410#[derive(Clone, Debug, Eq, PartialEq)]
411pub struct Comment {
412 /// The span of this comment, including the beginning `#` and ending `\n`.
413 pub span: Span,
414 /// The comment text, starting with the first character following the `#`
415 /// and ending with the last character preceding the `\n`.
416 pub comment: String,
417}
418
419/// An abstract syntax tree for a single regular expression.
420///
421/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
422/// space proportional to the size of the `Ast`.
423///
424/// This type defines its own destructor that uses constant stack space and
425/// heap space proportional to the size of the `Ast`.
426#[derive(Clone, Debug, Eq, PartialEq)]
427pub enum Ast {
428 /// An empty regex that matches everything.
429 Empty(Span),
430 /// A set of flags, e.g., `(?is)`.
431 Flags(SetFlags),
432 /// A single character literal, which includes escape sequences.
433 Literal(Literal),
434 /// The "any character" class.
435 Dot(Span),
436 /// A single zero-width assertion.
437 Assertion(Assertion),
438 /// A single character class. This includes all forms of character classes
439 /// except for `.`. e.g., `\d`, `\pN`, `[a-z]` and `[[:alpha:]]`.
440 Class(Class),
441 /// A repetition operator applied to an arbitrary regular expression.
442 Repetition(Repetition),
443 /// A grouped regular expression.
444 Group(Group),
445 /// An alternation of regular expressions.
446 Alternation(Alternation),
447 /// A concatenation of regular expressions.
448 Concat(Concat),
449}
450
451impl Ast {
452 /// Return the span of this abstract syntax tree.
453 pub fn span(&self) -> &Span {
454 match *self {
455 Ast::Empty(ref span) => span,
456 Ast::Flags(ref x) => &x.span,
457 Ast::Literal(ref x) => &x.span,
458 Ast::Dot(ref span) => span,
459 Ast::Assertion(ref x) => &x.span,
460 Ast::Class(ref x) => x.span(),
461 Ast::Repetition(ref x) => &x.span,
462 Ast::Group(ref x) => &x.span,
463 Ast::Alternation(ref x) => &x.span,
464 Ast::Concat(ref x) => &x.span,
465 }
466 }
467
468 /// Return true if and only if this Ast is empty.
469 pub fn is_empty(&self) -> bool {
470 match *self {
471 Ast::Empty(_) => true,
472 _ => false,
473 }
474 }
475
476 /// Returns true if and only if this AST has any (including possibly empty)
477 /// subexpressions.
478 fn has_subexprs(&self) -> bool {
479 match *self {
480 Ast::Empty(_)
481 | Ast::Flags(_)
482 | Ast::Literal(_)
483 | Ast::Dot(_)
484 | Ast::Assertion(_) => false,
485 Ast::Class(_)
486 | Ast::Repetition(_)
487 | Ast::Group(_)
488 | Ast::Alternation(_)
489 | Ast::Concat(_) => true,
490 }
491 }
492}
493
494/// Print a display representation of this Ast.
495///
496/// This does not preserve any of the original whitespace formatting that may
497/// have originally been present in the concrete syntax from which this Ast
498/// was generated.
499///
500/// This implementation uses constant stack space and heap space proportional
501/// to the size of the `Ast`.
502impl core::fmt::Display for Ast {
503 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
504 use crate::ast::print::Printer;
505 Printer::new().print(self, wtr:f)
506 }
507}
508
509/// An alternation of regular expressions.
510#[derive(Clone, Debug, Eq, PartialEq)]
511pub struct Alternation {
512 /// The span of this alternation.
513 pub span: Span,
514 /// The alternate regular expressions.
515 pub asts: Vec<Ast>,
516}
517
518impl Alternation {
519 /// Return this alternation as an AST.
520 ///
521 /// If this alternation contains zero ASTs, then Ast::Empty is
522 /// returned. If this alternation contains exactly 1 AST, then the
523 /// corresponding AST is returned. Otherwise, Ast::Alternation is returned.
524 pub fn into_ast(mut self) -> Ast {
525 match self.asts.len() {
526 0 => Ast::Empty(self.span),
527 1 => self.asts.pop().unwrap(),
528 _ => Ast::Alternation(self),
529 }
530 }
531}
532
533/// A concatenation of regular expressions.
534#[derive(Clone, Debug, Eq, PartialEq)]
535pub struct Concat {
536 /// The span of this concatenation.
537 pub span: Span,
538 /// The concatenation regular expressions.
539 pub asts: Vec<Ast>,
540}
541
542impl Concat {
543 /// Return this concatenation as an AST.
544 ///
545 /// If this concatenation contains zero ASTs, then Ast::Empty is
546 /// returned. If this concatenation contains exactly 1 AST, then the
547 /// corresponding AST is returned. Otherwise, Ast::Concat is returned.
548 pub fn into_ast(mut self) -> Ast {
549 match self.asts.len() {
550 0 => Ast::Empty(self.span),
551 1 => self.asts.pop().unwrap(),
552 _ => Ast::Concat(self),
553 }
554 }
555}
556
557/// A single literal expression.
558///
559/// A literal corresponds to a single Unicode scalar value. Literals may be
560/// represented in their literal form, e.g., `a` or in their escaped form,
561/// e.g., `\x61`.
562#[derive(Clone, Debug, Eq, PartialEq)]
563pub struct Literal {
564 /// The span of this literal.
565 pub span: Span,
566 /// The kind of this literal.
567 pub kind: LiteralKind,
568 /// The Unicode scalar value corresponding to this literal.
569 pub c: char,
570}
571
572impl Literal {
573 /// If this literal was written as a `\x` hex escape, then this returns
574 /// the corresponding byte value. Otherwise, this returns `None`.
575 pub fn byte(&self) -> Option<u8> {
576 match self.kind {
577 LiteralKind::HexFixed(HexLiteralKind::X) => {
578 u8::try_from(self.c).ok()
579 }
580 _ => None,
581 }
582 }
583}
584
585/// The kind of a single literal expression.
586#[derive(Clone, Debug, Eq, PartialEq)]
587pub enum LiteralKind {
588 /// The literal is written verbatim, e.g., `a` or `☃`.
589 Verbatim,
590 /// The literal is written as an escape because it is otherwise a special
591 /// regex meta character, e.g., `\*` or `\[`.
592 Meta,
593 /// The literal is written as an escape despite the fact that the escape is
594 /// unnecessary, e.g., `\%` or `\/`.
595 Superfluous,
596 /// The literal is written as an octal escape, e.g., `\141`.
597 Octal,
598 /// The literal is written as a hex code with a fixed number of digits
599 /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
600 /// `\U00000061`.
601 HexFixed(HexLiteralKind),
602 /// The literal is written as a hex code with a bracketed number of
603 /// digits. The only restriction is that the bracketed hex code must refer
604 /// to a valid Unicode scalar value.
605 HexBrace(HexLiteralKind),
606 /// The literal is written as a specially recognized escape, e.g., `\f`
607 /// or `\n`.
608 Special(SpecialLiteralKind),
609}
610
611/// The type of a special literal.
612///
613/// A special literal is a special escape sequence recognized by the regex
614/// parser, e.g., `\f` or `\n`.
615#[derive(Clone, Debug, Eq, PartialEq)]
616pub enum SpecialLiteralKind {
617 /// Bell, spelled `\a` (`\x07`).
618 Bell,
619 /// Form feed, spelled `\f` (`\x0C`).
620 FormFeed,
621 /// Tab, spelled `\t` (`\x09`).
622 Tab,
623 /// Line feed, spelled `\n` (`\x0A`).
624 LineFeed,
625 /// Carriage return, spelled `\r` (`\x0D`).
626 CarriageReturn,
627 /// Vertical tab, spelled `\v` (`\x0B`).
628 VerticalTab,
629 /// Space, spelled `\ ` (`\x20`). Note that this can only appear when
630 /// parsing in verbose mode.
631 Space,
632}
633
634/// The type of a Unicode hex literal.
635///
636/// Note that all variants behave the same when used with brackets. They only
637/// differ when used without brackets in the number of hex digits that must
638/// follow.
639#[derive(Clone, Debug, Eq, PartialEq)]
640pub enum HexLiteralKind {
641 /// A `\x` prefix. When used without brackets, this form is limited to
642 /// two digits.
643 X,
644 /// A `\u` prefix. When used without brackets, this form is limited to
645 /// four digits.
646 UnicodeShort,
647 /// A `\U` prefix. When used without brackets, this form is limited to
648 /// eight digits.
649 UnicodeLong,
650}
651
652impl HexLiteralKind {
653 /// The number of digits that must be used with this literal form when
654 /// used without brackets. When used with brackets, there is no
655 /// restriction on the number of digits.
656 pub fn digits(&self) -> u32 {
657 match *self {
658 HexLiteralKind::X => 2,
659 HexLiteralKind::UnicodeShort => 4,
660 HexLiteralKind::UnicodeLong => 8,
661 }
662 }
663}
664
665/// A single character class expression.
666#[derive(Clone, Debug, Eq, PartialEq)]
667pub enum Class {
668 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
669 Unicode(ClassUnicode),
670 /// A perl character class, e.g., `\d` or `\W`.
671 Perl(ClassPerl),
672 /// A bracketed character class set, which may contain zero or more
673 /// character ranges and/or zero or more nested classes. e.g.,
674 /// `[a-zA-Z\pL]`.
675 Bracketed(ClassBracketed),
676}
677
678impl Class {
679 /// Return the span of this character class.
680 pub fn span(&self) -> &Span {
681 match *self {
682 Class::Perl(ref x: &ClassPerl) => &x.span,
683 Class::Unicode(ref x: &ClassUnicode) => &x.span,
684 Class::Bracketed(ref x: &ClassBracketed) => &x.span,
685 }
686 }
687}
688
689/// A Perl character class.
690#[derive(Clone, Debug, Eq, PartialEq)]
691pub struct ClassPerl {
692 /// The span of this class.
693 pub span: Span,
694 /// The kind of Perl class.
695 pub kind: ClassPerlKind,
696 /// Whether the class is negated or not. e.g., `\d` is not negated but
697 /// `\D` is.
698 pub negated: bool,
699}
700
701/// The available Perl character classes.
702#[derive(Clone, Debug, Eq, PartialEq)]
703pub enum ClassPerlKind {
704 /// Decimal numbers.
705 Digit,
706 /// Whitespace.
707 Space,
708 /// Word characters.
709 Word,
710}
711
712/// An ASCII character class.
713#[derive(Clone, Debug, Eq, PartialEq)]
714pub struct ClassAscii {
715 /// The span of this class.
716 pub span: Span,
717 /// The kind of ASCII class.
718 pub kind: ClassAsciiKind,
719 /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
720 /// but `[[:^alpha:]]` is.
721 pub negated: bool,
722}
723
724/// The available ASCII character classes.
725#[derive(Clone, Debug, Eq, PartialEq)]
726pub enum ClassAsciiKind {
727 /// `[0-9A-Za-z]`
728 Alnum,
729 /// `[A-Za-z]`
730 Alpha,
731 /// `[\x00-\x7F]`
732 Ascii,
733 /// `[ \t]`
734 Blank,
735 /// `[\x00-\x1F\x7F]`
736 Cntrl,
737 /// `[0-9]`
738 Digit,
739 /// `[!-~]`
740 Graph,
741 /// `[a-z]`
742 Lower,
743 /// `[ -~]`
744 Print,
745 /// `[!-/:-@\[-`{-~]`
746 Punct,
747 /// `[\t\n\v\f\r ]`
748 Space,
749 /// `[A-Z]`
750 Upper,
751 /// `[0-9A-Za-z_]`
752 Word,
753 /// `[0-9A-Fa-f]`
754 Xdigit,
755}
756
757impl ClassAsciiKind {
758 /// Return the corresponding ClassAsciiKind variant for the given name.
759 ///
760 /// The name given should correspond to the lowercase version of the
761 /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
762 ///
763 /// If no variant with the corresponding name exists, then `None` is
764 /// returned.
765 pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
766 use self::ClassAsciiKind::*;
767 match name {
768 "alnum" => Some(Alnum),
769 "alpha" => Some(Alpha),
770 "ascii" => Some(Ascii),
771 "blank" => Some(Blank),
772 "cntrl" => Some(Cntrl),
773 "digit" => Some(Digit),
774 "graph" => Some(Graph),
775 "lower" => Some(Lower),
776 "print" => Some(Print),
777 "punct" => Some(Punct),
778 "space" => Some(Space),
779 "upper" => Some(Upper),
780 "word" => Some(Word),
781 "xdigit" => Some(Xdigit),
782 _ => None,
783 }
784 }
785}
786
787/// A Unicode character class.
788#[derive(Clone, Debug, Eq, PartialEq)]
789pub struct ClassUnicode {
790 /// The span of this class.
791 pub span: Span,
792 /// Whether this class is negated or not.
793 ///
794 /// Note: be careful when using this attribute. This specifically refers
795 /// to whether the class is written as `\p` or `\P`, where the latter
796 /// is `negated = true`. However, it also possible to write something like
797 /// `\P{scx!=Katakana}` which is actually equivalent to
798 /// `\p{scx=Katakana}` and is therefore not actually negated even though
799 /// `negated = true` here. To test whether this class is truly negated
800 /// or not, use the `is_negated` method.
801 pub negated: bool,
802 /// The kind of Unicode class.
803 pub kind: ClassUnicodeKind,
804}
805
806impl ClassUnicode {
807 /// Returns true if this class has been negated.
808 ///
809 /// Note that this takes the Unicode op into account, if it's present.
810 /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
811 pub fn is_negated(&self) -> bool {
812 match self.kind {
813 ClassUnicodeKind::NamedValue {
814 op: ClassUnicodeOpKind::NotEqual,
815 ..
816 } => !self.negated,
817 _ => self.negated,
818 }
819 }
820}
821
822/// The available forms of Unicode character classes.
823#[derive(Clone, Debug, Eq, PartialEq)]
824pub enum ClassUnicodeKind {
825 /// A one letter abbreviated class, e.g., `\pN`.
826 OneLetter(char),
827 /// A binary property, general category or script. The string may be
828 /// empty.
829 Named(String),
830 /// A property name and an associated value.
831 NamedValue {
832 /// The type of Unicode op used to associate `name` with `value`.
833 op: ClassUnicodeOpKind,
834 /// The property name (which may be empty).
835 name: String,
836 /// The property value (which may be empty).
837 value: String,
838 },
839}
840
841/// The type of op used in a Unicode character class.
842#[derive(Clone, Debug, Eq, PartialEq)]
843pub enum ClassUnicodeOpKind {
844 /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
845 Equal,
846 /// A property set to a specific value using a colon, e.g.,
847 /// `\p{scx:Katakana}`.
848 Colon,
849 /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
850 NotEqual,
851}
852
853impl ClassUnicodeOpKind {
854 /// Whether the op is an equality op or not.
855 pub fn is_equal(&self) -> bool {
856 match *self {
857 ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true,
858 _ => false,
859 }
860 }
861}
862
863/// A bracketed character class, e.g., `[a-z0-9]`.
864#[derive(Clone, Debug, Eq, PartialEq)]
865pub struct ClassBracketed {
866 /// The span of this class.
867 pub span: Span,
868 /// Whether this class is negated or not. e.g., `[a]` is not negated but
869 /// `[^a]` is.
870 pub negated: bool,
871 /// The type of this set. A set is either a normal union of things, e.g.,
872 /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
873 pub kind: ClassSet,
874}
875
876/// A character class set.
877///
878/// This type corresponds to the internal structure of a bracketed character
879/// class. That is, every bracketed character is one of two types: a union of
880/// items (literals, ranges, other bracketed classes) or a tree of binary set
881/// operations.
882#[derive(Clone, Debug, Eq, PartialEq)]
883pub enum ClassSet {
884 /// An item, which can be a single literal, range, nested character class
885 /// or a union of items.
886 Item(ClassSetItem),
887 /// A single binary operation (i.e., &&, -- or ~~).
888 BinaryOp(ClassSetBinaryOp),
889}
890
891impl ClassSet {
892 /// Build a set from a union.
893 pub fn union(ast: ClassSetUnion) -> ClassSet {
894 ClassSet::Item(ClassSetItem::Union(ast))
895 }
896
897 /// Return the span of this character class set.
898 pub fn span(&self) -> &Span {
899 match *self {
900 ClassSet::Item(ref x: &ClassSetItem) => x.span(),
901 ClassSet::BinaryOp(ref x: &ClassSetBinaryOp) => &x.span,
902 }
903 }
904
905 /// Return true if and only if this class set is empty.
906 fn is_empty(&self) -> bool {
907 match *self {
908 ClassSet::Item(ClassSetItem::Empty(_)) => true,
909 _ => false,
910 }
911 }
912}
913
914/// A single component of a character class set.
915#[derive(Clone, Debug, Eq, PartialEq)]
916pub enum ClassSetItem {
917 /// An empty item.
918 ///
919 /// Note that a bracketed character class cannot contain a single empty
920 /// item. Empty items can appear when using one of the binary operators.
921 /// For example, `[&&]` is the intersection of two empty classes.
922 Empty(Span),
923 /// A single literal.
924 Literal(Literal),
925 /// A range between two literals.
926 Range(ClassSetRange),
927 /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
928 Ascii(ClassAscii),
929 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
930 Unicode(ClassUnicode),
931 /// A perl character class, e.g., `\d` or `\W`.
932 Perl(ClassPerl),
933 /// A bracketed character class set, which may contain zero or more
934 /// character ranges and/or zero or more nested classes. e.g.,
935 /// `[a-zA-Z\pL]`.
936 Bracketed(Box<ClassBracketed>),
937 /// A union of items.
938 Union(ClassSetUnion),
939}
940
941impl ClassSetItem {
942 /// Return the span of this character class set item.
943 pub fn span(&self) -> &Span {
944 match *self {
945 ClassSetItem::Empty(ref span: &Span) => span,
946 ClassSetItem::Literal(ref x: &Literal) => &x.span,
947 ClassSetItem::Range(ref x: &ClassSetRange) => &x.span,
948 ClassSetItem::Ascii(ref x: &ClassAscii) => &x.span,
949 ClassSetItem::Perl(ref x: &ClassPerl) => &x.span,
950 ClassSetItem::Unicode(ref x: &ClassUnicode) => &x.span,
951 ClassSetItem::Bracketed(ref x: &Box) => &x.span,
952 ClassSetItem::Union(ref x: &ClassSetUnion) => &x.span,
953 }
954 }
955}
956
957/// A single character class range in a set.
958#[derive(Clone, Debug, Eq, PartialEq)]
959pub struct ClassSetRange {
960 /// The span of this range.
961 pub span: Span,
962 /// The start of this range.
963 pub start: Literal,
964 /// The end of this range.
965 pub end: Literal,
966}
967
968impl ClassSetRange {
969 /// Returns true if and only if this character class range is valid.
970 ///
971 /// The only case where a range is invalid is if its start is greater than
972 /// its end.
973 pub fn is_valid(&self) -> bool {
974 self.start.c <= self.end.c
975 }
976}
977
978/// A union of items inside a character class set.
979#[derive(Clone, Debug, Eq, PartialEq)]
980pub struct ClassSetUnion {
981 /// The span of the items in this operation. e.g., the `a-z0-9` in
982 /// `[^a-z0-9]`
983 pub span: Span,
984 /// The sequence of items that make up this union.
985 pub items: Vec<ClassSetItem>,
986}
987
988impl ClassSetUnion {
989 /// Push a new item in this union.
990 ///
991 /// The ending position of this union's span is updated to the ending
992 /// position of the span of the item given. If the union is empty, then
993 /// the starting position of this union is set to the starting position
994 /// of this item.
995 ///
996 /// In other words, if you only use this method to add items to a union
997 /// and you set the spans on each item correctly, then you should never
998 /// need to adjust the span of the union directly.
999 pub fn push(&mut self, item: ClassSetItem) {
1000 if self.items.is_empty() {
1001 self.span.start = item.span().start;
1002 }
1003 self.span.end = item.span().end;
1004 self.items.push(item);
1005 }
1006
1007 /// Return this union as a character class set item.
1008 ///
1009 /// If this union contains zero items, then an empty union is
1010 /// returned. If this concatenation contains exactly 1 item, then the
1011 /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1012 /// returned.
1013 pub fn into_item(mut self) -> ClassSetItem {
1014 match self.items.len() {
1015 0 => ClassSetItem::Empty(self.span),
1016 1 => self.items.pop().unwrap(),
1017 _ => ClassSetItem::Union(self),
1018 }
1019 }
1020}
1021
1022/// A Unicode character class set operation.
1023#[derive(Clone, Debug, Eq, PartialEq)]
1024pub struct ClassSetBinaryOp {
1025 /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1026 pub span: Span,
1027 /// The type of this set operation.
1028 pub kind: ClassSetBinaryOpKind,
1029 /// The left hand side of the operation.
1030 pub lhs: Box<ClassSet>,
1031 /// The right hand side of the operation.
1032 pub rhs: Box<ClassSet>,
1033}
1034
1035/// The type of a Unicode character class set operation.
1036///
1037/// Note that this doesn't explicitly represent union since there is no
1038/// explicit union operator. Concatenation inside a character class corresponds
1039/// to the union operation.
1040#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1041pub enum ClassSetBinaryOpKind {
1042 /// The intersection of two sets, e.g., `\pN&&[a-z]`.
1043 Intersection,
1044 /// The difference of two sets, e.g., `\pN--[0-9]`.
1045 Difference,
1046 /// The symmetric difference of two sets. The symmetric difference is the
1047 /// set of elements belonging to one but not both sets.
1048 /// e.g., `[\pL~~[:ascii:]]`.
1049 SymmetricDifference,
1050}
1051
1052/// A single zero-width assertion.
1053#[derive(Clone, Debug, Eq, PartialEq)]
1054pub struct Assertion {
1055 /// The span of this assertion.
1056 pub span: Span,
1057 /// The assertion kind, e.g., `\b` or `^`.
1058 pub kind: AssertionKind,
1059}
1060
1061/// An assertion kind.
1062#[derive(Clone, Debug, Eq, PartialEq)]
1063pub enum AssertionKind {
1064 /// `^`
1065 StartLine,
1066 /// `$`
1067 EndLine,
1068 /// `\A`
1069 StartText,
1070 /// `\z`
1071 EndText,
1072 /// `\b`
1073 WordBoundary,
1074 /// `\B`
1075 NotWordBoundary,
1076}
1077
1078/// A repetition operation applied to a regular expression.
1079#[derive(Clone, Debug, Eq, PartialEq)]
1080pub struct Repetition {
1081 /// The span of this operation.
1082 pub span: Span,
1083 /// The actual operation.
1084 pub op: RepetitionOp,
1085 /// Whether this operation was applied greedily or not.
1086 pub greedy: bool,
1087 /// The regular expression under repetition.
1088 pub ast: Box<Ast>,
1089}
1090
1091/// The repetition operator itself.
1092#[derive(Clone, Debug, Eq, PartialEq)]
1093pub struct RepetitionOp {
1094 /// The span of this operator. This includes things like `+`, `*?` and
1095 /// `{m,n}`.
1096 pub span: Span,
1097 /// The type of operation.
1098 pub kind: RepetitionKind,
1099}
1100
1101/// The kind of a repetition operator.
1102#[derive(Clone, Debug, Eq, PartialEq)]
1103pub enum RepetitionKind {
1104 /// `?`
1105 ZeroOrOne,
1106 /// `*`
1107 ZeroOrMore,
1108 /// `+`
1109 OneOrMore,
1110 /// `{m,n}`
1111 Range(RepetitionRange),
1112}
1113
1114/// A range repetition operator.
1115#[derive(Clone, Debug, Eq, PartialEq)]
1116pub enum RepetitionRange {
1117 /// `{m}`
1118 Exactly(u32),
1119 /// `{m,}`
1120 AtLeast(u32),
1121 /// `{m,n}`
1122 Bounded(u32, u32),
1123}
1124
1125impl RepetitionRange {
1126 /// Returns true if and only if this repetition range is valid.
1127 ///
1128 /// The only case where a repetition range is invalid is if it is bounded
1129 /// and its start is greater than its end.
1130 pub fn is_valid(&self) -> bool {
1131 match *self {
1132 RepetitionRange::Bounded(s: u32, e: u32) if s > e => false,
1133 _ => true,
1134 }
1135 }
1136}
1137
1138/// A grouped regular expression.
1139///
1140/// This includes both capturing and non-capturing groups. This does **not**
1141/// include flag-only groups like `(?is)`, but does contain any group that
1142/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1143/// `(?is:a)`.
1144#[derive(Clone, Debug, Eq, PartialEq)]
1145pub struct Group {
1146 /// The span of this group.
1147 pub span: Span,
1148 /// The kind of this group.
1149 pub kind: GroupKind,
1150 /// The regular expression in this group.
1151 pub ast: Box<Ast>,
1152}
1153
1154impl Group {
1155 /// If this group is non-capturing, then this returns the (possibly empty)
1156 /// set of flags. Otherwise, `None` is returned.
1157 pub fn flags(&self) -> Option<&Flags> {
1158 match self.kind {
1159 GroupKind::NonCapturing(ref flags) => Some(flags),
1160 _ => None,
1161 }
1162 }
1163
1164 /// Returns true if and only if this group is capturing.
1165 pub fn is_capturing(&self) -> bool {
1166 match self.kind {
1167 GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true,
1168 GroupKind::NonCapturing(_) => false,
1169 }
1170 }
1171
1172 /// Returns the capture index of this group, if this is a capturing group.
1173 ///
1174 /// This returns a capture index precisely when `is_capturing` is `true`.
1175 pub fn capture_index(&self) -> Option<u32> {
1176 match self.kind {
1177 GroupKind::CaptureIndex(i) => Some(i),
1178 GroupKind::CaptureName { ref name, .. } => Some(name.index),
1179 GroupKind::NonCapturing(_) => None,
1180 }
1181 }
1182}
1183
1184/// The kind of a group.
1185#[derive(Clone, Debug, Eq, PartialEq)]
1186pub enum GroupKind {
1187 /// `(a)`
1188 CaptureIndex(u32),
1189 /// `(?<name>a)` or `(?P<name>a)`
1190 CaptureName {
1191 /// True if the `?P<` syntax is used and false if the `?<` syntax is used.
1192 starts_with_p: bool,
1193 /// The capture name.
1194 name: CaptureName,
1195 },
1196 /// `(?:a)` and `(?i:a)`
1197 NonCapturing(Flags),
1198}
1199
1200/// A capture name.
1201///
1202/// This corresponds to the name itself between the angle brackets in, e.g.,
1203/// `(?P<foo>expr)`.
1204#[derive(Clone, Debug, Eq, PartialEq)]
1205pub struct CaptureName {
1206 /// The span of this capture name.
1207 pub span: Span,
1208 /// The capture name.
1209 pub name: String,
1210 /// The capture index.
1211 pub index: u32,
1212}
1213
1214/// A group of flags that is not applied to a particular regular expression.
1215#[derive(Clone, Debug, Eq, PartialEq)]
1216pub struct SetFlags {
1217 /// The span of these flags, including the grouping parentheses.
1218 pub span: Span,
1219 /// The actual sequence of flags.
1220 pub flags: Flags,
1221}
1222
1223/// A group of flags.
1224///
1225/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1226#[derive(Clone, Debug, Eq, PartialEq)]
1227pub struct Flags {
1228 /// The span of this group of flags.
1229 pub span: Span,
1230 /// A sequence of flag items. Each item is either a flag or a negation
1231 /// operator.
1232 pub items: Vec<FlagsItem>,
1233}
1234
1235impl Flags {
1236 /// Add the given item to this sequence of flags.
1237 ///
1238 /// If the item was added successfully, then `None` is returned. If the
1239 /// given item is a duplicate, then `Some(i)` is returned, where
1240 /// `items[i].kind == item.kind`.
1241 pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1242 for (i, x) in self.items.iter().enumerate() {
1243 if x.kind == item.kind {
1244 return Some(i);
1245 }
1246 }
1247 self.items.push(item);
1248 None
1249 }
1250
1251 /// Returns the state of the given flag in this set.
1252 ///
1253 /// If the given flag is in the set but is negated, then `Some(false)` is
1254 /// returned.
1255 ///
1256 /// If the given flag is in the set and is not negated, then `Some(true)`
1257 /// is returned.
1258 ///
1259 /// Otherwise, `None` is returned.
1260 pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1261 let mut negated = false;
1262 for x in &self.items {
1263 match x.kind {
1264 FlagsItemKind::Negation => {
1265 negated = true;
1266 }
1267 FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1268 return Some(!negated);
1269 }
1270 _ => {}
1271 }
1272 }
1273 None
1274 }
1275}
1276
1277/// A single item in a group of flags.
1278#[derive(Clone, Debug, Eq, PartialEq)]
1279pub struct FlagsItem {
1280 /// The span of this item.
1281 pub span: Span,
1282 /// The kind of this item.
1283 pub kind: FlagsItemKind,
1284}
1285
1286/// The kind of an item in a group of flags.
1287#[derive(Clone, Debug, Eq, PartialEq)]
1288pub enum FlagsItemKind {
1289 /// A negation operator applied to all subsequent flags in the enclosing
1290 /// group.
1291 Negation,
1292 /// A single flag in a group.
1293 Flag(Flag),
1294}
1295
1296impl FlagsItemKind {
1297 /// Returns true if and only if this item is a negation operator.
1298 pub fn is_negation(&self) -> bool {
1299 match *self {
1300 FlagsItemKind::Negation => true,
1301 _ => false,
1302 }
1303 }
1304}
1305
1306/// A single flag.
1307#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1308pub enum Flag {
1309 /// `i`
1310 CaseInsensitive,
1311 /// `m`
1312 MultiLine,
1313 /// `s`
1314 DotMatchesNewLine,
1315 /// `U`
1316 SwapGreed,
1317 /// `u`
1318 Unicode,
1319 /// `R`
1320 CRLF,
1321 /// `x`
1322 IgnoreWhitespace,
1323}
1324
1325/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1326/// space but heap space proportional to the depth of the `Ast`.
1327impl Drop for Ast {
1328 fn drop(&mut self) {
1329 use core::mem;
1330
1331 match *self {
1332 Ast::Empty(_)
1333 | Ast::Flags(_)
1334 | Ast::Literal(_)
1335 | Ast::Dot(_)
1336 | Ast::Assertion(_)
1337 // Classes are recursive, so they get their own Drop impl.
1338 | Ast::Class(_) => return,
1339 Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1340 Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1341 Ast::Alternation(ref x) if x.asts.is_empty() => return,
1342 Ast::Concat(ref x) if x.asts.is_empty() => return,
1343 _ => {}
1344 }
1345
1346 let empty_span = || Span::splat(Position::new(0, 0, 0));
1347 let empty_ast = || Ast::Empty(empty_span());
1348 let mut stack = vec![mem::replace(self, empty_ast())];
1349 while let Some(mut ast) = stack.pop() {
1350 match ast {
1351 Ast::Empty(_)
1352 | Ast::Flags(_)
1353 | Ast::Literal(_)
1354 | Ast::Dot(_)
1355 | Ast::Assertion(_)
1356 // Classes are recursive, so they get their own Drop impl.
1357 | Ast::Class(_) => {}
1358 Ast::Repetition(ref mut x) => {
1359 stack.push(mem::replace(&mut x.ast, empty_ast()));
1360 }
1361 Ast::Group(ref mut x) => {
1362 stack.push(mem::replace(&mut x.ast, empty_ast()));
1363 }
1364 Ast::Alternation(ref mut x) => {
1365 stack.extend(x.asts.drain(..));
1366 }
1367 Ast::Concat(ref mut x) => {
1368 stack.extend(x.asts.drain(..));
1369 }
1370 }
1371 }
1372 }
1373}
1374
1375/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1376/// stack space but heap space proportional to the depth of the `ClassSet`.
1377impl Drop for ClassSet {
1378 fn drop(&mut self) {
1379 use core::mem;
1380
1381 match *self {
1382 ClassSet::Item(ref item) => match *item {
1383 ClassSetItem::Empty(_)
1384 | ClassSetItem::Literal(_)
1385 | ClassSetItem::Range(_)
1386 | ClassSetItem::Ascii(_)
1387 | ClassSetItem::Unicode(_)
1388 | ClassSetItem::Perl(_) => return,
1389 ClassSetItem::Bracketed(ref x) => {
1390 if x.kind.is_empty() {
1391 return;
1392 }
1393 }
1394 ClassSetItem::Union(ref x) => {
1395 if x.items.is_empty() {
1396 return;
1397 }
1398 }
1399 },
1400 ClassSet::BinaryOp(ref op) => {
1401 if op.lhs.is_empty() && op.rhs.is_empty() {
1402 return;
1403 }
1404 }
1405 }
1406
1407 let empty_span = || Span::splat(Position::new(0, 0, 0));
1408 let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
1409 let mut stack = vec![mem::replace(self, empty_set())];
1410 while let Some(mut set) = stack.pop() {
1411 match set {
1412 ClassSet::Item(ref mut item) => match *item {
1413 ClassSetItem::Empty(_)
1414 | ClassSetItem::Literal(_)
1415 | ClassSetItem::Range(_)
1416 | ClassSetItem::Ascii(_)
1417 | ClassSetItem::Unicode(_)
1418 | ClassSetItem::Perl(_) => {}
1419 ClassSetItem::Bracketed(ref mut x) => {
1420 stack.push(mem::replace(&mut x.kind, empty_set()));
1421 }
1422 ClassSetItem::Union(ref mut x) => {
1423 stack.extend(x.items.drain(..).map(ClassSet::Item));
1424 }
1425 },
1426 ClassSet::BinaryOp(ref mut op) => {
1427 stack.push(mem::replace(&mut op.lhs, empty_set()));
1428 stack.push(mem::replace(&mut op.rhs, empty_set()));
1429 }
1430 }
1431 }
1432 }
1433}
1434
1435#[cfg(test)]
1436mod tests {
1437 use super::*;
1438
1439 // We use a thread with an explicit stack size to test that our destructor
1440 // for Ast can handle arbitrarily sized expressions in constant stack
1441 // space. In case we run on a platform without threads (WASM?), we limit
1442 // this test to Windows/Unix.
1443 #[test]
1444 #[cfg(any(unix, windows))]
1445 fn no_stack_overflow_on_drop() {
1446 use std::thread;
1447
1448 let run = || {
1449 let span = || Span::splat(Position::new(0, 0, 0));
1450 let mut ast = Ast::Empty(span());
1451 for i in 0..200 {
1452 ast = Ast::Group(Group {
1453 span: span(),
1454 kind: GroupKind::CaptureIndex(i),
1455 ast: Box::new(ast),
1456 });
1457 }
1458 assert!(!ast.is_empty());
1459 };
1460
1461 // We run our test on a thread with a small stack size so we can
1462 // force the issue more easily.
1463 //
1464 // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
1465 // will just barf with very small stack sizes. So we bump this up a bit
1466 // to give more room to breath. When I did this, I confirmed that if
1467 // I remove the custom `Drop` impl for `Ast`, then this test does
1468 // indeed still fail with a stack overflow. (At the time of writing, I
1469 // had to bump it all the way up to 32K before the test would pass even
1470 // without the custom `Drop` impl. So 16K seems like a safe number
1471 // here.)
1472 //
1473 // See: https://github.com/rust-lang/regex/issues/967
1474 thread::Builder::new()
1475 .stack_size(16 << 10)
1476 .spawn(run)
1477 .unwrap()
1478 .join()
1479 .unwrap();
1480 }
1481}
1482