1/*!
2Defines an abstract syntax for regular expressions.
3*/
4
5use core::cmp::Ordering;
6
7use alloc::{boxed::Box, string::String, vec, vec::Vec};
8
9pub use crate::ast::visitor::{visit, Visitor};
10
11pub mod parse;
12pub mod print;
13mod visitor;
14
15/// An error that occurred while parsing a regular expression into an abstract
16/// syntax tree.
17///
18/// Note that not all ASTs represents a valid regular expression. For example,
19/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20/// valid Unicode property name. That particular error is reported when
21/// translating an AST to the high-level intermediate representation (`HIR`).
22#[derive(Clone, Debug, Eq, PartialEq)]
23#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
24pub struct Error {
25 /// The kind of error.
26 kind: ErrorKind,
27 /// The original pattern that the parser generated the error from. Every
28 /// span in an error is a valid range into this string.
29 pattern: String,
30 /// The span of this error.
31 span: Span,
32}
33
34impl Error {
35 /// Return the type of this error.
36 pub fn kind(&self) -> &ErrorKind {
37 &self.kind
38 }
39
40 /// The original pattern string in which this error occurred.
41 ///
42 /// Every span reported by this error is reported in terms of this string.
43 pub fn pattern(&self) -> &str {
44 &self.pattern
45 }
46
47 /// Return the span at which this error occurred.
48 pub fn span(&self) -> &Span {
49 &self.span
50 }
51
52 /// Return an auxiliary span. This span exists only for some errors that
53 /// benefit from being able to point to two locations in the original
54 /// regular expression. For example, "duplicate" errors will have the
55 /// main error position set to the duplicate occurrence while its
56 /// auxiliary span will be set to the initial occurrence.
57 pub fn auxiliary_span(&self) -> Option<&Span> {
58 use self::ErrorKind::*;
59 match self.kind {
60 FlagDuplicate { ref original } => Some(original),
61 FlagRepeatedNegation { ref original, .. } => Some(original),
62 GroupNameDuplicate { ref original, .. } => Some(original),
63 _ => None,
64 }
65 }
66}
67
68/// The type of an error that occurred while building an AST.
69///
70/// This error type is marked as `non_exhaustive`. This means that adding a
71/// new variant is not considered a breaking change.
72#[non_exhaustive]
73#[derive(Clone, Debug, Eq, PartialEq)]
74#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
75pub enum ErrorKind {
76 /// The capturing group limit was exceeded.
77 ///
78 /// Note that this represents a limit on the total number of capturing
79 /// groups in a regex and not necessarily the number of nested capturing
80 /// groups. That is, the nest limit can be low and it is still possible for
81 /// this error to occur.
82 CaptureLimitExceeded,
83 /// An invalid escape sequence was found in a character class set.
84 ClassEscapeInvalid,
85 /// An invalid character class range was found. An invalid range is any
86 /// range where the start is greater than the end.
87 ClassRangeInvalid,
88 /// An invalid range boundary was found in a character class. Range
89 /// boundaries must be a single literal codepoint, but this error indicates
90 /// that something else was found, such as a nested class.
91 ClassRangeLiteral,
92 /// An opening `[` was found with no corresponding closing `]`.
93 ClassUnclosed,
94 /// Note that this error variant is no longer used. Namely, a decimal
95 /// number can only appear as a repetition quantifier. When the number
96 /// in a repetition quantifier is empty, then it gets its own specialized
97 /// error, `RepetitionCountDecimalEmpty`.
98 DecimalEmpty,
99 /// An invalid decimal number was given where one was expected.
100 DecimalInvalid,
101 /// A bracketed hex literal was empty.
102 EscapeHexEmpty,
103 /// A bracketed hex literal did not correspond to a Unicode scalar value.
104 EscapeHexInvalid,
105 /// An invalid hexadecimal digit was found.
106 EscapeHexInvalidDigit,
107 /// EOF was found before an escape sequence was completed.
108 EscapeUnexpectedEof,
109 /// An unrecognized escape sequence.
110 EscapeUnrecognized,
111 /// A dangling negation was used when setting flags, e.g., `i-`.
112 FlagDanglingNegation,
113 /// A flag was used twice, e.g., `i-i`.
114 FlagDuplicate {
115 /// The position of the original flag. The error position
116 /// points to the duplicate flag.
117 original: Span,
118 },
119 /// The negation operator was used twice, e.g., `-i-s`.
120 FlagRepeatedNegation {
121 /// The position of the original negation operator. The error position
122 /// points to the duplicate negation operator.
123 original: Span,
124 },
125 /// Expected a flag but got EOF, e.g., `(?`.
126 FlagUnexpectedEof,
127 /// Unrecognized flag, e.g., `a`.
128 FlagUnrecognized,
129 /// A duplicate capture name was found.
130 GroupNameDuplicate {
131 /// The position of the initial occurrence of the capture name. The
132 /// error position itself points to the duplicate occurrence.
133 original: Span,
134 },
135 /// A capture group name is empty, e.g., `(?P<>abc)`.
136 GroupNameEmpty,
137 /// An invalid character was seen for a capture group name. This includes
138 /// errors where the first character is a digit (even though subsequent
139 /// characters are allowed to be digits).
140 GroupNameInvalid,
141 /// A closing `>` could not be found for a capture group name.
142 GroupNameUnexpectedEof,
143 /// An unclosed group, e.g., `(ab`.
144 ///
145 /// The span of this error corresponds to the unclosed parenthesis.
146 GroupUnclosed,
147 /// An unopened group, e.g., `ab)`.
148 GroupUnopened,
149 /// The nest limit was exceeded. The limit stored here is the limit
150 /// configured in the parser.
151 NestLimitExceeded(u32),
152 /// The range provided in a counted repetition operator is invalid. The
153 /// range is invalid if the start is greater than the end.
154 RepetitionCountInvalid,
155 /// An opening `{` was not followed by a valid decimal value.
156 /// For example, `x{}` or `x{]}` would fail.
157 RepetitionCountDecimalEmpty,
158 /// An opening `{` was found with no corresponding closing `}`.
159 RepetitionCountUnclosed,
160 /// A repetition operator was applied to a missing sub-expression. This
161 /// occurs, for example, in the regex consisting of just a `*` or even
162 /// `(?i)*`. It is, however, possible to create a repetition operating on
163 /// an empty sub-expression. For example, `()*` is still considered valid.
164 RepetitionMissing,
165 /// The special word boundary syntax, `\b{something}`, was used, but
166 /// either EOF without `}` was seen, or an invalid character in the
167 /// braces was seen.
168 SpecialWordBoundaryUnclosed,
169 /// The special word boundary syntax, `\b{something}`, was used, but
170 /// `something` was not recognized as a valid word boundary kind.
171 SpecialWordBoundaryUnrecognized,
172 /// The syntax `\b{` was observed, but afterwards the end of the pattern
173 /// was observed without being able to tell whether it was meant to be a
174 /// bounded repetition on the `\b` or the beginning of a special word
175 /// boundary assertion.
176 SpecialWordOrRepetitionUnexpectedEof,
177 /// The Unicode class is not valid. This typically occurs when a `\p` is
178 /// followed by something other than a `{`.
179 UnicodeClassInvalid,
180 /// When octal support is disabled, this error is produced when an octal
181 /// escape is used. The octal escape is assumed to be an invocation of
182 /// a backreference, which is the common case.
183 UnsupportedBackreference,
184 /// When syntax similar to PCRE's look-around is used, this error is
185 /// returned. Some example syntaxes that are rejected include, but are
186 /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
187 /// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
188 /// error is used to improve the user experience.
189 UnsupportedLookAround,
190}
191
192#[cfg(feature = "std")]
193impl std::error::Error for Error {}
194
195impl core::fmt::Display for Error {
196 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
197 crate::error::Formatter::from(self).fmt(f)
198 }
199}
200
201impl core::fmt::Display for ErrorKind {
202 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
203 use self::ErrorKind::*;
204 match *self {
205 CaptureLimitExceeded => write!(
206 f,
207 "exceeded the maximum number of \
208 capturing groups ({})",
209 u32::MAX
210 ),
211 ClassEscapeInvalid => {
212 write!(f, "invalid escape sequence found in character class")
213 }
214 ClassRangeInvalid => write!(
215 f,
216 "invalid character class range, \
217 the start must be <= the end"
218 ),
219 ClassRangeLiteral => {
220 write!(f, "invalid range boundary, must be a literal")
221 }
222 ClassUnclosed => write!(f, "unclosed character class"),
223 DecimalEmpty => write!(f, "decimal literal empty"),
224 DecimalInvalid => write!(f, "decimal literal invalid"),
225 EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
226 EscapeHexInvalid => {
227 write!(f, "hexadecimal literal is not a Unicode scalar value")
228 }
229 EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
230 EscapeUnexpectedEof => write!(
231 f,
232 "incomplete escape sequence, \
233 reached end of pattern prematurely"
234 ),
235 EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
236 FlagDanglingNegation => {
237 write!(f, "dangling flag negation operator")
238 }
239 FlagDuplicate { .. } => write!(f, "duplicate flag"),
240 FlagRepeatedNegation { .. } => {
241 write!(f, "flag negation operator repeated")
242 }
243 FlagUnexpectedEof => {
244 write!(f, "expected flag but got end of regex")
245 }
246 FlagUnrecognized => write!(f, "unrecognized flag"),
247 GroupNameDuplicate { .. } => {
248 write!(f, "duplicate capture group name")
249 }
250 GroupNameEmpty => write!(f, "empty capture group name"),
251 GroupNameInvalid => write!(f, "invalid capture group character"),
252 GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
253 GroupUnclosed => write!(f, "unclosed group"),
254 GroupUnopened => write!(f, "unopened group"),
255 NestLimitExceeded(limit) => write!(
256 f,
257 "exceed the maximum number of \
258 nested parentheses/brackets ({})",
259 limit
260 ),
261 RepetitionCountInvalid => write!(
262 f,
263 "invalid repetition count range, \
264 the start must be <= the end"
265 ),
266 RepetitionCountDecimalEmpty => {
267 write!(f, "repetition quantifier expects a valid decimal")
268 }
269 RepetitionCountUnclosed => {
270 write!(f, "unclosed counted repetition")
271 }
272 RepetitionMissing => {
273 write!(f, "repetition operator missing expression")
274 }
275 SpecialWordBoundaryUnclosed => {
276 write!(
277 f,
278 "special word boundary assertion is either \
279 unclosed or contains an invalid character",
280 )
281 }
282 SpecialWordBoundaryUnrecognized => {
283 write!(
284 f,
285 "unrecognized special word boundary assertion, \
286 valid choices are: start, end, start-half \
287 or end-half",
288 )
289 }
290 SpecialWordOrRepetitionUnexpectedEof => {
291 write!(
292 f,
293 "found either the beginning of a special word \
294 boundary or a bounded repetition on a \\b with \
295 an opening brace, but no closing brace",
296 )
297 }
298 UnicodeClassInvalid => {
299 write!(f, "invalid Unicode character class")
300 }
301 UnsupportedBackreference => {
302 write!(f, "backreferences are not supported")
303 }
304 UnsupportedLookAround => write!(
305 f,
306 "look-around, including look-ahead and look-behind, \
307 is not supported"
308 ),
309 }
310 }
311}
312
/// Span represents the position information of a single AST item.
///
/// Every span position is an absolute byte offset that can be used on the
/// original regular expression that was parsed.
#[derive(Clone, Copy, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Span {
    /// The start byte offset.
    pub start: Position,
    /// The end byte offset.
    pub end: Position,
}

impl core::fmt::Debug for Span {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let Span { start, end } = self;
        write!(f, "Span({:?}, {:?})", start, end)
    }
}

impl Ord for Span {
    /// Orders spans by their start position, and then by their end
    /// position when the starts compare equal.
    fn cmp(&self, other: &Span) -> Ordering {
        self.start
            .cmp(&other.start)
            .then_with(|| self.end.cmp(&other.end))
    }
}

impl PartialOrd for Span {
    fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

/// A single position in a regular expression.
///
/// A position encodes one half of a span, and includes the byte offset,
/// line number and column number.
#[derive(Clone, Copy, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Position {
    /// The absolute offset of this position, starting at `0` from the
    /// beginning of the regular expression pattern string.
    pub offset: usize,
    /// The line number, starting at `1`.
    pub line: usize,
    /// The approximate column number, starting at `1`.
    pub column: usize,
}

impl core::fmt::Debug for Position {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let Position { offset, line, column } = self;
        write!(
            f,
            "Position(o: {:?}, l: {:?}, c: {:?})",
            offset, line, column
        )
    }
}

impl Ord for Position {
    /// Orders positions by their byte offset only; the line and column
    /// numbers do not take part in the comparison.
    fn cmp(&self, other: &Position) -> Ordering {
        Ord::cmp(&self.offset, &other.offset)
    }
}

impl PartialOrd for Position {
    fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Span {
    /// Create a new span from the given start and end positions.
    pub fn new(start: Position, end: Position) -> Span {
        Span { start, end }
    }

    /// Create a new span whose start and end are both the given position.
    pub fn splat(pos: Position) -> Span {
        Span { start: pos, end: pos }
    }

    /// Create a new span by replacing the starting position with the one
    /// given.
    pub fn with_start(self, pos: Position) -> Span {
        Span::new(pos, self.end)
    }

    /// Create a new span by replacing the ending position with the one
    /// given.
    pub fn with_end(self, pos: Position) -> Span {
        Span::new(self.start, pos)
    }

    /// Returns true if and only if this span occurs on a single line.
    pub fn is_one_line(&self) -> bool {
        self.end.line == self.start.line
    }

    /// Returns true if and only if this span is empty. That is, it points
    /// to a single position in the concrete syntax of a regular expression.
    pub fn is_empty(&self) -> bool {
        self.end.offset == self.start.offset
    }
}

impl Position {
    /// Create a new position with the given information.
    ///
    /// `offset` is the absolute offset of the position, starting at `0`
    /// from the beginning of the regular expression pattern string.
    ///
    /// `line` is the line number, starting at `1`.
    ///
    /// `column` is the approximate column number, starting at `1`.
    pub fn new(offset: usize, line: usize, column: usize) -> Position {
        Position { offset, line, column }
    }
}
430
431/// An abstract syntax tree for a singular expression along with comments
432/// found.
433///
434/// Comments are not stored in the tree itself to avoid complexity. Each
435/// comment contains a span of precisely where it occurred in the original
436/// regular expression.
437#[derive(Clone, Debug, Eq, PartialEq)]
438#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
439pub struct WithComments {
440 /// The actual ast.
441 pub ast: Ast,
442 /// All comments found in the original regular expression.
443 pub comments: Vec<Comment>,
444}
445
446/// A comment from a regular expression with an associated span.
447///
448/// A regular expression can only contain comments when the `x` flag is
449/// enabled.
450#[derive(Clone, Debug, Eq, PartialEq)]
451#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
452pub struct Comment {
453 /// The span of this comment, including the beginning `#` and ending `\n`.
454 pub span: Span,
455 /// The comment text, starting with the first character following the `#`
456 /// and ending with the last character preceding the `\n`.
457 pub comment: String,
458}
459
460/// An abstract syntax tree for a single regular expression.
461///
462/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
463/// space proportional to the size of the `Ast`.
464///
465/// This type defines its own destructor that uses constant stack space and
466/// heap space proportional to the size of the `Ast`.
467#[derive(Clone, Debug, Eq, PartialEq)]
468#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
469pub enum Ast {
470 /// An empty regex that matches everything.
471 Empty(Box<Span>),
472 /// A set of flags, e.g., `(?is)`.
473 Flags(Box<SetFlags>),
474 /// A single character literal, which includes escape sequences.
475 Literal(Box<Literal>),
476 /// The "any character" class.
477 Dot(Box<Span>),
478 /// A single zero-width assertion.
479 Assertion(Box<Assertion>),
480 /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`.
481 ClassUnicode(Box<ClassUnicode>),
482 /// A single perl character class, e.g., `\d` or `\W`.
483 ClassPerl(Box<ClassPerl>),
484 /// A single bracketed character class set, which may contain zero or more
485 /// character ranges and/or zero or more nested classes. e.g.,
486 /// `[a-zA-Z\pL]`.
487 ClassBracketed(Box<ClassBracketed>),
488 /// A repetition operator applied to an arbitrary regular expression.
489 Repetition(Box<Repetition>),
490 /// A grouped regular expression.
491 Group(Box<Group>),
492 /// An alternation of regular expressions.
493 Alternation(Box<Alternation>),
494 /// A concatenation of regular expressions.
495 Concat(Box<Concat>),
496}
497
498impl Ast {
499 /// Create an "empty" AST item.
500 pub fn empty(span: Span) -> Ast {
501 Ast::Empty(Box::new(span))
502 }
503
504 /// Create a "flags" AST item.
505 pub fn flags(e: SetFlags) -> Ast {
506 Ast::Flags(Box::new(e))
507 }
508
509 /// Create a "literal" AST item.
510 pub fn literal(e: Literal) -> Ast {
511 Ast::Literal(Box::new(e))
512 }
513
514 /// Create a "dot" AST item.
515 pub fn dot(span: Span) -> Ast {
516 Ast::Dot(Box::new(span))
517 }
518
519 /// Create a "assertion" AST item.
520 pub fn assertion(e: Assertion) -> Ast {
521 Ast::Assertion(Box::new(e))
522 }
523
524 /// Create a "Unicode class" AST item.
525 pub fn class_unicode(e: ClassUnicode) -> Ast {
526 Ast::ClassUnicode(Box::new(e))
527 }
528
529 /// Create a "Perl class" AST item.
530 pub fn class_perl(e: ClassPerl) -> Ast {
531 Ast::ClassPerl(Box::new(e))
532 }
533
534 /// Create a "bracketed class" AST item.
535 pub fn class_bracketed(e: ClassBracketed) -> Ast {
536 Ast::ClassBracketed(Box::new(e))
537 }
538
539 /// Create a "repetition" AST item.
540 pub fn repetition(e: Repetition) -> Ast {
541 Ast::Repetition(Box::new(e))
542 }
543
544 /// Create a "group" AST item.
545 pub fn group(e: Group) -> Ast {
546 Ast::Group(Box::new(e))
547 }
548
549 /// Create a "alternation" AST item.
550 pub fn alternation(e: Alternation) -> Ast {
551 Ast::Alternation(Box::new(e))
552 }
553
554 /// Create a "concat" AST item.
555 pub fn concat(e: Concat) -> Ast {
556 Ast::Concat(Box::new(e))
557 }
558
559 /// Return the span of this abstract syntax tree.
560 pub fn span(&self) -> &Span {
561 match *self {
562 Ast::Empty(ref span) => span,
563 Ast::Flags(ref x) => &x.span,
564 Ast::Literal(ref x) => &x.span,
565 Ast::Dot(ref span) => span,
566 Ast::Assertion(ref x) => &x.span,
567 Ast::ClassUnicode(ref x) => &x.span,
568 Ast::ClassPerl(ref x) => &x.span,
569 Ast::ClassBracketed(ref x) => &x.span,
570 Ast::Repetition(ref x) => &x.span,
571 Ast::Group(ref x) => &x.span,
572 Ast::Alternation(ref x) => &x.span,
573 Ast::Concat(ref x) => &x.span,
574 }
575 }
576
577 /// Return true if and only if this Ast is empty.
578 pub fn is_empty(&self) -> bool {
579 match *self {
580 Ast::Empty(_) => true,
581 _ => false,
582 }
583 }
584
585 /// Returns true if and only if this AST has any (including possibly empty)
586 /// subexpressions.
587 fn has_subexprs(&self) -> bool {
588 match *self {
589 Ast::Empty(_)
590 | Ast::Flags(_)
591 | Ast::Literal(_)
592 | Ast::Dot(_)
593 | Ast::Assertion(_)
594 | Ast::ClassUnicode(_)
595 | Ast::ClassPerl(_) => false,
596 Ast::ClassBracketed(_)
597 | Ast::Repetition(_)
598 | Ast::Group(_)
599 | Ast::Alternation(_)
600 | Ast::Concat(_) => true,
601 }
602 }
603}
604
605/// Print a display representation of this Ast.
606///
607/// This does not preserve any of the original whitespace formatting that may
608/// have originally been present in the concrete syntax from which this Ast
609/// was generated.
610///
611/// This implementation uses constant stack space and heap space proportional
612/// to the size of the `Ast`.
613impl core::fmt::Display for Ast {
614 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
615 use crate::ast::print::Printer;
616 Printer::new().print(self, f)
617 }
618}
619
620/// An alternation of regular expressions.
621#[derive(Clone, Debug, Eq, PartialEq)]
622#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
623pub struct Alternation {
624 /// The span of this alternation.
625 pub span: Span,
626 /// The alternate regular expressions.
627 pub asts: Vec<Ast>,
628}
629
630impl Alternation {
631 /// Return this alternation as an AST.
632 ///
633 /// If this alternation contains zero ASTs, then `Ast::empty` is returned.
634 /// If this alternation contains exactly 1 AST, then the corresponding AST
635 /// is returned. Otherwise, `Ast::alternation` is returned.
636 pub fn into_ast(mut self) -> Ast {
637 match self.asts.len() {
638 0 => Ast::empty(self.span),
639 1 => self.asts.pop().unwrap(),
640 _ => Ast::alternation(self),
641 }
642 }
643}
644
645/// A concatenation of regular expressions.
646#[derive(Clone, Debug, Eq, PartialEq)]
647#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
648pub struct Concat {
649 /// The span of this concatenation.
650 pub span: Span,
651 /// The concatenation regular expressions.
652 pub asts: Vec<Ast>,
653}
654
655impl Concat {
656 /// Return this concatenation as an AST.
657 ///
658 /// If this alternation contains zero ASTs, then `Ast::empty` is returned.
659 /// If this alternation contains exactly 1 AST, then the corresponding AST
660 /// is returned. Otherwise, `Ast::concat` is returned.
661 pub fn into_ast(mut self) -> Ast {
662 match self.asts.len() {
663 0 => Ast::empty(self.span),
664 1 => self.asts.pop().unwrap(),
665 _ => Ast::concat(self),
666 }
667 }
668}
669
670/// A single literal expression.
671///
672/// A literal corresponds to a single Unicode scalar value. Literals may be
673/// represented in their literal form, e.g., `a` or in their escaped form,
674/// e.g., `\x61`.
675#[derive(Clone, Debug, Eq, PartialEq)]
676#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
677pub struct Literal {
678 /// The span of this literal.
679 pub span: Span,
680 /// The kind of this literal.
681 pub kind: LiteralKind,
682 /// The Unicode scalar value corresponding to this literal.
683 pub c: char,
684}
685
686impl Literal {
687 /// If this literal was written as a `\x` hex escape, then this returns
688 /// the corresponding byte value. Otherwise, this returns `None`.
689 pub fn byte(&self) -> Option<u8> {
690 match self.kind {
691 LiteralKind::HexFixed(HexLiteralKind::X) => {
692 u8::try_from(self.c).ok()
693 }
694 _ => None,
695 }
696 }
697}
698
699/// The kind of a single literal expression.
700#[derive(Clone, Debug, Eq, PartialEq)]
701#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
702pub enum LiteralKind {
703 /// The literal is written verbatim, e.g., `a` or `☃`.
704 Verbatim,
705 /// The literal is written as an escape because it is otherwise a special
706 /// regex meta character, e.g., `\*` or `\[`.
707 Meta,
708 /// The literal is written as an escape despite the fact that the escape is
709 /// unnecessary, e.g., `\%` or `\/`.
710 Superfluous,
711 /// The literal is written as an octal escape, e.g., `\141`.
712 Octal,
713 /// The literal is written as a hex code with a fixed number of digits
714 /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
715 /// `\U00000061`.
716 HexFixed(HexLiteralKind),
717 /// The literal is written as a hex code with a bracketed number of
718 /// digits. The only restriction is that the bracketed hex code must refer
719 /// to a valid Unicode scalar value.
720 HexBrace(HexLiteralKind),
721 /// The literal is written as a specially recognized escape, e.g., `\f`
722 /// or `\n`.
723 Special(SpecialLiteralKind),
724}
725
/// The type of a special literal.
///
/// A special literal is an escape sequence that the regex parser recognizes
/// specially, e.g., `\f` or `\n`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum SpecialLiteralKind {
    /// Bell, spelled `\a` (`\x07`).
    Bell,
    /// Form feed, spelled `\f` (`\x0C`).
    FormFeed,
    /// Tab, spelled `\t` (`\x09`).
    Tab,
    /// Line feed, spelled `\n` (`\x0A`).
    LineFeed,
    /// Carriage return, spelled `\r` (`\x0D`).
    CarriageReturn,
    /// Vertical tab, spelled `\v` (`\x0B`).
    VerticalTab,
    /// Space, spelled `\ ` (`\x20`). This can only appear when parsing in
    /// verbose mode.
    Space,
}
749
/// The type of a Unicode hex literal.
///
/// All variants behave the same when used with brackets. They differ only
/// when used without brackets, in the number of hex digits that must
/// follow.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum HexLiteralKind {
    /// A `\x` prefix. When used without brackets, this form is limited to
    /// two digits.
    X,
    /// A `\u` prefix. When used without brackets, this form is limited to
    /// four digits.
    UnicodeShort,
    /// A `\U` prefix. When used without brackets, this form is limited to
    /// eight digits.
    UnicodeLong,
}

impl HexLiteralKind {
    /// The number of digits that must be used with this literal form when
    /// it is used without brackets. When used with brackets, there is no
    /// restriction on the number of digits.
    pub fn digits(&self) -> u32 {
        match self {
            Self::X => 2,
            Self::UnicodeShort => 4,
            Self::UnicodeLong => 8,
        }
    }
}
781
782/// A Perl character class.
783#[derive(Clone, Debug, Eq, PartialEq)]
784#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
785pub struct ClassPerl {
786 /// The span of this class.
787 pub span: Span,
788 /// The kind of Perl class.
789 pub kind: ClassPerlKind,
790 /// Whether the class is negated or not. e.g., `\d` is not negated but
791 /// `\D` is.
792 pub negated: bool,
793}
794
/// The available Perl character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassPerlKind {
    /// Decimal numbers.
    Digit,
    /// Whitespace.
    Space,
    /// Word characters.
    Word,
}
806
807/// An ASCII character class.
808#[derive(Clone, Debug, Eq, PartialEq)]
809#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
810pub struct ClassAscii {
811 /// The span of this class.
812 pub span: Span,
813 /// The kind of ASCII class.
814 pub kind: ClassAsciiKind,
815 /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
816 /// but `[[:^alpha:]]` is.
817 pub negated: bool,
818}
819
/// The available ASCII character classes.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassAsciiKind {
    /// `[0-9A-Za-z]`
    Alnum,
    /// `[A-Za-z]`
    Alpha,
    /// `[\x00-\x7F]`
    Ascii,
    /// `[ \t]`
    Blank,
    /// `[\x00-\x1F\x7F]`
    Cntrl,
    /// `[0-9]`
    Digit,
    /// `[!-~]`
    Graph,
    /// `[a-z]`
    Lower,
    /// `[ -~]`
    Print,
    /// ``[!-/:-@\[-`{-~]``
    Punct,
    /// `[\t\n\v\f\r ]`
    Space,
    /// `[A-Z]`
    Upper,
    /// `[0-9A-Za-z_]`
    Word,
    /// `[0-9A-Fa-f]`
    Xdigit,
}

impl ClassAsciiKind {
    /// Look up the `ClassAsciiKind` variant that corresponds to `name`.
    ///
    /// `name` is expected to be the lowercase version of the variant name,
    /// e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. The match is
    /// exact.
    ///
    /// `None` is returned when no variant has the given name.
    pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
        let kind = match name {
            "alnum" => ClassAsciiKind::Alnum,
            "alpha" => ClassAsciiKind::Alpha,
            "ascii" => ClassAsciiKind::Ascii,
            "blank" => ClassAsciiKind::Blank,
            "cntrl" => ClassAsciiKind::Cntrl,
            "digit" => ClassAsciiKind::Digit,
            "graph" => ClassAsciiKind::Graph,
            "lower" => ClassAsciiKind::Lower,
            "print" => ClassAsciiKind::Print,
            "punct" => ClassAsciiKind::Punct,
            "space" => ClassAsciiKind::Space,
            "upper" => ClassAsciiKind::Upper,
            "word" => ClassAsciiKind::Word,
            "xdigit" => ClassAsciiKind::Xdigit,
            _ => return None,
        };
        Some(kind)
    }
}
883
884/// A Unicode character class.
885#[derive(Clone, Debug, Eq, PartialEq)]
886#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
887pub struct ClassUnicode {
888 /// The span of this class.
889 pub span: Span,
890 /// Whether this class is negated or not.
891 ///
892 /// Note: be careful when using this attribute. This specifically refers
893 /// to whether the class is written as `\p` or `\P`, where the latter
894 /// is `negated = true`. However, it also possible to write something like
895 /// `\P{scx!=Katakana}` which is actually equivalent to
896 /// `\p{scx=Katakana}` and is therefore not actually negated even though
897 /// `negated = true` here. To test whether this class is truly negated
898 /// or not, use the `is_negated` method.
899 pub negated: bool,
900 /// The kind of Unicode class.
901 pub kind: ClassUnicodeKind,
902}
903
904impl ClassUnicode {
905 /// Returns true if this class has been negated.
906 ///
907 /// Note that this takes the Unicode op into account, if it's present.
908 /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
909 pub fn is_negated(&self) -> bool {
910 match self.kind {
911 ClassUnicodeKind::NamedValue {
912 op: ClassUnicodeOpKind::NotEqual,
913 ..
914 } => !self.negated,
915 _ => self.negated,
916 }
917 }
918}
919
920/// The available forms of Unicode character classes.
921#[derive(Clone, Debug, Eq, PartialEq)]
922pub enum ClassUnicodeKind {
923 /// A one letter abbreviated class, e.g., `\pN`.
924 OneLetter(char),
925 /// A binary property, general category or script. The string may be
926 /// empty.
927 Named(String),
928 /// A property name and an associated value.
929 NamedValue {
930 /// The type of Unicode op used to associate `name` with `value`.
931 op: ClassUnicodeOpKind,
932 /// The property name (which may be empty).
933 name: String,
934 /// The property value (which may be empty).
935 value: String,
936 },
937}
938
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for ClassUnicodeKind {
    /// Generates an arbitrary `ClassUnicodeKind`.
    ///
    /// When at least one of the `unicode-*` table features is enabled, the
    /// names and values are drawn from the crate's `PROPERTY_NAMES` and
    /// `PROPERTY_VALUES` tables instead of being arbitrary strings. When
    /// none of those features is enabled, every field is generated directly
    /// from `u`.
    ///
    /// In both configurations the variant is picked first via
    /// `choose_index(3)`.
    ///
    /// # Errors
    ///
    /// Any error reported by `u` while choosing an index or generating a
    /// field is propagated.
    fn arbitrary(
        u: &mut arbitrary::Unstructured,
    ) -> arbitrary::Result<ClassUnicodeKind> {
        #[cfg(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        ))]
        {
            use alloc::string::ToString;

            use super::unicode_tables::{
                property_names::PROPERTY_NAMES,
                property_values::PROPERTY_VALUES,
            };

            match u.choose_index(3)? {
                0 => {
                    // Number of property value names that are exactly one
                    // byte long.
                    let all = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| e.1.iter())
                        .filter(|(name, _)| name.len() == 1)
                        .count();
                    let idx = u.choose_index(all)?;
                    let value = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| e.1.iter())
                        // The selection must walk the same filtered
                        // sequence that `all` was counted over. Without
                        // this filter, `idx` would index into the
                        // unfiltered list and could pick the first
                        // character of a multi-character name.
                        .filter(|(name, _)| name.len() == 1)
                        .nth(idx)
                        .expect("idx is below the one-letter name count")
                        .0
                        .chars()
                        .next()
                        .expect("a name of length one has a first char");
                    Ok(ClassUnicodeKind::OneLetter(value))
                }
                1 => {
                    // Every property value followed by every property
                    // name; `all` is the length of that chained sequence.
                    let all = PROPERTY_VALUES
                        .iter()
                        .map(|e| e.1.len())
                        .sum::<usize>()
                        + PROPERTY_NAMES.len();
                    let idx = u.choose_index(all)?;
                    let name = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| e.1.iter())
                        .chain(PROPERTY_NAMES)
                        .map(|(_, e)| e)
                        .nth(idx)
                        .expect("idx is below the chained sequence length");
                    Ok(ClassUnicodeKind::Named(name.to_string()))
                }
                2 => {
                    let all = PROPERTY_VALUES
                        .iter()
                        .map(|e| e.1.len())
                        .sum::<usize>();
                    let idx = u.choose_index(all)?;
                    // Pair each value with the property it belongs to.
                    let (prop, value) = PROPERTY_VALUES
                        .iter()
                        .flat_map(|e| {
                            e.1.iter().map(|(_, value)| (e.0, value))
                        })
                        .nth(idx)
                        .expect("idx is below the number of values");
                    Ok(ClassUnicodeKind::NamedValue {
                        op: u.arbitrary()?,
                        name: prop.to_string(),
                        value: value.to_string(),
                    })
                }
                _ => unreachable!("index chosen is impossible"),
            }
        }
        #[cfg(not(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        )))]
        {
            match u.choose_index(3)? {
                0 => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)),
                1 => Ok(ClassUnicodeKind::Named(u.arbitrary()?)),
                2 => Ok(ClassUnicodeKind::NamedValue {
                    op: u.arbitrary()?,
                    name: u.arbitrary()?,
                    value: u.arbitrary()?,
                }),
                _ => unreachable!("index chosen is impossible"),
            }
        }
    }

    /// Returns the size hint matching the configuration-specific strategy
    /// used by `arbitrary`.
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
        #[cfg(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        ))]
        {
            // Two index choices (variant, then table entry), plus an op
            // that is only generated for the `NamedValue` variant.
            arbitrary::size_hint::and_all(&[
                usize::size_hint(depth),
                usize::size_hint(depth),
                arbitrary::size_hint::or(
                    (0, Some(0)),
                    ClassUnicodeOpKind::size_hint(depth),
                ),
            ])
        }
        #[cfg(not(any(
            feature = "unicode-age",
            feature = "unicode-bool",
            feature = "unicode-gencat",
            feature = "unicode-perl",
            feature = "unicode-script",
            feature = "unicode-segment",
        )))]
        {
            // One index choice for the variant, then whichever payload the
            // chosen variant carries.
            arbitrary::size_hint::and(
                usize::size_hint(depth),
                arbitrary::size_hint::or_all(&[
                    char::size_hint(depth),
                    String::size_hint(depth),
                    arbitrary::size_hint::and_all(&[
                        String::size_hint(depth),
                        String::size_hint(depth),
                        ClassUnicodeOpKind::size_hint(depth),
                    ]),
                ]),
            )
        }
    }
}
1085
/// The operator that ties a property name to a value inside a Unicode
/// character class.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassUnicodeOpKind {
    /// A property set to a specific value, e.g., `\p{scx=Katakana}`.
    Equal,
    /// A property set to a specific value using a colon, e.g.,
    /// `\p{scx:Katakana}`.
    Colon,
    /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
    NotEqual,
}

impl ClassUnicodeOpKind {
    /// Returns true for the two equality spellings (`=` and `:`) and false
    /// for every other op.
    pub fn is_equal(&self) -> bool {
        matches!(self, Self::Equal | Self::Colon)
    }
}
1108
/// A bracketed character class, e.g., `[a-z0-9]`.
///
/// The contents between the brackets are described by `kind`; the leading
/// `^`, if any, is recorded separately in `negated`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassBracketed {
    /// The span of this class.
    pub span: Span,
    /// Whether this class is negated or not. e.g., `[a]` is not negated but
    /// `[^a]` is.
    pub negated: bool,
    /// The type of this set. A set is either a normal union of things, e.g.,
    /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
    pub kind: ClassSet,
}
1122
1123/// A character class set.
1124///
1125/// This type corresponds to the internal structure of a bracketed character
1126/// class. That is, every bracketed character is one of two types: a union of
1127/// items (literals, ranges, other bracketed classes) or a tree of binary set
1128/// operations.
1129#[derive(Clone, Debug, Eq, PartialEq)]
1130#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1131pub enum ClassSet {
1132 /// An item, which can be a single literal, range, nested character class
1133 /// or a union of items.
1134 Item(ClassSetItem),
1135 /// A single binary operation (i.e., &&, -- or ~~).
1136 BinaryOp(ClassSetBinaryOp),
1137}
1138
1139impl ClassSet {
1140 /// Build a set from a union.
1141 pub fn union(ast: ClassSetUnion) -> ClassSet {
1142 ClassSet::Item(ClassSetItem::Union(ast))
1143 }
1144
1145 /// Return the span of this character class set.
1146 pub fn span(&self) -> &Span {
1147 match *self {
1148 ClassSet::Item(ref x) => x.span(),
1149 ClassSet::BinaryOp(ref x) => &x.span,
1150 }
1151 }
1152
1153 /// Return true if and only if this class set is empty.
1154 fn is_empty(&self) -> bool {
1155 match *self {
1156 ClassSet::Item(ClassSetItem::Empty(_)) => true,
1157 _ => false,
1158 }
1159 }
1160}
1161
1162/// A single component of a character class set.
1163#[derive(Clone, Debug, Eq, PartialEq)]
1164#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1165pub enum ClassSetItem {
1166 /// An empty item.
1167 ///
1168 /// Note that a bracketed character class cannot contain a single empty
1169 /// item. Empty items can appear when using one of the binary operators.
1170 /// For example, `[&&]` is the intersection of two empty classes.
1171 Empty(Span),
1172 /// A single literal.
1173 Literal(Literal),
1174 /// A range between two literals.
1175 Range(ClassSetRange),
1176 /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
1177 Ascii(ClassAscii),
1178 /// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
1179 Unicode(ClassUnicode),
1180 /// A perl character class, e.g., `\d` or `\W`.
1181 Perl(ClassPerl),
1182 /// A bracketed character class set, which may contain zero or more
1183 /// character ranges and/or zero or more nested classes. e.g.,
1184 /// `[a-zA-Z\pL]`.
1185 Bracketed(Box<ClassBracketed>),
1186 /// A union of items.
1187 Union(ClassSetUnion),
1188}
1189
1190impl ClassSetItem {
1191 /// Return the span of this character class set item.
1192 pub fn span(&self) -> &Span {
1193 match *self {
1194 ClassSetItem::Empty(ref span) => span,
1195 ClassSetItem::Literal(ref x) => &x.span,
1196 ClassSetItem::Range(ref x) => &x.span,
1197 ClassSetItem::Ascii(ref x) => &x.span,
1198 ClassSetItem::Perl(ref x) => &x.span,
1199 ClassSetItem::Unicode(ref x) => &x.span,
1200 ClassSetItem::Bracketed(ref x) => &x.span,
1201 ClassSetItem::Union(ref x) => &x.span,
1202 }
1203 }
1204}
1205
1206/// A single character class range in a set.
1207#[derive(Clone, Debug, Eq, PartialEq)]
1208#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1209pub struct ClassSetRange {
1210 /// The span of this range.
1211 pub span: Span,
1212 /// The start of this range.
1213 pub start: Literal,
1214 /// The end of this range.
1215 pub end: Literal,
1216}
1217
1218impl ClassSetRange {
1219 /// Returns true if and only if this character class range is valid.
1220 ///
1221 /// The only case where a range is invalid is if its start is greater than
1222 /// its end.
1223 pub fn is_valid(&self) -> bool {
1224 self.start.c <= self.end.c
1225 }
1226}
1227
1228/// A union of items inside a character class set.
1229#[derive(Clone, Debug, Eq, PartialEq)]
1230#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1231pub struct ClassSetUnion {
1232 /// The span of the items in this operation. e.g., the `a-z0-9` in
1233 /// `[^a-z0-9]`
1234 pub span: Span,
1235 /// The sequence of items that make up this union.
1236 pub items: Vec<ClassSetItem>,
1237}
1238
1239impl ClassSetUnion {
1240 /// Push a new item in this union.
1241 ///
1242 /// The ending position of this union's span is updated to the ending
1243 /// position of the span of the item given. If the union is empty, then
1244 /// the starting position of this union is set to the starting position
1245 /// of this item.
1246 ///
1247 /// In other words, if you only use this method to add items to a union
1248 /// and you set the spans on each item correctly, then you should never
1249 /// need to adjust the span of the union directly.
1250 pub fn push(&mut self, item: ClassSetItem) {
1251 if self.items.is_empty() {
1252 self.span.start = item.span().start;
1253 }
1254 self.span.end = item.span().end;
1255 self.items.push(item);
1256 }
1257
1258 /// Return this union as a character class set item.
1259 ///
1260 /// If this union contains zero items, then an empty union is
1261 /// returned. If this concatenation contains exactly 1 item, then the
1262 /// corresponding item is returned. Otherwise, ClassSetItem::Union is
1263 /// returned.
1264 pub fn into_item(mut self) -> ClassSetItem {
1265 match self.items.len() {
1266 0 => ClassSetItem::Empty(self.span),
1267 1 => self.items.pop().unwrap(),
1268 _ => ClassSetItem::Union(self),
1269 }
1270 }
1271}
1272
/// A Unicode character class set operation.
///
/// This is one node in a tree of binary set operations: it pairs an
/// operator kind with a boxed left and right operand.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct ClassSetBinaryOp {
    /// The span of this operation. e.g., the `a-z--h-p` in `[a-z--h-p]`.
    pub span: Span,
    /// The type of this set operation.
    pub kind: ClassSetBinaryOpKind,
    /// The left hand side of the operation.
    pub lhs: Box<ClassSet>,
    /// The right hand side of the operation.
    pub rhs: Box<ClassSet>,
}
1286
/// The type of a Unicode character class set operation.
///
/// Note that this doesn't explicitly represent union since there is no
/// explicit union operator. Concatenation inside a character class
/// corresponds to the union operation.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum ClassSetBinaryOpKind {
    /// The intersection of two sets, e.g., `\pN&&[a-z]`.
    Intersection,
    /// The difference of two sets, e.g., `\pN--[0-9]`.
    Difference,
    /// The symmetric difference of two sets. The symmetric difference is the
    /// set of elements belonging to one but not both sets.
    /// e.g., `[\pL~~[:ascii:]]`.
    SymmetricDifference,
}
1304
/// A single zero-width assertion.
///
/// The specific assertion (anchor, word boundary, ...) is given by `kind`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Assertion {
    /// The span of this assertion.
    pub span: Span,
    /// The assertion kind, e.g., `\b` or `^`.
    pub kind: AssertionKind,
}
1314
/// An assertion kind.
///
/// Each variant is documented with the pattern syntax it corresponds to.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum AssertionKind {
    /// `^`
    StartLine,
    /// `$`
    EndLine,
    /// `\A`
    StartText,
    /// `\z`
    EndText,
    /// `\b`
    WordBoundary,
    /// `\B`
    NotWordBoundary,
    /// `\b{start}`
    WordBoundaryStart,
    /// `\b{end}`
    WordBoundaryEnd,
    /// `\<` (alias for `\b{start}`)
    WordBoundaryStartAngle,
    /// `\>` (alias for `\b{end}`)
    WordBoundaryEndAngle,
    /// `\b{start-half}`
    WordBoundaryStartHalf,
    /// `\b{end-half}`
    WordBoundaryEndHalf,
}
1344
/// A repetition operation applied to a regular expression.
///
/// This ties together the operator, its greediness and the sub-expression
/// being repeated.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct Repetition {
    /// The span of this operation.
    pub span: Span,
    /// The actual operation.
    pub op: RepetitionOp,
    /// Whether this operation was applied greedily or not.
    pub greedy: bool,
    /// The regular expression under repetition.
    pub ast: Box<Ast>,
}
1358
/// The repetition operator itself, without the expression it applies to.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct RepetitionOp {
    /// The span of this operator. This includes things like `+`, `*?` and
    /// `{m,n}`.
    pub span: Span,
    /// The type of operation.
    pub kind: RepetitionKind,
}
1369
/// The kind of a repetition operator.
///
/// The three single-character operators are unit variants; counted
/// repetitions carry their bounds in a `RepetitionRange`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum RepetitionKind {
    /// `?`
    ZeroOrOne,
    /// `*`
    ZeroOrMore,
    /// `+`
    OneOrMore,
    /// `{m,n}`
    Range(RepetitionRange),
}
1383
/// The bounds of a counted repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum RepetitionRange {
    /// `{m}`
    Exactly(u32),
    /// `{m,}`
    AtLeast(u32),
    /// `{m,n}`
    Bounded(u32, u32),
}

impl RepetitionRange {
    /// Reports whether this repetition range is well formed.
    ///
    /// Only a bounded range can be invalid, and only when its lower bound
    /// exceeds its upper bound. `Exactly` and `AtLeast` are always valid.
    pub fn is_valid(&self) -> bool {
        if let RepetitionRange::Bounded(min, max) = *self {
            min <= max
        } else {
            true
        }
    }
}
1408
1409/// A grouped regular expression.
1410///
1411/// This includes both capturing and non-capturing groups. This does **not**
1412/// include flag-only groups like `(?is)`, but does contain any group that
1413/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1414/// `(?is:a)`.
1415#[derive(Clone, Debug, Eq, PartialEq)]
1416#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1417pub struct Group {
1418 /// The span of this group.
1419 pub span: Span,
1420 /// The kind of this group.
1421 pub kind: GroupKind,
1422 /// The regular expression in this group.
1423 pub ast: Box<Ast>,
1424}
1425
1426impl Group {
1427 /// If this group is non-capturing, then this returns the (possibly empty)
1428 /// set of flags. Otherwise, `None` is returned.
1429 pub fn flags(&self) -> Option<&Flags> {
1430 match self.kind {
1431 GroupKind::NonCapturing(ref flags) => Some(flags),
1432 _ => None,
1433 }
1434 }
1435
1436 /// Returns true if and only if this group is capturing.
1437 pub fn is_capturing(&self) -> bool {
1438 match self.kind {
1439 GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true,
1440 GroupKind::NonCapturing(_) => false,
1441 }
1442 }
1443
1444 /// Returns the capture index of this group, if this is a capturing group.
1445 ///
1446 /// This returns a capture index precisely when `is_capturing` is `true`.
1447 pub fn capture_index(&self) -> Option<u32> {
1448 match self.kind {
1449 GroupKind::CaptureIndex(i) => Some(i),
1450 GroupKind::CaptureName { ref name, .. } => Some(name.index),
1451 GroupKind::NonCapturing(_) => None,
1452 }
1453 }
1454}
1455
/// The kind of a group.
///
/// The two capturing variants carry a capture index (directly, or inside
/// the `CaptureName`); the non-capturing variant carries its flags.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum GroupKind {
    /// `(a)`
    CaptureIndex(u32),
    /// `(?<name>a)` or `(?P<name>a)`
    CaptureName {
        /// True if the `?P<` syntax is used and false if the `?<` syntax is
        /// used.
        starts_with_p: bool,
        /// The capture name.
        name: CaptureName,
    },
    /// `(?:a)` and `(?i:a)`
    NonCapturing(Flags),
}
1472
/// A capture name.
///
/// This corresponds to the name itself between the angle brackets in, e.g.,
/// `(?P<foo>expr)`.
///
/// Note that `arbitrary::Arbitrary` is not derived for this type; it has a
/// hand-written implementation instead.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CaptureName {
    /// The span of this capture name.
    pub span: Span,
    /// The capture name.
    pub name: String,
    /// The capture index.
    pub index: u32,
}
1486
#[cfg(feature = "arbitrary")]
impl arbitrary::Arbitrary<'_> for CaptureName {
    /// Generates a capture name made only of the letters `a` through `z`.
    ///
    /// A length is drawn first; a zero length is rejected with
    /// `NotEnoughData`, so the name is never empty. Each arbitrary `char`
    /// is then folded onto a lowercase ASCII letter via its code point
    /// modulo 26. The span and the index are generated last, in that order.
    fn arbitrary(
        u: &mut arbitrary::Unstructured,
    ) -> arbitrary::Result<CaptureName> {
        let len = match u.arbitrary_len::<char>()? {
            0 => return Err(arbitrary::Error::NotEnoughData),
            n => n,
        };
        let mut name = String::new();
        for _ in 0..len {
            let code_point = u32::from(u.arbitrary::<char>()?);
            // `code_point % 26` is always below 26, so it fits in a `u8`.
            let offset = u8::try_from(code_point % 26).unwrap();
            name.push(char::from(b'a' + offset));
        }
        let span = u.arbitrary()?;
        let index = u.arbitrary()?;
        Ok(CaptureName { span, name, index })
    }

    /// Combines the hints for the span, the length draw and the index.
    fn size_hint(depth: usize) -> (usize, Option<usize>) {
        let parts = [
            Span::size_hint(depth),
            usize::size_hint(depth),
            u32::size_hint(depth),
        ];
        arbitrary::size_hint::and_all(&parts)
    }
}
1515
/// A group of flags that is not applied to a particular regular expression.
///
/// The flags themselves live in `flags`; `span` additionally covers the
/// surrounding parentheses.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct SetFlags {
    /// The span of these flags, including the grouping parentheses.
    pub span: Span,
    /// The actual sequence of flags.
    pub flags: Flags,
}
1525
1526/// A group of flags.
1527///
1528/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1529#[derive(Clone, Debug, Eq, PartialEq)]
1530#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1531pub struct Flags {
1532 /// The span of this group of flags.
1533 pub span: Span,
1534 /// A sequence of flag items. Each item is either a flag or a negation
1535 /// operator.
1536 pub items: Vec<FlagsItem>,
1537}
1538
1539impl Flags {
1540 /// Add the given item to this sequence of flags.
1541 ///
1542 /// If the item was added successfully, then `None` is returned. If the
1543 /// given item is a duplicate, then `Some(i)` is returned, where
1544 /// `items[i].kind == item.kind`.
1545 pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1546 for (i, x) in self.items.iter().enumerate() {
1547 if x.kind == item.kind {
1548 return Some(i);
1549 }
1550 }
1551 self.items.push(item);
1552 None
1553 }
1554
1555 /// Returns the state of the given flag in this set.
1556 ///
1557 /// If the given flag is in the set but is negated, then `Some(false)` is
1558 /// returned.
1559 ///
1560 /// If the given flag is in the set and is not negated, then `Some(true)`
1561 /// is returned.
1562 ///
1563 /// Otherwise, `None` is returned.
1564 pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1565 let mut negated = false;
1566 for x in &self.items {
1567 match x.kind {
1568 FlagsItemKind::Negation => {
1569 negated = true;
1570 }
1571 FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1572 return Some(!negated);
1573 }
1574 _ => {}
1575 }
1576 }
1577 None
1578 }
1579}
1580
/// A single item in a group of flags.
///
/// An item is either a flag or a negation operator, as given by `kind`.
#[derive(Clone, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub struct FlagsItem {
    /// The span of this item.
    pub span: Span,
    /// The kind of this item.
    pub kind: FlagsItemKind,
}
1590
1591/// The kind of an item in a group of flags.
1592#[derive(Clone, Debug, Eq, PartialEq)]
1593#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1594pub enum FlagsItemKind {
1595 /// A negation operator applied to all subsequent flags in the enclosing
1596 /// group.
1597 Negation,
1598 /// A single flag in a group.
1599 Flag(Flag),
1600}
1601
1602impl FlagsItemKind {
1603 /// Returns true if and only if this item is a negation operator.
1604 pub fn is_negation(&self) -> bool {
1605 match *self {
1606 FlagsItemKind::Negation => true,
1607 _ => false,
1608 }
1609 }
1610}
1611
/// A single flag.
///
/// Each variant is documented with the character that spells it in a
/// pattern.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
pub enum Flag {
    /// `i`
    CaseInsensitive,
    /// `m`
    MultiLine,
    /// `s`
    DotMatchesNewLine,
    /// `U`
    SwapGreed,
    /// `u`
    Unicode,
    /// `R`
    CRLF,
    /// `x`
    IgnoreWhitespace,
}
1631
/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
/// space but heap space proportional to the depth of the `Ast`.
///
/// Instead of letting nested values drop each other recursively, the tree
/// is taken apart with an explicit, heap-allocated work list.
impl Drop for Ast {
    fn drop(&mut self) {
        use core::mem;

        // Fast path: return immediately for leaf variants and for
        // containers the guards below report as having nothing nested, so
        // the work list is not allocated for them.
        match *self {
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::ClassUnicode(_)
            | Ast::ClassPerl(_)
            // Bracketed classes are recursive, they get their own Drop impl.
            | Ast::ClassBracketed(_) => return,
            Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
            Ast::Group(ref x) if !x.ast.has_subexprs() => return,
            Ast::Alternation(ref x) if x.asts.is_empty() => return,
            Ast::Concat(ref x) if x.asts.is_empty() => return,
            _ => {}
        }

        let empty_span = || Span::splat(Position::new(0, 0, 0));
        let empty_ast = || Ast::empty(empty_span());
        // Move the whole tree out of `self`, leaving an empty AST behind,
        // and seed the work list with it.
        let mut stack = vec![mem::replace(self, empty_ast())];
        while let Some(mut ast) = stack.pop() {
            // Detach the children of `ast` onto the work list before `ast`
            // goes out of scope at the end of this iteration; by then it
            // no longer owns any nested expression.
            match ast {
                Ast::Empty(_)
                | Ast::Flags(_)
                | Ast::Literal(_)
                | Ast::Dot(_)
                | Ast::Assertion(_)
                | Ast::ClassUnicode(_)
                | Ast::ClassPerl(_)
                // Bracketed classes are recursive, so they get their own Drop
                // impl.
                | Ast::ClassBracketed(_) => {}
                Ast::Repetition(ref mut x) => {
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
                }
                Ast::Group(ref mut x) => {
                    stack.push(mem::replace(&mut x.ast, empty_ast()));
                }
                Ast::Alternation(ref mut x) => {
                    stack.extend(x.asts.drain(..));
                }
                Ast::Concat(ref mut x) => {
                    stack.extend(x.asts.drain(..));
                }
            }
        }
    }
}
1686
/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
/// stack space but heap space proportional to the depth of the `ClassSet`.
///
/// As with `Ast`, nested sets are moved onto an explicit work list rather
/// than being dropped recursively.
impl Drop for ClassSet {
    fn drop(&mut self) {
        use core::mem;

        // Fast path: return immediately when this set is a leaf item, an
        // empty union, a bracketed class whose set is empty, or a binary
        // op whose operands are both empty.
        match *self {
            ClassSet::Item(ref item) => match *item {
                ClassSetItem::Empty(_)
                | ClassSetItem::Literal(_)
                | ClassSetItem::Range(_)
                | ClassSetItem::Ascii(_)
                | ClassSetItem::Unicode(_)
                | ClassSetItem::Perl(_) => return,
                ClassSetItem::Bracketed(ref x) => {
                    if x.kind.is_empty() {
                        return;
                    }
                }
                ClassSetItem::Union(ref x) => {
                    if x.items.is_empty() {
                        return;
                    }
                }
            },
            ClassSet::BinaryOp(ref op) => {
                if op.lhs.is_empty() && op.rhs.is_empty() {
                    return;
                }
            }
        }

        let empty_span = || Span::splat(Position::new(0, 0, 0));
        let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span()));
        // Move the whole set out of `self`, leaving an empty item behind,
        // and seed the work list with it.
        let mut stack = vec![mem::replace(self, empty_set())];
        while let Some(mut set) = stack.pop() {
            // Detach every nested set from `set` onto the work list before
            // `set` goes out of scope at the end of this iteration.
            match set {
                ClassSet::Item(ref mut item) => match *item {
                    ClassSetItem::Empty(_)
                    | ClassSetItem::Literal(_)
                    | ClassSetItem::Range(_)
                    | ClassSetItem::Ascii(_)
                    | ClassSetItem::Unicode(_)
                    | ClassSetItem::Perl(_) => {}
                    ClassSetItem::Bracketed(ref mut x) => {
                        stack.push(mem::replace(&mut x.kind, empty_set()));
                    }
                    ClassSetItem::Union(ref mut x) => {
                        // Union members are items; wrap each one so it can
                        // sit on the work list of sets.
                        stack.extend(x.items.drain(..).map(ClassSet::Item));
                    }
                },
                ClassSet::BinaryOp(ref mut op) => {
                    stack.push(mem::replace(&mut op.lhs, empty_set()));
                    stack.push(mem::replace(&mut op.rhs, empty_set()));
                }
            }
        }
    }
}
1746
#[cfg(test)]
mod tests {
    use super::*;

    // We use a thread with an explicit stack size to test that our destructor
    // for Ast can handle arbitrarily sized expressions in constant stack
    // space. In case we run on a platform without threads (WASM?), we limit
    // this test to Windows/Unix.
    #[test]
    #[cfg(any(unix, windows))]
    fn no_stack_overflow_on_drop() {
        use std::thread;

        // Builds 200 nested capture groups around an empty AST; the value
        // is dropped when the closure returns.
        let run = || {
            let span = || Span::splat(Position::new(0, 0, 0));
            let mut ast = Ast::empty(span());
            for i in 0..200 {
                ast = Ast::group(Group {
                    span: span(),
                    kind: GroupKind::CaptureIndex(i),
                    ast: Box::new(ast),
                });
            }
            assert!(!ast.is_empty());
        };

        // We run our test on a thread with a small stack size so we can
        // force the issue more easily.
        //
        // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
        // will just barf with very small stack sizes. So we bump this up a bit
        // to give more room to breathe. When I did this, I confirmed that if
        // I remove the custom `Drop` impl for `Ast`, then this test does
        // indeed still fail with a stack overflow. (At the time of writing, I
        // had to bump it all the way up to 32K before the test would pass even
        // without the custom `Drop` impl. So 16K seems like a safe number
        // here.)
        //
        // See: https://github.com/rust-lang/regex/issues/967
        thread::Builder::new()
            .stack_size(16 << 10)
            .spawn(run)
            .unwrap()
            .join()
            .unwrap();
    }

    // This tests that our `Ast` has a reasonable size. This isn't a hard rule
    // and it can be increased if given a good enough reason. But this test
    // exists because the size of `Ast` was at one point over 200 bytes on a
    // 64-bit target. Wow.
    #[test]
    fn ast_size() {
        // The suggested ceiling is two machine words.
        let max = 2 * core::mem::size_of::<usize>();
        let size = core::mem::size_of::<Ast>();
        assert!(
            size <= max,
            "Ast size of {} bytes is bigger than suggested max {}",
            size,
            max
        );
    }
}
1810

Provided by KDAB

Privacy Policy