mod.rs - Codebrowser

1	/!*
2	Defines an abstract syntax for regular expressions.
3	*/
4
5	use core::cmp::Ordering;
6
7	use alloc::{boxed::Box, string::String, vec, vec::Vec};
8
9	pub use crate::ast::visitor::{visit, Visitor};
10
11	pub mod parse;
12	pub mod print;
13	mod visitor;
14
15	/// An error that occurred while parsing a regular expression into an abstract
16	/// syntax tree.
17	///
18	/// Note that not all ASTs represents a valid regular expression. For example,
19	/// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a
20	/// valid Unicode property name. That particular error is reported when
21	/// translating an AST to the high-level intermediate representation (`HIR`).
22	#[derive(Clone, Debug, Eq, PartialEq)]
23	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
24	pub struct Error {
25	/// The kind of error.
26	kind: ErrorKind,
27	/// The original pattern that the parser generated the error from. Every
28	/// span in an error is a valid range into this string.
29	pattern: String,
30	/// The span of this error.
31	span: Span,
32	}
33
34	impl Error {
35	/// Return the type of this error.
36	pub fn kind(&self) -> &ErrorKind {
37	&self.kind
38	}
39
40	/// The original pattern string in which this error occurred.
41	///
42	/// Every span reported by this error is reported in terms of this string.
43	pub fn pattern(&self) -> &str {
44	&self.pattern
45	}
46
47	/// Return the span at which this error occurred.
48	pub fn span(&self) -> &Span {
49	&self.span
50	}
51
52	/// Return an auxiliary span. This span exists only for some errors that
53	/// benefit from being able to point to two locations in the original
54	/// regular expression. For example, "duplicate" errors will have the
55	/// main error position set to the duplicate occurrence while its
56	/// auxiliary span will be set to the initial occurrence.
57	pub fn auxiliary_span(&self) -> Option<&Span> {
58	use self::ErrorKind::*;
59	match self.kind {
60	FlagDuplicate { ref original } => Some(original),
61	FlagRepeatedNegation { ref original, .. } => Some(original),
62	GroupNameDuplicate { ref original, .. } => Some(original),
63	_ => None,
64	}
65	}
66	}
67
68	/// The type of an error that occurred while building an AST.
69	///
70	/// This error type is marked as `non_exhaustive`. This means that adding a
71	/// new variant is not considered a breaking change.
72	#[non_exhaustive]
73	#[derive(Clone, Debug, Eq, PartialEq)]
74	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
75	pub enum ErrorKind {
76	/// The capturing group limit was exceeded.
77	///
78	/// Note that this represents a limit on the total number of capturing
79	/// groups in a regex and not necessarily the number of nested capturing
80	/// groups. That is, the nest limit can be low and it is still possible for
81	/// this error to occur.
82	CaptureLimitExceeded,
83	/// An invalid escape sequence was found in a character class set.
84	ClassEscapeInvalid,
85	/// An invalid character class range was found. An invalid range is any
86	/// range where the start is greater than the end.
87	ClassRangeInvalid,
88	/// An invalid range boundary was found in a character class. Range
89	/// boundaries must be a single literal codepoint, but this error indicates
90	/// that something else was found, such as a nested class.
91	ClassRangeLiteral,
92	/// An opening `[` was found with no corresponding closing `]`.
93	ClassUnclosed,
94	/// Note that this error variant is no longer used. Namely, a decimal
95	/// number can only appear as a repetition quantifier. When the number
96	/// in a repetition quantifier is empty, then it gets its own specialized
97	/// error, `RepetitionCountDecimalEmpty`.
98	DecimalEmpty,
99	/// An invalid decimal number was given where one was expected.
100	DecimalInvalid,
101	/// A bracketed hex literal was empty.
102	EscapeHexEmpty,
103	/// A bracketed hex literal did not correspond to a Unicode scalar value.
104	EscapeHexInvalid,
105	/// An invalid hexadecimal digit was found.
106	EscapeHexInvalidDigit,
107	/// EOF was found before an escape sequence was completed.
108	EscapeUnexpectedEof,
109	/// An unrecognized escape sequence.
110	EscapeUnrecognized,
111	/// A dangling negation was used when setting flags, e.g., `i-`.
112	FlagDanglingNegation,
113	/// A flag was used twice, e.g., `i-i`.
114	FlagDuplicate {
115	/// The position of the original flag. The error position
116	/// points to the duplicate flag.
117	original: Span,
118	},
119	/// The negation operator was used twice, e.g., `-i-s`.
120	FlagRepeatedNegation {
121	/// The position of the original negation operator. The error position
122	/// points to the duplicate negation operator.
123	original: Span,
124	},
125	/// Expected a flag but got EOF, e.g., `(?`.
126	FlagUnexpectedEof,
127	/// Unrecognized flag, e.g., `a`.
128	FlagUnrecognized,
129	/// A duplicate capture name was found.
130	GroupNameDuplicate {
131	/// The position of the initial occurrence of the capture name. The
132	/// error position itself points to the duplicate occurrence.
133	original: Span,
134	},
135	/// A capture group name is empty, e.g., `(?P<>abc)`.
136	GroupNameEmpty,
137	/// An invalid character was seen for a capture group name. This includes
138	/// errors where the first character is a digit (even though subsequent
139	/// characters are allowed to be digits).
140	GroupNameInvalid,
141	/// A closing `>` could not be found for a capture group name.
142	GroupNameUnexpectedEof,
143	/// An unclosed group, e.g., `(ab`.
144	///
145	/// The span of this error corresponds to the unclosed parenthesis.
146	GroupUnclosed,
147	/// An unopened group, e.g., `ab)`.
148	GroupUnopened,
149	/// The nest limit was exceeded. The limit stored here is the limit
150	/// configured in the parser.
151	NestLimitExceeded(u32),
152	/// The range provided in a counted repetition operator is invalid. The
153	/// range is invalid if the start is greater than the end.
154	RepetitionCountInvalid,
155	/// An opening `{` was not followed by a valid decimal value.
156	/// For example, `x{}` or `x{]}` would fail.
157	RepetitionCountDecimalEmpty,
158	/// An opening `{` was found with no corresponding closing `}`.
159	RepetitionCountUnclosed,
160	/// A repetition operator was applied to a missing sub-expression. This
161	/// occurs, for example, in the regex consisting of just a `` or even*
162	/// `(?i)`. It is, however, possible to create a repetition operating on*
163	/// an empty sub-expression. For example, `()` is still considered valid.*
164	RepetitionMissing,
165	/// The special word boundary syntax, `\b{something}`, was used, but
166	/// either EOF without `}` was seen, or an invalid character in the
167	/// braces was seen.
168	SpecialWordBoundaryUnclosed,
169	/// The special word boundary syntax, `\b{something}`, was used, but
170	/// `something` was not recognized as a valid word boundary kind.
171	SpecialWordBoundaryUnrecognized,
172	/// The syntax `\b{` was observed, but afterwards the end of the pattern
173	/// was observed without being able to tell whether it was meant to be a
174	/// bounded repetition on the `\b` or the beginning of a special word
175	/// boundary assertion.
176	SpecialWordOrRepetitionUnexpectedEof,
177	/// The Unicode class is not valid. This typically occurs when a `\p` is
178	/// followed by something other than a `{`.
179	UnicodeClassInvalid,
180	/// When octal support is disabled, this error is produced when an octal
181	/// escape is used. The octal escape is assumed to be an invocation of
182	/// a backreference, which is the common case.
183	UnsupportedBackreference,
184	/// When syntax similar to PCRE's look-around is used, this error is
185	/// returned. Some example syntaxes that are rejected include, but are
186	/// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and
187	/// `(?<!re)`. Note that all of these syntaxes are otherwise invalid; this
188	/// error is used to improve the user experience.
189	UnsupportedLookAround,
190	}
191
192	#[cfg(feature = "std")]
193	impl std::error::Error for Error {}
194
195	impl core::fmt::Display for Error {
196	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
197	crate::error::Formatter::from(self).fmt(f)
198	}
199	}
200
201	impl core::fmt::Display for ErrorKind {
202	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
203	use self::ErrorKind::*;
204	match *self {
205	CaptureLimitExceeded => write!(
206	f,
207	"exceeded the maximum number of \
208	capturing groups ({})",
209	u32::MAX
210	),
211	ClassEscapeInvalid => {
212	write!(f, "invalid escape sequence found in character class")
213	}
214	ClassRangeInvalid => write!(
215	f,
216	"invalid character class range, \
217	the start must be <= the end"
218	),
219	ClassRangeLiteral => {
220	write!(f, "invalid range boundary, must be a literal")
221	}
222	ClassUnclosed => write!(f, "unclosed character class"),
223	DecimalEmpty => write!(f, "decimal literal empty"),
224	DecimalInvalid => write!(f, "decimal literal invalid"),
225	EscapeHexEmpty => write!(f, "hexadecimal literal empty"),
226	EscapeHexInvalid => {
227	write!(f, "hexadecimal literal is not a Unicode scalar value")
228	}
229	EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"),
230	EscapeUnexpectedEof => write!(
231	f,
232	"incomplete escape sequence, \
233	reached end of pattern prematurely"
234	),
235	EscapeUnrecognized => write!(f, "unrecognized escape sequence"),
236	FlagDanglingNegation => {
237	write!(f, "dangling flag negation operator")
238	}
239	FlagDuplicate { .. } => write!(f, "duplicate flag"),
240	FlagRepeatedNegation { .. } => {
241	write!(f, "flag negation operator repeated")
242	}
243	FlagUnexpectedEof => {
244	write!(f, "expected flag but got end of regex")
245	}
246	FlagUnrecognized => write!(f, "unrecognized flag"),
247	GroupNameDuplicate { .. } => {
248	write!(f, "duplicate capture group name")
249	}
250	GroupNameEmpty => write!(f, "empty capture group name"),
251	GroupNameInvalid => write!(f, "invalid capture group character"),
252	GroupNameUnexpectedEof => write!(f, "unclosed capture group name"),
253	GroupUnclosed => write!(f, "unclosed group"),
254	GroupUnopened => write!(f, "unopened group"),
255	NestLimitExceeded(limit) => write!(
256	f,
257	"exceed the maximum number of \
258	nested parentheses/brackets ({})",
259	limit
260	),
261	RepetitionCountInvalid => write!(
262	f,
263	"invalid repetition count range, \
264	the start must be <= the end"
265	),
266	RepetitionCountDecimalEmpty => {
267	write!(f, "repetition quantifier expects a valid decimal")
268	}
269	RepetitionCountUnclosed => {
270	write!(f, "unclosed counted repetition")
271	}
272	RepetitionMissing => {
273	write!(f, "repetition operator missing expression")
274	}
275	SpecialWordBoundaryUnclosed => {
276	write!(
277	f,
278	"special word boundary assertion is either \
279	unclosed or contains an invalid character",
280	)
281	}
282	SpecialWordBoundaryUnrecognized => {
283	write!(
284	f,
285	"unrecognized special word boundary assertion, \
286	valid choices are: start, end, start-half \
287	or end-half",
288	)
289	}
290	SpecialWordOrRepetitionUnexpectedEof => {
291	write!(
292	f,
293	"found either the beginning of a special word \
294	boundary or a bounded repetition on a `\\`b with \
295	an opening brace, but no closing brace",
296	)
297	}
298	UnicodeClassInvalid => {
299	write!(f, "invalid Unicode character class")
300	}
301	UnsupportedBackreference => {
302	write!(f, "backreferences are not supported")
303	}
304	UnsupportedLookAround => write!(
305	f,
306	"look-around, including look-ahead and look-behind, \
307	is not supported"
308	),
309	}
310	}
311	}
312
313	/// Span represents the position information of a single AST item.
314	///
315	/// All span positions are absolute byte offsets that can be used on the
316	/// original regular expression that was parsed.
317	#[derive(Clone, Copy, Eq, PartialEq)]
318	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
319	pub struct Span {
320	/// The start byte offset.
321	pub start: Position,
322	/// The end byte offset.
323	pub end: Position,
324	}
325
326	impl core::fmt::Debug for Span {
327	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
328	write!(f, "Span({:?}, {:?})", self.start, self.end)
329	}
330	}
331
332	impl Ord for Span {
333	fn cmp(&self, other: &Span) -> Ordering {
334	(&self.start, &self.end).cmp(&(&other.start, &other.end))
335	}
336	}
337
338	impl PartialOrd for Span {
339	fn partial_cmp(&self, other: &Span) -> Option<Ordering> {
340	Some(self.cmp(other))
341	}
342	}
343
344	/// A single position in a regular expression.
345	///
346	/// A position encodes one half of a span, and include the byte offset, line
347	/// number and column number.
348	#[derive(Clone, Copy, Eq, PartialEq)]
349	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
350	pub struct Position {
351	/// The absolute offset of this position, starting at `0` from the
352	/// beginning of the regular expression pattern string.
353	pub offset: usize,
354	/// The line number, starting at `1`.
355	pub line: usize,
356	/// The approximate column number, starting at `1`.
357	pub column: usize,
358	}
359
360	impl core::fmt::Debug for Position {
361	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
362	write!(
363	f,
364	"Position(o: {:?}, l: {:?}, c: {:?})",
365	self.offset, self.line, self.column
366	)
367	}
368	}
369
370	impl Ord for Position {
371	fn cmp(&self, other: &Position) -> Ordering {
372	self.offset.cmp(&other.offset)
373	}
374	}
375
376	impl PartialOrd for Position {
377	fn partial_cmp(&self, other: &Position) -> Option<Ordering> {
378	Some(self.cmp(other))
379	}
380	}
381
382	impl Span {
383	/// Create a new span with the given positions.
384	pub fn new(start: Position, end: Position) -> Span {
385	Span { start, end }
386	}
387
388	/// Create a new span using the given position as the start and end.
389	pub fn splat(pos: Position) -> Span {
390	Span::new(pos, pos)
391	}
392
393	/// Create a new span by replacing the starting the position with the one
394	/// given.
395	pub fn with_start(self, pos: Position) -> Span {
396	Span { start: pos, ..self }
397	}
398
399	/// Create a new span by replacing the ending the position with the one
400	/// given.
401	pub fn with_end(self, pos: Position) -> Span {
402	Span { end: pos, ..self }
403	}
404
405	/// Returns true if and only if this span occurs on a single line.
406	pub fn is_one_line(&self) -> bool {
407	self.start.line == self.end.line
408	}
409
410	/// Returns true if and only if this span is empty. That is, it points to
411	/// a single position in the concrete syntax of a regular expression.
412	pub fn is_empty(&self) -> bool {
413	self.start.offset == self.end.offset
414	}
415	}
416
417	impl Position {
418	/// Create a new position with the given information.
419	///
420	/// `offset` is the absolute offset of the position, starting at `0` from
421	/// the beginning of the regular expression pattern string.
422	///
423	/// `line` is the line number, starting at `1`.
424	///
425	/// `column` is the approximate column number, starting at `1`.
426	pub fn new(offset: usize, line: usize, column: usize) -> Position {
427	Position { offset, line, column }
428	}
429	}
430
431	/// An abstract syntax tree for a singular expression along with comments
432	/// found.
433	///
434	/// Comments are not stored in the tree itself to avoid complexity. Each
435	/// comment contains a span of precisely where it occurred in the original
436	/// regular expression.
437	#[derive(Clone, Debug, Eq, PartialEq)]
438	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
439	pub struct WithComments {
440	/// The actual ast.
441	pub ast: Ast,
442	/// All comments found in the original regular expression.
443	pub comments: Vec<Comment>,
444	}
445
446	/// A comment from a regular expression with an associated span.
447	///
448	/// A regular expression can only contain comments when the `x` flag is
449	/// enabled.
450	#[derive(Clone, Debug, Eq, PartialEq)]
451	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
452	pub struct Comment {
453	/// The span of this comment, including the beginning `#` and ending `\n`.
454	pub span: Span,
455	/// The comment text, starting with the first character following the `#`
456	/// and ending with the last character preceding the `\n`.
457	pub comment: String,
458	}
459
460	/// An abstract syntax tree for a single regular expression.
461	///
462	/// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap
463	/// space proportional to the size of the `Ast`.
464	///
465	/// This type defines its own destructor that uses constant stack space and
466	/// heap space proportional to the size of the `Ast`.
467	#[derive(Clone, Debug, Eq, PartialEq)]
468	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
469	pub enum Ast {
470	/// An empty regex that matches everything.
471	Empty(Box<Span>),
472	/// A set of flags, e.g., `(?is)`.
473	Flags(Box<SetFlags>),
474	/// A single character literal, which includes escape sequences.
475	Literal(Box<Literal>),
476	/// The "any character" class.
477	Dot(Box<Span>),
478	/// A single zero-width assertion.
479	Assertion(Box<Assertion>),
480	/// A single Unicode character class, e.g., `\pL` or `\p{Greek}`.
481	ClassUnicode(Box<ClassUnicode>),
482	/// A single perl character class, e.g., `\d` or `\W`.
483	ClassPerl(Box<ClassPerl>),
484	/// A single bracketed character class set, which may contain zero or more
485	/// character ranges and/or zero or more nested classes. e.g.,
486	/// `[a-zA-Z\pL]`.
487	ClassBracketed(Box<ClassBracketed>),
488	/// A repetition operator applied to an arbitrary regular expression.
489	Repetition(Box<Repetition>),
490	/// A grouped regular expression.
491	Group(Box<Group>),
492	/// An alternation of regular expressions.
493	Alternation(Box<Alternation>),
494	/// A concatenation of regular expressions.
495	Concat(Box<Concat>),
496	}
497
498	impl Ast {
499	/// Create an "empty" AST item.
500	pub fn empty(span: Span) -> Ast {
501	Ast::Empty(Box::new(span))
502	}
503
504	/// Create a "flags" AST item.
505	pub fn flags(e: SetFlags) -> Ast {
506	Ast::Flags(Box::new(e))
507	}
508
509	/// Create a "literal" AST item.
510	pub fn literal(e: Literal) -> Ast {
511	Ast::Literal(Box::new(e))
512	}
513
514	/// Create a "dot" AST item.
515	pub fn dot(span: Span) -> Ast {
516	Ast::Dot(Box::new(span))
517	}
518
519	/// Create a "assertion" AST item.
520	pub fn assertion(e: Assertion) -> Ast {
521	Ast::Assertion(Box::new(e))
522	}
523
524	/// Create a "Unicode class" AST item.
525	pub fn class_unicode(e: ClassUnicode) -> Ast {
526	Ast::ClassUnicode(Box::new(e))
527	}
528
529	/// Create a "Perl class" AST item.
530	pub fn class_perl(e: ClassPerl) -> Ast {
531	Ast::ClassPerl(Box::new(e))
532	}
533
534	/// Create a "bracketed class" AST item.
535	pub fn class_bracketed(e: ClassBracketed) -> Ast {
536	Ast::ClassBracketed(Box::new(e))
537	}
538
539	/// Create a "repetition" AST item.
540	pub fn repetition(e: Repetition) -> Ast {
541	Ast::Repetition(Box::new(e))
542	}
543
544	/// Create a "group" AST item.
545	pub fn group(e: Group) -> Ast {
546	Ast::Group(Box::new(e))
547	}
548
549	/// Create a "alternation" AST item.
550	pub fn alternation(e: Alternation) -> Ast {
551	Ast::Alternation(Box::new(e))
552	}
553
554	/// Create a "concat" AST item.
555	pub fn concat(e: Concat) -> Ast {
556	Ast::Concat(Box::new(e))
557	}
558
559	/// Return the span of this abstract syntax tree.
560	pub fn span(&self) -> &Span {
561	match *self {
562	Ast::Empty(ref span) => span,
563	Ast::Flags(ref x) => &x.span,
564	Ast::Literal(ref x) => &x.span,
565	Ast::Dot(ref span) => span,
566	Ast::Assertion(ref x) => &x.span,
567	Ast::ClassUnicode(ref x) => &x.span,
568	Ast::ClassPerl(ref x) => &x.span,
569	Ast::ClassBracketed(ref x) => &x.span,
570	Ast::Repetition(ref x) => &x.span,
571	Ast::Group(ref x) => &x.span,
572	Ast::Alternation(ref x) => &x.span,
573	Ast::Concat(ref x) => &x.span,
574	}
575	}
576
577	/// Return true if and only if this Ast is empty.
578	pub fn is_empty(&self) -> bool {
579	match *self {
580	Ast::Empty(_) => `true`,
581	_ => `false`,
582	}
583	}
584
585	/// Returns true if and only if this AST has any (including possibly empty)
586	/// subexpressions.
587	fn has_subexprs(&self) -> bool {
588	match *self {
589	Ast::Empty(_)
590	\| Ast::Flags(_)
591	\| Ast::Literal(_)
592	\| Ast::Dot(_)
593	\| Ast::Assertion(_)
594	\| Ast::ClassUnicode(_)
595	\| Ast::ClassPerl(_) => `false`,
596	Ast::ClassBracketed(_)
597	\| Ast::Repetition(_)
598	\| Ast::Group(_)
599	\| Ast::Alternation(_)
600	\| Ast::Concat(_) => `true`,
601	}
602	}
603	}
604
605	/// Print a display representation of this Ast.
606	///
607	/// This does not preserve any of the original whitespace formatting that may
608	/// have originally been present in the concrete syntax from which this Ast
609	/// was generated.
610	///
611	/// This implementation uses constant stack space and heap space proportional
612	/// to the size of the `Ast`.
613	impl core::fmt::Display for Ast {
614	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
615	use crate::ast::print::Printer;
616	Printer::new().print(self, f)
617	}
618	}
619
620	/// An alternation of regular expressions.
621	#[derive(Clone, Debug, Eq, PartialEq)]
622	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
623	pub struct Alternation {
624	/// The span of this alternation.
625	pub span: Span,
626	/// The alternate regular expressions.
627	pub asts: Vec<Ast>,
628	}
629
630	impl Alternation {
631	/// Return this alternation as an AST.
632	///
633	/// If this alternation contains zero ASTs, then `Ast::empty` is returned.
634	/// If this alternation contains exactly 1 AST, then the corresponding AST
635	/// is returned. Otherwise, `Ast::alternation` is returned.
636	pub fn into_ast(mut self) -> Ast {
637	match self.asts.len() {
638	`0` => Ast::empty(self.span),
639	`1` => self.asts.pop().unwrap(),
640	_ => Ast::alternation(self),
641	}
642	}
643	}
644
645	/// A concatenation of regular expressions.
646	#[derive(Clone, Debug, Eq, PartialEq)]
647	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
648	pub struct Concat {
649	/// The span of this concatenation.
650	pub span: Span,
651	/// The concatenation regular expressions.
652	pub asts: Vec<Ast>,
653	}
654
655	impl Concat {
656	/// Return this concatenation as an AST.
657	///
658	/// If this alternation contains zero ASTs, then `Ast::empty` is returned.
659	/// If this alternation contains exactly 1 AST, then the corresponding AST
660	/// is returned. Otherwise, `Ast::concat` is returned.
661	pub fn into_ast(mut self) -> Ast {
662	match self.asts.len() {
663	`0` => Ast::empty(self.span),
664	`1` => self.asts.pop().unwrap(),
665	_ => Ast::concat(self),
666	}
667	}
668	}
669
670	/// A single literal expression.
671	///
672	/// A literal corresponds to a single Unicode scalar value. Literals may be
673	/// represented in their literal form, e.g., `a` or in their escaped form,
674	/// e.g., `\x61`.
675	#[derive(Clone, Debug, Eq, PartialEq)]
676	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
677	pub struct Literal {
678	/// The span of this literal.
679	pub span: Span,
680	/// The kind of this literal.
681	pub kind: LiteralKind,
682	/// The Unicode scalar value corresponding to this literal.
683	pub c: char,
684	}
685
686	impl Literal {
687	/// If this literal was written as a `\x` hex escape, then this returns
688	/// the corresponding byte value. Otherwise, this returns `None`.
689	pub fn byte(&self) -> Option<u8> {
690	match self.kind {
691	LiteralKind::HexFixed(HexLiteralKind::X) => {
692	u8::try_from(self.c).ok()
693	}
694	_ => None,
695	}
696	}
697	}
698
699	/// The kind of a single literal expression.
700	#[derive(Clone, Debug, Eq, PartialEq)]
701	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
702	pub enum LiteralKind {
703	/// The literal is written verbatim, e.g., `a` or `☃`.
704	Verbatim,
705	/// The literal is written as an escape because it is otherwise a special
706	/// regex meta character, e.g., `\` or `\[`.*
707	Meta,
708	/// The literal is written as an escape despite the fact that the escape is
709	/// unnecessary, e.g., `\%` or `\/`.
710	Superfluous,
711	/// The literal is written as an octal escape, e.g., `\141`.
712	Octal,
713	/// The literal is written as a hex code with a fixed number of digits
714	/// depending on the type of the escape, e.g., `\x61` or or `\u0061` or
715	/// `\U00000061`.
716	HexFixed(HexLiteralKind),
717	/// The literal is written as a hex code with a bracketed number of
718	/// digits. The only restriction is that the bracketed hex code must refer
719	/// to a valid Unicode scalar value.
720	HexBrace(HexLiteralKind),
721	/// The literal is written as a specially recognized escape, e.g., `\f`
722	/// or `\n`.
723	Special(SpecialLiteralKind),
724	}
725
726	/// The type of a special literal.
727	///
728	/// A special literal is a special escape sequence recognized by the regex
729	/// parser, e.g., `\f` or `\n`.
730	#[derive(Clone, Debug, Eq, PartialEq)]
731	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
732	pub enum SpecialLiteralKind {
733	/// Bell, spelled `\a` (`\x07`).
734	Bell,
735	/// Form feed, spelled `\f` (`\x0C`).
736	FormFeed,
737	/// Tab, spelled `\t` (`\x09`).
738	Tab,
739	/// Line feed, spelled `\n` (`\x0A`).
740	LineFeed,
741	/// Carriage return, spelled `\r` (`\x0D`).
742	CarriageReturn,
743	/// Vertical tab, spelled `\v` (`\x0B`).
744	VerticalTab,
745	/// Space, spelled `\ ` (`\x20`). Note that this can only appear when
746	/// parsing in verbose mode.
747	Space,
748	}
749
750	/// The type of a Unicode hex literal.
751	///
752	/// Note that all variants behave the same when used with brackets. They only
753	/// differ when used without brackets in the number of hex digits that must
754	/// follow.
755	#[derive(Clone, Debug, Eq, PartialEq)]
756	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
757	pub enum HexLiteralKind {
758	/// A `\x` prefix. When used without brackets, this form is limited to
759	/// two digits.
760	X,
761	/// A `\u` prefix. When used without brackets, this form is limited to
762	/// four digits.
763	UnicodeShort,
764	/// A `\U` prefix. When used without brackets, this form is limited to
765	/// eight digits.
766	UnicodeLong,
767	}
768
769	impl HexLiteralKind {
770	/// The number of digits that must be used with this literal form when
771	/// used without brackets. When used with brackets, there is no
772	/// restriction on the number of digits.
773	pub fn digits(&self) -> u32 {
774	match *self {
775	HexLiteralKind::X => `2`,
776	HexLiteralKind::UnicodeShort => `4`,
777	HexLiteralKind::UnicodeLong => `8`,
778	}
779	}
780	}
781
782	/// A Perl character class.
783	#[derive(Clone, Debug, Eq, PartialEq)]
784	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
785	pub struct ClassPerl {
786	/// The span of this class.
787	pub span: Span,
788	/// The kind of Perl class.
789	pub kind: ClassPerlKind,
790	/// Whether the class is negated or not. e.g., `\d` is not negated but
791	/// `\D` is.
792	pub negated: bool,
793	}
794
795	/// The available Perl character classes.
796	#[derive(Clone, Debug, Eq, PartialEq)]
797	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
798	pub enum ClassPerlKind {
799	/// Decimal numbers.
800	Digit,
801	/// Whitespace.
802	Space,
803	/// Word characters.
804	Word,
805	}
806
807	/// An ASCII character class.
808	#[derive(Clone, Debug, Eq, PartialEq)]
809	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
810	pub struct ClassAscii {
811	/// The span of this class.
812	pub span: Span,
813	/// The kind of ASCII class.
814	pub kind: ClassAsciiKind,
815	/// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated
816	/// but `[[:^alpha:]]` is.
817	pub negated: bool,
818	}
819
820	/// The available ASCII character classes.
821	#[derive(Clone, Debug, Eq, PartialEq)]
822	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
823	pub enum ClassAsciiKind {
824	/// `[0-9A-Za-z]`
825	Alnum,
826	/// `[A-Za-z]`
827	Alpha,
828	/// `[\x00-\x7F]`
829	Ascii,
830	/// `[ \t]`
831	Blank,
832	/// `[\x00-\x1F\x7F]`
833	Cntrl,
834	/// `[0-9]`
835	Digit,
836	/// `[!-~]`
837	Graph,
838	/// `[a-z]`
839	Lower,
840	/// `[ -~]`
841	Print,
842	/// `[!-/:-@\[-`{-~]`
843	Punct,
844	/// `[\t\n\v\f\r ]`
845	Space,
846	/// `[A-Z]`
847	Upper,
848	/// `[0-9A-Za-z_]`
849	Word,
850	/// `[0-9A-Fa-f]`
851	Xdigit,
852	}
853
854	impl ClassAsciiKind {
855	/// Return the corresponding ClassAsciiKind variant for the given name.
856	///
857	/// The name given should correspond to the lowercase version of the
858	/// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`.
859	///
860	/// If no variant with the corresponding name exists, then `None` is
861	/// returned.
862	pub fn from_name(name: &str) -> Option<ClassAsciiKind> {
863	use self::ClassAsciiKind::*;
864	match name {
865	"alnum" => Some(Alnum),
866	"alpha" => Some(Alpha),
867	"ascii" => Some(Ascii),
868	"blank" => Some(Blank),
869	"cntrl" => Some(Cntrl),
870	"digit" => Some(Digit),
871	"graph" => Some(Graph),
872	"lower" => Some(Lower),
873	"print" => Some(Print),
874	"punct" => Some(Punct),
875	"space" => Some(Space),
876	"upper" => Some(Upper),
877	"word" => Some(Word),
878	"xdigit" => Some(Xdigit),
879	_ => None,
880	}
881	}
882	}
883
884	/// A Unicode character class.
885	#[derive(Clone, Debug, Eq, PartialEq)]
886	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
887	pub struct ClassUnicode {
888	/// The span of this class.
889	pub span: Span,
890	/// Whether this class is negated or not.
891	///
892	/// Note: be careful when using this attribute. This specifically refers
893	/// to whether the class is written as `\p` or `\P`, where the latter
894	/// is `negated = true`. However, it also possible to write something like
895	/// `\P{scx!=Katakana}` which is actually equivalent to
896	/// `\p{scx=Katakana}` and is therefore not actually negated even though
897	/// `negated = true` here. To test whether this class is truly negated
898	/// or not, use the `is_negated` method.
899	pub negated: bool,
900	/// The kind of Unicode class.
901	pub kind: ClassUnicodeKind,
902	}
903
904	impl ClassUnicode {
905	/// Returns true if this class has been negated.
906	///
907	/// Note that this takes the Unicode op into account, if it's present.
908	/// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`.
909	pub fn is_negated(&self) -> bool {
910	match self.kind {
911	ClassUnicodeKind::NamedValue {
912	op: ClassUnicodeOpKind::NotEqual,
913	..
914	} => !self.negated,
915	_ => self.negated,
916	}
917	}
918	}
919
920	/// The available forms of Unicode character classes.
921	#[derive(Clone, Debug, Eq, PartialEq)]
922	pub enum ClassUnicodeKind {
923	/// A one letter abbreviated class, e.g., `\pN`.
924	OneLetter(char),
925	/// A binary property, general category or script. The string may be
926	/// empty.
927	Named(String),
928	/// A property name and an associated value.
929	NamedValue {
930	/// The type of Unicode op used to associate `name` with `value`.
931	op: ClassUnicodeOpKind,
932	/// The property name (which may be empty).
933	name: String,
934	/// The property value (which may be empty).
935	value: String,
936	},
937	}
938
939	#[cfg(feature = "arbitrary")]
940	impl arbitrary::Arbitrary<'_> for ClassUnicodeKind {
941	fn arbitrary(
942	u: &mut arbitrary::Unstructured,
943	) -> arbitrary::Result<ClassUnicodeKind> {
944	#[cfg(any(
945	feature = "unicode-age",
946	feature = "unicode-bool",
947	feature = "unicode-gencat",
948	feature = "unicode-perl",
949	feature = "unicode-script",
950	feature = "unicode-segment",
951	))]
952	{
953	use alloc::string::ToString;
954
955	use super::unicode_tables::{
956	property_names::PROPERTY_NAMES,
957	property_values::PROPERTY_VALUES,
958	};
959
960	match u.choose_index(`3`)? {
961	`0` => {
962	let all = PROPERTY_VALUES
963	.iter()
964	.flat_map(\|e\| e.1.iter())
965	.filter(\|(name, _)\| name.len() == `1`)
966	.count();
967	let idx = u.choose_index(all)?;
968	let value = PROPERTY_VALUES
969	.iter()
970	.flat_map(\|e\| e.1.iter())
971	.take(idx + `1`)
972	.last()
973	.unwrap()
974	.0
975	.chars()
976	.next()
977	.unwrap();
978	Ok(ClassUnicodeKind::OneLetter(value))
979	}
980	`1` => {
981	let all = PROPERTY_VALUES
982	.iter()
983	.map(\|e\| e.1.len())
984	.sum::<usize>()
985	+ PROPERTY_NAMES.len();
986	let idx = u.choose_index(all)?;
987	let name = PROPERTY_VALUES
988	.iter()
989	.flat_map(\|e\| e.1.iter())
990	.chain(PROPERTY_NAMES)
991	.map(\|(_, e)\| e)
992	.take(idx + `1`)
993	.last()
994	.unwrap();
995	Ok(ClassUnicodeKind::Named(name.to_string()))
996	}
997	`2` => {
998	let all = PROPERTY_VALUES
999	.iter()
1000	.map(\|e\| e.1.len())
1001	.sum::<usize>();
1002	let idx = u.choose_index(all)?;
1003	let (prop, value) = PROPERTY_VALUES
1004	.iter()
1005	.flat_map(\|e\| {
1006	e.1.iter().map(\|(_, value)\| (e.0, value))
1007	})
1008	.take(idx + `1`)
1009	.last()
1010	.unwrap();
1011	Ok(ClassUnicodeKind::NamedValue {
1012	op: u.arbitrary()?,
1013	name: prop.to_string(),
1014	value: value.to_string(),
1015	})
1016	}
1017	_ => unreachable!("index chosen is impossible"),
1018	}
1019	}
1020	#[cfg(not(any(
1021	feature = "unicode-age",
1022	feature = "unicode-bool",
1023	feature = "unicode-gencat",
1024	feature = "unicode-perl",
1025	feature = "unicode-script",
1026	feature = "unicode-segment",
1027	)))]
1028	{
1029	match u.choose_index(`3`)? {
1030	`0` => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)),
1031	`1` => Ok(ClassUnicodeKind::Named(u.arbitrary()?)),
1032	`2` => Ok(ClassUnicodeKind::NamedValue {
1033	op: u.arbitrary()?,
1034	name: u.arbitrary()?,
1035	value: u.arbitrary()?,
1036	}),
1037	_ => unreachable!("index chosen is impossible"),
1038	}
1039	}
1040	}
1041
1042	fn size_hint(depth: usize) -> (usize, Option<usize>) {
1043	#[cfg(any(
1044	feature = "unicode-age",
1045	feature = "unicode-bool",
1046	feature = "unicode-gencat",
1047	feature = "unicode-perl",
1048	feature = "unicode-script",
1049	feature = "unicode-segment",
1050	))]
1051	{
1052	arbitrary::size_hint::and_all(&[
1053	usize::size_hint(depth),
1054	usize::size_hint(depth),
1055	arbitrary::size_hint::or(
1056	(`0`, Some(`0`)),
1057	ClassUnicodeOpKind::size_hint(depth),
1058	),
1059	])
1060	}
1061	#[cfg(not(any(
1062	feature = "unicode-age",
1063	feature = "unicode-bool",
1064	feature = "unicode-gencat",
1065	feature = "unicode-perl",
1066	feature = "unicode-script",
1067	feature = "unicode-segment",
1068	)))]
1069	{
1070	arbitrary::size_hint::and(
1071	usize::size_hint(depth),
1072	arbitrary::size_hint::or_all(&[
1073	char::size_hint(depth),
1074	String::size_hint(depth),
1075	arbitrary::size_hint::and_all(&[
1076	String::size_hint(depth),
1077	String::size_hint(depth),
1078	ClassUnicodeOpKind::size_hint(depth),
1079	]),
1080	]),
1081	)
1082	}
1083	}
1084	}
1085
1086	/// The type of op used in a Unicode character class.
1087	#[derive(Clone, Debug, Eq, PartialEq)]
1088	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1089	pub enum ClassUnicodeOpKind {
1090	/// A property set to a specific value, e.g., `\p{scx=Katakana}`.
1091	Equal,
1092	/// A property set to a specific value using a colon, e.g.,
1093	/// `\p{scx:Katakana}`.
1094	Colon,
1095	/// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`.
1096	NotEqual,
1097	}
1098
1099	impl ClassUnicodeOpKind {
1100	/// Whether the op is an equality op or not.
1101	pub fn is_equal(&self) -> bool {
1102	match *self {
1103	ClassUnicodeOpKind::Equal \| ClassUnicodeOpKind::Colon => `true`,
1104	_ => `false`,
1105	}
1106	}
1107	}
1108
1109	/// A bracketed character class, e.g., `[a-z0-9]`.
1110	#[derive(Clone, Debug, Eq, PartialEq)]
1111	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1112	pub struct ClassBracketed {
1113	/// The span of this class.
1114	pub span: Span,
1115	/// Whether this class is negated or not. e.g., `[a]` is not negated but
1116	/// `[^a]` is.
1117	pub negated: bool,
1118	/// The type of this set. A set is either a normal union of things, e.g.,
1119	/// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`.
1120	pub kind: ClassSet,
1121	}
1122
1123	/// A character class set.
1124	///
1125	/// This type corresponds to the internal structure of a bracketed character
1126	/// class. That is, every bracketed character is one of two types: a union of
1127	/// items (literals, ranges, other bracketed classes) or a tree of binary set
1128	/// operations.
1129	#[derive(Clone, Debug, Eq, PartialEq)]
1130	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1131	pub enum ClassSet {
1132	/// An item, which can be a single literal, range, nested character class
1133	/// or a union of items.
1134	Item(ClassSetItem),
1135	/// A single binary operation (i.e., &&, -- or ~~).
1136	BinaryOp(ClassSetBinaryOp),
1137	}
1138
1139	impl ClassSet {
1140	/// Build a set from a union.
1141	pub fn union(ast: ClassSetUnion) -> ClassSet {
1142	ClassSet::Item(ClassSetItem::Union(ast))
1143	}
1144
1145	/// Return the span of this character class set.
1146	pub fn span(&self) -> &Span {
1147	match *self {
1148	ClassSet::Item(ref x) => x.span(),
1149	ClassSet::BinaryOp(ref x) => &x.span,
1150	}
1151	}
1152
1153	/// Return true if and only if this class set is empty.
1154	fn is_empty(&self) -> bool {
1155	match *self {
1156	ClassSet::Item(ClassSetItem::Empty(_)) => `true`,
1157	_ => `false`,
1158	}
1159	}
1160	}
1161
1162	/// A single component of a character class set.
1163	#[derive(Clone, Debug, Eq, PartialEq)]
1164	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1165	pub enum ClassSetItem {
1166	/// An empty item.
1167	///
1168	/// Note that a bracketed character class cannot contain a single empty
1169	/// item. Empty items can appear when using one of the binary operators.
1170	/// For example, `[&&]` is the intersection of two empty classes.
1171	Empty(Span),
1172	/// A single literal.
1173	Literal(Literal),
1174	/// A range between two literals.
1175	Range(ClassSetRange),
1176	/// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`.
1177	Ascii(ClassAscii),
1178	/// A Unicode character class, e.g., `\pL` or `\p{Greek}`.
1179	Unicode(ClassUnicode),
1180	/// A perl character class, e.g., `\d` or `\W`.
1181	Perl(ClassPerl),
1182	/// A bracketed character class set, which may contain zero or more
1183	/// character ranges and/or zero or more nested classes. e.g.,
1184	/// `[a-zA-Z\pL]`.
1185	Bracketed(Box<ClassBracketed>),
1186	/// A union of items.
1187	Union(ClassSetUnion),
1188	}
1189
1190	impl ClassSetItem {
1191	/// Return the span of this character class set item.
1192	pub fn span(&self) -> &Span {
1193	match *self {
1194	ClassSetItem::Empty(ref span) => span,
1195	ClassSetItem::Literal(ref x) => &x.span,
1196	ClassSetItem::Range(ref x) => &x.span,
1197	ClassSetItem::Ascii(ref x) => &x.span,
1198	ClassSetItem::Perl(ref x) => &x.span,
1199	ClassSetItem::Unicode(ref x) => &x.span,
1200	ClassSetItem::Bracketed(ref x) => &x.span,
1201	ClassSetItem::Union(ref x) => &x.span,
1202	}
1203	}
1204	}
1205
1206	/// A single character class range in a set.
1207	#[derive(Clone, Debug, Eq, PartialEq)]
1208	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1209	pub struct ClassSetRange {
1210	/// The span of this range.
1211	pub span: Span,
1212	/// The start of this range.
1213	pub start: Literal,
1214	/// The end of this range.
1215	pub end: Literal,
1216	}
1217
1218	impl ClassSetRange {
1219	/// Returns true if and only if this character class range is valid.
1220	///
1221	/// The only case where a range is invalid is if its start is greater than
1222	/// its end.
1223	pub fn is_valid(&self) -> bool {
1224	self.start.c <= self.end.c
1225	}
1226	}
1227
1228	/// A union of items inside a character class set.
1229	#[derive(Clone, Debug, Eq, PartialEq)]
1230	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1231	pub struct ClassSetUnion {
1232	/// The span of the items in this operation. e.g., the `a-z0-9` in
1233	/// `[^a-z0-9]`
1234	pub span: Span,
1235	/// The sequence of items that make up this union.
1236	pub items: Vec<ClassSetItem>,
1237	}
1238
1239	impl ClassSetUnion {
1240	/// Push a new item in this union.
1241	///
1242	/// The ending position of this union's span is updated to the ending
1243	/// position of the span of the item given. If the union is empty, then
1244	/// the starting position of this union is set to the starting position
1245	/// of this item.
1246	///
1247	/// In other words, if you only use this method to add items to a union
1248	/// and you set the spans on each item correctly, then you should never
1249	/// need to adjust the span of the union directly.
1250	pub fn push(&mut self, item: ClassSetItem) {
1251	if self.items.is_empty() {
1252	self.span.start = item.span().start;
1253	}
1254	self.span.end = item.span().end;
1255	self.items.push(item);
1256	}
1257
1258	/// Return this union as a character class set item.
1259	///
1260	/// If this union contains zero items, then an empty union is
1261	/// returned. If this concatenation contains exactly 1 item, then the
1262	/// corresponding item is returned. Otherwise, ClassSetItem::Union is
1263	/// returned.
1264	pub fn into_item(mut self) -> ClassSetItem {
1265	match self.items.len() {
1266	`0` => ClassSetItem::Empty(self.span),
1267	`1` => self.items.pop().unwrap(),
1268	_ => ClassSetItem::Union(self),
1269	}
1270	}
1271	}
1272
1273	/// A Unicode character class set operation.
1274	#[derive(Clone, Debug, Eq, PartialEq)]
1275	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1276	pub struct ClassSetBinaryOp {
1277	/// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`.
1278	pub span: Span,
1279	/// The type of this set operation.
1280	pub kind: ClassSetBinaryOpKind,
1281	/// The left hand side of the operation.
1282	pub lhs: Box<ClassSet>,
1283	/// The right hand side of the operation.
1284	pub rhs: Box<ClassSet>,
1285	}
1286
1287	/// The type of a Unicode character class set operation.
1288	///
1289	/// Note that this doesn't explicitly represent union since there is no
1290	/// explicit union operator. Concatenation inside a character class corresponds
1291	/// to the union operation.
1292	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1293	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1294	pub enum ClassSetBinaryOpKind {
1295	/// The intersection of two sets, e.g., `\pN&&[a-z]`.
1296	Intersection,
1297	/// The difference of two sets, e.g., `\pN--[0-9]`.
1298	Difference,
1299	/// The symmetric difference of two sets. The symmetric difference is the
1300	/// set of elements belonging to one but not both sets.
1301	/// e.g., `[\pL~~[:ascii:]]`.
1302	SymmetricDifference,
1303	}
1304
1305	/// A single zero-width assertion.
1306	#[derive(Clone, Debug, Eq, PartialEq)]
1307	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1308	pub struct Assertion {
1309	/// The span of this assertion.
1310	pub span: Span,
1311	/// The assertion kind, e.g., `\b` or `^`.
1312	pub kind: AssertionKind,
1313	}
1314
1315	/// An assertion kind.
1316	#[derive(Clone, Debug, Eq, PartialEq)]
1317	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1318	pub enum AssertionKind {
1319	/// `^`
1320	StartLine,
1321	/// `$`
1322	EndLine,
1323	/// `\A`
1324	StartText,
1325	/// `\z`
1326	EndText,
1327	/// `\b`
1328	WordBoundary,
1329	/// `\B`
1330	NotWordBoundary,
1331	/// `\b{start}`
1332	WordBoundaryStart,
1333	/// `\b{end}`
1334	WordBoundaryEnd,
1335	/// `\<` (alias for `\b{start}`)
1336	WordBoundaryStartAngle,
1337	/// `\>` (alias for `\b{end}`)
1338	WordBoundaryEndAngle,
1339	/// `\b{start-half}`
1340	WordBoundaryStartHalf,
1341	/// `\b{end-half}`
1342	WordBoundaryEndHalf,
1343	}
1344
1345	/// A repetition operation applied to a regular expression.
1346	#[derive(Clone, Debug, Eq, PartialEq)]
1347	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1348	pub struct Repetition {
1349	/// The span of this operation.
1350	pub span: Span,
1351	/// The actual operation.
1352	pub op: RepetitionOp,
1353	/// Whether this operation was applied greedily or not.
1354	pub greedy: bool,
1355	/// The regular expression under repetition.
1356	pub ast: Box<Ast>,
1357	}
1358
1359	/// The repetition operator itself.
1360	#[derive(Clone, Debug, Eq, PartialEq)]
1361	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1362	pub struct RepetitionOp {
1363	/// The span of this operator. This includes things like `+`, `?` and*
1364	/// `{m,n}`.
1365	pub span: Span,
1366	/// The type of operation.
1367	pub kind: RepetitionKind,
1368	}
1369
1370	/// The kind of a repetition operator.
1371	#[derive(Clone, Debug, Eq, PartialEq)]
1372	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1373	pub enum RepetitionKind {
1374	/// `?`
1375	ZeroOrOne,
1376	/// ``*
1377	ZeroOrMore,
1378	/// `+`
1379	OneOrMore,
1380	/// `{m,n}`
1381	Range(RepetitionRange),
1382	}
1383
1384	/// A range repetition operator.
1385	#[derive(Clone, Debug, Eq, PartialEq)]
1386	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1387	pub enum RepetitionRange {
1388	/// `{m}`
1389	Exactly(u32),
1390	/// `{m,}`
1391	AtLeast(u32),
1392	/// `{m,n}`
1393	Bounded(u32, u32),
1394	}
1395
1396	impl RepetitionRange {
1397	/// Returns true if and only if this repetition range is valid.
1398	///
1399	/// The only case where a repetition range is invalid is if it is bounded
1400	/// and its start is greater than its end.
1401	pub fn is_valid(&self) -> bool {
1402	match *self {
1403	RepetitionRange::Bounded(s, e) if s > e => `false`,
1404	_ => `true`,
1405	}
1406	}
1407	}
1408
1409	/// A grouped regular expression.
1410	///
1411	/// This includes both capturing and non-capturing groups. This does not
1412	/// include flag-only groups like `(?is)`, but does contain any group that
1413	/// contains a sub-expression, e.g., `(a)`, `(?P<name>a)`, `(?:a)` and
1414	/// `(?is:a)`.
1415	#[derive(Clone, Debug, Eq, PartialEq)]
1416	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1417	pub struct Group {
1418	/// The span of this group.
1419	pub span: Span,
1420	/// The kind of this group.
1421	pub kind: GroupKind,
1422	/// The regular expression in this group.
1423	pub ast: Box<Ast>,
1424	}
1425
1426	impl Group {
1427	/// If this group is non-capturing, then this returns the (possibly empty)
1428	/// set of flags. Otherwise, `None` is returned.
1429	pub fn flags(&self) -> Option<&Flags> {
1430	match self.kind {
1431	GroupKind::NonCapturing(ref flags) => Some(flags),
1432	_ => None,
1433	}
1434	}
1435
1436	/// Returns true if and only if this group is capturing.
1437	pub fn is_capturing(&self) -> bool {
1438	match self.kind {
1439	GroupKind::CaptureIndex(_) \| GroupKind::CaptureName { .. } => `true`,
1440	GroupKind::NonCapturing(_) => `false`,
1441	}
1442	}
1443
1444	/// Returns the capture index of this group, if this is a capturing group.
1445	///
1446	/// This returns a capture index precisely when `is_capturing` is `true`.
1447	pub fn capture_index(&self) -> Option<u32> {
1448	match self.kind {
1449	GroupKind::CaptureIndex(i) => Some(i),
1450	GroupKind::CaptureName { ref name, .. } => Some(name.index),
1451	GroupKind::NonCapturing(_) => None,
1452	}
1453	}
1454	}
1455
1456	/// The kind of a group.
1457	#[derive(Clone, Debug, Eq, PartialEq)]
1458	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1459	pub enum GroupKind {
1460	/// `(a)`
1461	CaptureIndex(u32),
1462	/// `(?<name>a)` or `(?P<name>a)`
1463	CaptureName {
1464	/// True if the `?P<` syntax is used and false if the `?<` syntax is used.
1465	starts_with_p: bool,
1466	/// The capture name.
1467	name: CaptureName,
1468	},
1469	/// `(?:a)` and `(?i:a)`
1470	NonCapturing(Flags),
1471	}
1472
1473	/// A capture name.
1474	///
1475	/// This corresponds to the name itself between the angle brackets in, e.g.,
1476	/// `(?P<foo>expr)`.
1477	#[derive(Clone, Debug, Eq, PartialEq)]
1478	pub struct CaptureName {
1479	/// The span of this capture name.
1480	pub span: Span,
1481	/// The capture name.
1482	pub name: String,
1483	/// The capture index.
1484	pub index: u32,
1485	}
1486
1487	#[cfg(feature = "arbitrary")]
1488	impl arbitrary::Arbitrary<'_> for CaptureName {
1489	fn arbitrary(
1490	u: &mut arbitrary::Unstructured,
1491	) -> arbitrary::Result<CaptureName> {
1492	let len = u.arbitrary_len::<char>()?;
1493	if len == `0` {
1494	return Err(arbitrary::Error::NotEnoughData);
1495	}
1496	let mut name: String = String::new();
1497	for _ in `0`..len {
1498	let ch: char = u.arbitrary()?;
1499	let cp = u32::from(ch);
1500	let ascii_letter_offset = u8::try_from(cp % `26`).unwrap();
1501	let ascii_letter = b'a' + ascii_letter_offset;
1502	name.push(char::from(ascii_letter));
1503	}
1504	Ok(CaptureName { span: u.arbitrary()?, name, index: u.arbitrary()? })
1505	}
1506
1507	fn size_hint(depth: usize) -> (usize, Option<usize>) {
1508	arbitrary::size_hint::and_all(&[
1509	Span::size_hint(depth),
1510	usize::size_hint(depth),
1511	u32::size_hint(depth),
1512	])
1513	}
1514	}
1515
1516	/// A group of flags that is not applied to a particular regular expression.
1517	#[derive(Clone, Debug, Eq, PartialEq)]
1518	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1519	pub struct SetFlags {
1520	/// The span of these flags, including the grouping parentheses.
1521	pub span: Span,
1522	/// The actual sequence of flags.
1523	pub flags: Flags,
1524	}
1525
1526	/// A group of flags.
1527	///
1528	/// This corresponds only to the sequence of flags themselves, e.g., `is-u`.
1529	#[derive(Clone, Debug, Eq, PartialEq)]
1530	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1531	pub struct Flags {
1532	/// The span of this group of flags.
1533	pub span: Span,
1534	/// A sequence of flag items. Each item is either a flag or a negation
1535	/// operator.
1536	pub items: Vec<FlagsItem>,
1537	}
1538
1539	impl Flags {
1540	/// Add the given item to this sequence of flags.
1541	///
1542	/// If the item was added successfully, then `None` is returned. If the
1543	/// given item is a duplicate, then `Some(i)` is returned, where
1544	/// `items[i].kind == item.kind`.
1545	pub fn add_item(&mut self, item: FlagsItem) -> Option<usize> {
1546	for (i, x) in self.items.iter().enumerate() {
1547	if x.kind == item.kind {
1548	return Some(i);
1549	}
1550	}
1551	self.items.push(item);
1552	None
1553	}
1554
1555	/// Returns the state of the given flag in this set.
1556	///
1557	/// If the given flag is in the set but is negated, then `Some(false)` is
1558	/// returned.
1559	///
1560	/// If the given flag is in the set and is not negated, then `Some(true)`
1561	/// is returned.
1562	///
1563	/// Otherwise, `None` is returned.
1564	pub fn flag_state(&self, flag: Flag) -> Option<bool> {
1565	let mut negated = `false`;
1566	for x in &self.items {
1567	match x.kind {
1568	FlagsItemKind::Negation => {
1569	negated = `true`;
1570	}
1571	FlagsItemKind::Flag(ref xflag) if xflag == &flag => {
1572	return Some(!negated);
1573	}
1574	_ => {}
1575	}
1576	}
1577	None
1578	}
1579	}
1580
1581	/// A single item in a group of flags.
1582	#[derive(Clone, Debug, Eq, PartialEq)]
1583	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1584	pub struct FlagsItem {
1585	/// The span of this item.
1586	pub span: Span,
1587	/// The kind of this item.
1588	pub kind: FlagsItemKind,
1589	}
1590
1591	/// The kind of an item in a group of flags.
1592	#[derive(Clone, Debug, Eq, PartialEq)]
1593	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1594	pub enum FlagsItemKind {
1595	/// A negation operator applied to all subsequent flags in the enclosing
1596	/// group.
1597	Negation,
1598	/// A single flag in a group.
1599	Flag(Flag),
1600	}
1601
1602	impl FlagsItemKind {
1603	/// Returns true if and only if this item is a negation operator.
1604	pub fn is_negation(&self) -> bool {
1605	match *self {
1606	FlagsItemKind::Negation => `true`,
1607	_ => `false`,
1608	}
1609	}
1610	}
1611
1612	/// A single flag.
1613	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1614	#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
1615	pub enum Flag {
1616	/// `i`
1617	CaseInsensitive,
1618	/// `m`
1619	MultiLine,
1620	/// `s`
1621	DotMatchesNewLine,
1622	/// `U`
1623	SwapGreed,
1624	/// `u`
1625	Unicode,
1626	/// `R`
1627	CRLF,
1628	/// `x`
1629	IgnoreWhitespace,
1630	}
1631
1632	/// A custom `Drop` impl is used for `Ast` such that it uses constant stack
1633	/// space but heap space proportional to the depth of the `Ast`.
1634	impl Drop for Ast {
1635	fn drop(&mut self) {
1636	use core::mem;
1637
1638	match *self {
1639	Ast::Empty(_)
1640	\| Ast::Flags(_)
1641	\| Ast::Literal(_)
1642	\| Ast::Dot(_)
1643	\| Ast::Assertion(_)
1644	\| Ast::ClassUnicode(_)
1645	\| Ast::ClassPerl(_)
1646	// Bracketed classes are recursive, they get their own Drop impl.
1647	\| Ast::ClassBracketed(_) => return,
1648	Ast::Repetition(ref x) if !x.ast.has_subexprs() => return,
1649	Ast::Group(ref x) if !x.ast.has_subexprs() => return,
1650	Ast::Alternation(ref x) if x.asts.is_empty() => return,
1651	Ast::Concat(ref x) if x.asts.is_empty() => return,
1652	_ => {}
1653	}
1654
1655	let empty_span = \|\| Span::splat(Position::new(`0`, `0`, `0`));
1656	let empty_ast = \|\| Ast::empty(empty_span());
1657	let mut stack = vec![mem::replace(self, empty_ast())];
1658	while let Some(mut ast) = stack.pop() {
1659	match ast {
1660	Ast::Empty(_)
1661	\| Ast::Flags(_)
1662	\| Ast::Literal(_)
1663	\| Ast::Dot(_)
1664	\| Ast::Assertion(_)
1665	\| Ast::ClassUnicode(_)
1666	\| Ast::ClassPerl(_)
1667	// Bracketed classes are recursive, so they get their own Drop
1668	// impl.
1669	\| Ast::ClassBracketed(_) => {}
1670	Ast::Repetition(ref mut x) => {
1671	stack.push(mem::replace(&mut x.ast, empty_ast()));
1672	}
1673	Ast::Group(ref mut x) => {
1674	stack.push(mem::replace(&mut x.ast, empty_ast()));
1675	}
1676	Ast::Alternation(ref mut x) => {
1677	stack.extend(x.asts.drain(..));
1678	}
1679	Ast::Concat(ref mut x) => {
1680	stack.extend(x.asts.drain(..));
1681	}
1682	}
1683	}
1684	}
1685	}
1686
1687	/// A custom `Drop` impl is used for `ClassSet` such that it uses constant
1688	/// stack space but heap space proportional to the depth of the `ClassSet`.
1689	impl Drop for ClassSet {
1690	fn drop(&mut self) {
1691	use core::mem;
1692
1693	match *self {
1694	ClassSet::Item(ref item) => match *item {
1695	ClassSetItem::Empty(_)
1696	\| ClassSetItem::Literal(_)
1697	\| ClassSetItem::Range(_)
1698	\| ClassSetItem::Ascii(_)
1699	\| ClassSetItem::Unicode(_)
1700	\| ClassSetItem::Perl(_) => return,
1701	ClassSetItem::Bracketed(ref x) => {
1702	if x.kind.is_empty() {
1703	return;
1704	}
1705	}
1706	ClassSetItem::Union(ref x) => {
1707	if x.items.is_empty() {
1708	return;
1709	}
1710	}
1711	},
1712	ClassSet::BinaryOp(ref op) => {
1713	if op.lhs.is_empty() && op.rhs.is_empty() {
1714	return;
1715	}
1716	}
1717	}
1718
1719	let empty_span = \|\| Span::splat(Position::new(`0`, `0`, `0`));
1720	let empty_set = \|\| ClassSet::Item(ClassSetItem::Empty(empty_span()));
1721	let mut stack = vec![mem::replace(self, empty_set())];
1722	while let Some(mut set) = stack.pop() {
1723	match set {
1724	ClassSet::Item(ref mut item) => match *item {
1725	ClassSetItem::Empty(_)
1726	\| ClassSetItem::Literal(_)
1727	\| ClassSetItem::Range(_)
1728	\| ClassSetItem::Ascii(_)
1729	\| ClassSetItem::Unicode(_)
1730	\| ClassSetItem::Perl(_) => {}
1731	ClassSetItem::Bracketed(ref mut x) => {
1732	stack.push(mem::replace(&mut x.kind, empty_set()));
1733	}
1734	ClassSetItem::Union(ref mut x) => {
1735	stack.extend(x.items.drain(..).map(ClassSet::Item));
1736	}
1737	},
1738	ClassSet::BinaryOp(ref mut op) => {
1739	stack.push(mem::replace(&mut op.lhs, empty_set()));
1740	stack.push(mem::replace(&mut op.rhs, empty_set()));
1741	}
1742	}
1743	}
1744	}
1745	}
1746
1747	#[cfg(test)]
1748	mod tests {
1749	use super::*;
1750
1751	// We use a thread with an explicit stack size to test that our destructor
1752	// for Ast can handle arbitrarily sized expressions in constant stack
1753	// space. In case we run on a platform without threads (WASM?), we limit
1754	// this test to Windows/Unix.
1755	#[test]
1756	#[cfg(any(unix, windows))]
1757	fn no_stack_overflow_on_drop() {
1758	use std::thread;
1759
1760	let run = \|\| {
1761	let span = \|\| Span::splat(Position::new(`0`, `0`, `0`));
1762	let mut ast = Ast::empty(span());
1763	for i in `0`..`200` {
1764	ast = Ast::group(Group {
1765	span: span(),
1766	kind: GroupKind::CaptureIndex(i),
1767	ast: Box::new(ast),
1768	});
1769	}
1770	assert!(!ast.is_empty());
1771	};
1772
1773	// We run our test on a thread with a small stack size so we can
1774	// force the issue more easily.
1775	//
1776	// NOTE(2023-03-21): It turns out that some platforms (like FreeBSD)
1777	// will just barf with very small stack sizes. So we bump this up a bit
1778	// to give more room to breath. When I did this, I confirmed that if
1779	// I remove the custom `Drop` impl for `Ast`, then this test does
1780	// indeed still fail with a stack overflow. (At the time of writing, I
1781	// had to bump it all the way up to 32K before the test would pass even
1782	// without the custom `Drop` impl. So 16K seems like a safe number
1783	// here.)
1784	//
1785	// See: https://github.com/rust-lang/regex/issues/967
1786	thread::Builder::new()
1787	.stack_size(`16` << `10`)
1788	.spawn(run)
1789	.unwrap()
1790	.join()
1791	.unwrap();
1792	}
1793
1794	// This tests that our `Ast` has a reasonable size. This isn't a hard rule
1795	// and it can be increased if given a good enough reason. But this test
1796	// exists because the size of `Ast` was at one point over 200 bytes on a
1797	// 64-bit target. Wow.
1798	#[test]
1799	fn ast_size() {
1800	let max = `2` * core::mem::size_of::<usize>();
1801	let size = core::mem::size_of::<Ast>();
1802	assert!(
1803	size <= max,
1804	"Ast size of {} bytes is bigger than suggested max {}",
1805	size,
1806	max
1807	);
1808	}
1809	}
1810