parse.rs source code [crates/regex_lite/src/hir/parse.rs]

1	use core::cell::{Cell, RefCell};
2
3	use alloc::{
4	boxed::Box,
5	string::{String, ToString},
6	vec,
7	vec::Vec,
8	};
9
10	use crate::{
11	error::Error,
12	hir::{self, Config, Flags, Hir, HirKind},
13	};
14
15	// These are all of the errors that can occur while parsing a regex. Unlike
16	// regex-syntax, our errors are not particularly great. They are just enough
17	// to get a general sense of what went wrong. But in exchange, the error
18	// reporting mechanism is much* simpler than what's in regex-syntax.*
19	//
20	// By convention, we use each of these messages in exactly one place. That
21	// way, every branch that leads to an error has a unique message. This in turn
22	// means that given a message, one can precisely identify which part of the
23	// parser reported it.
24	//
25	// Finally, we give names to each message so that we can reference them in
26	// tests.
27	const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting";
28	const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups";
29	const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name";
30	const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'";
31	const ERR_UNCLOSED_GROUP_QUESTION: &str =
32	"expected closing ')', but got end of pattern";
33	const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('";
34	const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported";
35	const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed";
36	const ERR_MISSING_GROUP_NAME: &str =
37	"expected capture group name, but got end of pattern";
38	const ERR_INVALID_GROUP_NAME: &str = "invalid group name";
39	const ERR_UNCLOSED_GROUP_NAME: &str =
40	"expected end of capture group name, but got end of pattern";
41	const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed";
42	const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag";
43	const ERR_FLAG_REPEATED_NEGATION: &str =
44	"inline flag negation cannot be repeated";
45	const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed";
46	const ERR_FLAG_UNEXPECTED_EOF: &str =
47	"expected ':' or ')' to end inline flags, but got end of pattern";
48	const ERR_FLAG_DANGLING_NEGATION: &str =
49	"inline flags cannot end with negation directive";
50	const ERR_DECIMAL_NO_DIGITS: &str =
51	"expected decimal number, but found no digits";
52	const ERR_DECIMAL_INVALID: &str = "got invalid decimal number";
53	const ERR_HEX_BRACE_INVALID_DIGIT: &str =
54	"expected hexadecimal number in braces, but got non-hex digit";
55	const ERR_HEX_BRACE_UNEXPECTED_EOF: &str =
56	"expected hexadecimal number, but saw end of pattern before closing brace";
57	const ERR_HEX_BRACE_EMPTY: &str =
58	"expected hexadecimal number in braces, but got no digits";
59	const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces";
60	const ERR_HEX_FIXED_UNEXPECTED_EOF: &str =
61	"expected fixed length hexadecimal number, but saw end of pattern first";
62	const ERR_HEX_FIXED_INVALID_DIGIT: &str =
63	"expected fixed length hexadecimal number, but got non-hex digit";
64	const ERR_HEX_FIXED_INVALID: &str =
65	"got invalid fixed length hexadecimal number";
66	const ERR_HEX_UNEXPECTED_EOF: &str =
67	"expected hexadecimal number, but saw end of pattern first";
68	const ERR_ESCAPE_UNEXPECTED_EOF: &str =
69	"saw start of escape sequence, but saw end of pattern before it finished";
70	const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported";
71	const ERR_UNICODE_CLASS_UNSUPPORTED: &str =
72	"Unicode character classes are not supported";
73	const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence";
74	const ERR_POSIX_CLASS_UNRECOGNIZED: &str =
75	"unrecognized POSIX character class";
76	const ERR_UNCOUNTED_REP_SUB_MISSING: &str =
77	"uncounted repetition operator must be applied to a sub-expression";
78	const ERR_COUNTED_REP_SUB_MISSING: &str =
79	"counted repetition operator must be applied to a sub-expression";
80	const ERR_COUNTED_REP_UNCLOSED: &str =
81	"found unclosed counted repetition operator";
82	const ERR_COUNTED_REP_MIN_UNCLOSED: &str =
83	"found incomplete and unclosed counted repetition operator";
84	const ERR_COUNTED_REP_COMMA_UNCLOSED: &str =
85	"found counted repetition operator with a comma that is unclosed";
86	const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str =
87	"found counted repetition with min and max that is unclosed";
88	const ERR_COUNTED_REP_INVALID: &str =
89	"expected closing brace for counted repetition, but got something else";
90	const ERR_COUNTED_REP_INVALID_RANGE: &str =
91	"found counted repetition with a min bigger than its max";
92	const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str =
93	"non-empty character class has no closing bracket";
94	const ERR_CLASS_INVALID_RANGE_ITEM: &str =
95	"character class ranges must start and end with a single character";
96	const ERR_CLASS_INVALID_ITEM: &str =
97	"invalid escape sequence in character class";
98	const ERR_CLASS_UNCLOSED_AFTER_DASH: &str =
99	"non-empty character class has no closing bracket after dash";
100	const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str =
101	"negated character class has no closing bracket";
102	const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str =
103	"character class begins with literal ']' but has no closing bracket";
104	const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class";
105	const ERR_CLASS_UNCLOSED: &str = "found unclosed character class";
106	const ERR_CLASS_NEST_UNSUPPORTED: &str =
107	"nested character classes are not supported";
108	const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str =
109	"character class intersection is not supported";
110	const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
111	"character class difference is not supported";
112	const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
113	"character class symmetric difference is not supported";
114	const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
115	"special word boundary assertion is unclosed or has an invalid character";
116	const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
117	"special word boundary assertion is unrecognized";
118	const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
119	"found start of special word boundary or repetition without an end";
120
121	/// A regular expression parser.
122	///
123	/// This parses a string representation of a regular expression into an
124	/// abstract syntax tree. The size of the tree is proportional to the length
125	/// of the regular expression pattern.
126	///
127	/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
128	#[derive(Clone, Debug)]
129	pub(super) struct Parser<'a> {
130	/// The configuration of the parser as given by the caller.
131	config: Config,
132	/// The pattern we're parsing as given by the caller.
133	pattern: &'a str,
134	/// The call depth of the parser. This is incremented for each
135	/// sub-expression parsed. Its peak value is the maximum nesting of the
136	/// pattern.
137	depth: Cell<u32>,
138	/// The current position of the parser.
139	pos: Cell<usize>,
140	/// The current codepoint of the parser. The codepoint corresponds to the
141	/// codepoint encoded in `pattern` beginning at `pos`.
142	///
143	/// This is `None` if and only if `pos == pattern.len()`.
144	char: Cell<Option<char>>,
145	/// The current capture index.
146	capture_index: Cell<u32>,
147	/// The flags that are currently set.
148	flags: RefCell<Flags>,
149	/// A sorted sequence of capture names. This is used to detect duplicate
150	/// capture names and report an error if one is detected.
151	capture_names: RefCell<Vec<String>>,
152	}
153
154	/// The constructor and a variety of helper routines.
155	impl<'a> Parser<'a> {
156	/// Build a parser from this configuration with the given pattern.
157	pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> {
158	Parser {
159	config,
160	pattern,
161	depth: Cell::new(`0`),
162	pos: Cell::new(`0`),
163	char: Cell::new(pattern.chars().next()),
164	capture_index: Cell::new(`0`),
165	flags: RefCell::new(config.flags),
166	capture_names: RefCell::new(vec![]),
167	}
168	}
169
170	/// Returns the full pattern string that we're parsing.
171	fn pattern(&self) -> &str {
172	self.pattern
173	}
174
175	/// Return the current byte offset of the parser.
176	///
177	/// The offset starts at `0` from the beginning of the regular expression
178	/// pattern string.
179	fn pos(&self) -> usize {
180	self.pos.get()
181	}
182
183	/// Increments the call depth of the parser.
184	///
185	/// If the call depth would exceed the configured nest limit, then this
186	/// returns an error.
187	///
188	/// This returns the old depth.
189	fn increment_depth(&self) -> Result<u32, Error> {
190	let old = self.depth.get();
191	if old > self.config.nest_limit {
192	return Err(Error::new(ERR_TOO_MUCH_NESTING));
193	}
194	// OK because our depth starts at 0, and we return an error if it
195	// ever reaches the limit. So the call depth can never exceed u32::MAX.
196	let new = old.checked_add(`1`).unwrap();
197	self.depth.set(new);
198	Ok(old)
199	}
200
201	/// Decrements the call depth of the parser.
202	///
203	/// This panics if the current depth is 0.
204	fn decrement_depth(&self) {
205	let old = self.depth.get();
206	// If this fails then the caller has a bug in how they're incrementing
207	// and decrementing the depth of the parser's call stack.
208	let new = old.checked_sub(`1`).unwrap();
209	self.depth.set(new);
210	}
211
212	/// Return the codepoint at the current position of the parser.
213	///
214	/// This panics if the parser is positioned at the end of the pattern.
215	fn char(&self) -> char {
216	self.char.get().expect("codepoint, but parser is done")
217	}
218
219	/// Returns true if the next call to `bump` would return false.
220	fn is_done(&self) -> bool {
221	self.pos() == self.pattern.len()
222	}
223
224	/// Returns the flags that are current set for this regex.
225	fn flags(&self) -> Flags {
226	*self.flags.borrow()
227	}
228
229	/// Bump the parser to the next Unicode scalar value.
230	///
231	/// If the end of the input has been reached, then `false` is returned.
232	fn bump(&self) -> bool {
233	if self.is_done() {
234	return `false`;
235	}
236	self.pos.set(self.pos() + self.char().len_utf8());
237	self.char.set(self.pattern()[self.pos()..].chars().next());
238	self.char.get().is_some()
239	}
240
241	/// If the substring starting at the current position of the parser has
242	/// the given prefix, then bump the parser to the character immediately
243	/// following the prefix and return true. Otherwise, don't bump the parser
244	/// and return false.
245	fn bump_if(&self, prefix: &str) -> bool {
246	if self.pattern()[self.pos()..].starts_with(prefix) {
247	for _ in `0`..prefix.chars().count() {
248	self.bump();
249	}
250	`true`
251	} else {
252	`false`
253	}
254	}
255
256	/// Bump the parser, and if the `x` flag is enabled, bump through any
257	/// subsequent spaces. Return true if and only if the parser is not done.
258	fn bump_and_bump_space(&self) -> bool {
259	if !self.bump() {
260	return `false`;
261	}
262	self.bump_space();
263	!self.is_done()
264	}
265
266	/// If the `x` flag is enabled (i.e., whitespace insensitivity with
267	/// comments), then this will advance the parser through all whitespace
268	/// and comments to the next non-whitespace non-comment byte.
269	///
270	/// If the `x` flag is disabled, then this is a no-op.
271	///
272	/// This should be used selectively throughout the parser where
273	/// arbitrary whitespace is permitted when the `x` flag is enabled. For
274	/// example, `{ 5 , 6}` is equivalent to `{5,6}`.
275	fn bump_space(&self) {
276	if !self.flags().ignore_whitespace {
277	return;
278	}
279	while !self.is_done() {
280	if self.char().is_whitespace() {
281	self.bump();
282	} else if self.char() == '#' {
283	self.bump();
284	while !self.is_done() {
285	let c = self.char();
286	self.bump();
287	if c == '`\n`' {
288	break;
289	}
290	}
291	} else {
292	break;
293	}
294	}
295	}
296
297	/// Peek at the next character in the input without advancing the parser.
298	///
299	/// If the input has been exhausted, then this returns `None`.
300	fn peek(&self) -> Option<char> {
301	if self.is_done() {
302	return None;
303	}
304	self.pattern()[self.pos() + self.char().len_utf8()..].chars().next()
305	}
306
307	/// Peeks at the next character in the pattern from the current offset, and
308	/// will ignore spaces when the parser is in whitespace insensitive mode.
309	fn peek_space(&self) -> Option<char> {
310	if !self.flags().ignore_whitespace {
311	return self.peek();
312	}
313	if self.is_done() {
314	return None;
315	}
316	let mut start = self.pos() + self.char().len_utf8();
317	let mut in_comment = `false`;
318	for (i, ch) in self.pattern()[start..].char_indices() {
319	if ch.is_whitespace() {
320	continue;
321	} else if !in_comment && ch == '#' {
322	in_comment = `true`;
323	} else if in_comment && ch == '`\n`' {
324	in_comment = `false`;
325	} else {
326	start += i;
327	break;
328	}
329	}
330	self.pattern()[start..].chars().next()
331	}
332
333	/// Return the next capturing index. Each subsequent call increments the
334	/// internal index. Since the way capture indices are computed is a public
335	/// API guarantee, use of this routine depends on the parser being depth
336	/// first and left-to-right.
337	///
338	/// If the capture limit is exceeded, then an error is returned.
339	fn next_capture_index(&self) -> Result<u32, Error> {
340	let current = self.capture_index.get();
341	let next = current
342	.checked_add(`1`)
343	.ok_or_else(\|\| Error::new(ERR_TOO_MANY_CAPTURES))?;
344	self.capture_index.set(next);
345	Ok(next)
346	}
347
348	/// Adds the given capture name to this parser. If this capture name has
349	/// already been used, then an error is returned.
350	fn add_capture_name(&self, name: &str) -> Result<(), Error> {
351	let mut names = self.capture_names.borrow_mut();
352	match names.binary_search_by(\|n\| name.cmp(n)) {
353	Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)),
354	Err(i) => {
355	names.insert(i, name.to_string());
356	Ok(())
357	}
358	}
359	}
360
361	/// Returns true if and only if the parser is positioned at a look-around
362	/// prefix. The conditions under which this returns true must always
363	/// correspond to a regular expression that would otherwise be consider
364	/// invalid.
365	///
366	/// This should only be called immediately after parsing the opening of
367	/// a group or a set of flags.
368	fn is_lookaround_prefix(&self) -> bool {
369	self.bump_if("?=")
370	\|\| self.bump_if("?!")
371	\|\| self.bump_if("?<=")
372	\|\| self.bump_if("?<!")
373	}
374	}
375
376	/// The actual parser. We try to break out each kind of regex syntax into its
377	/// own routine.
378	impl<'a> Parser<'a> {
379	pub(super) fn parse(&self) -> Result<Hir, Error> {
380	let hir = self.parse_inner()?;
381	// While we also check nesting during parsing, that only checks the
382	// number of recursive parse calls. It does not necessarily cover
383	// all possible recursive nestings of the Hir itself. For example,
384	// repetition operators don't require recursive parse calls. So one
385	// can stack them arbitrarily without overflowing the stack in the
386	// parser. But then if one recurses over the resulting Hir, a stack
387	// overflow is possible. So here we check the Hir nesting level
388	// thoroughly to ensure it isn't nested too deeply.
389	//
390	// Note that we do still need the nesting limit check in the parser as
391	// well, since that will avoid overflowing the stack during parse time
392	// before the complete Hir value is constructed.
393	check_hir_nesting(&hir, self.config.nest_limit)?;
394	Ok(hir)
395	}
396
397	fn parse_inner(&self) -> Result<Hir, Error> {
398	let depth = self.increment_depth()?;
399	let mut alternates = vec![];
400	let mut concat = vec![];
401	loop {
402	self.bump_space();
403	if self.is_done() {
404	break;
405	}
406	match self.char() {
407	'(' => {
408	// Save the old flags and reset them only when we close
409	// the group.
410	let oldflags = *self.flags.borrow();
411	if let Some(sub) = self.parse_group()? {
412	concat.push(sub);
413	// We only reset them here because if 'parse_group'
414	// returns None, then that means it handled a flag
415	// directive, e.g., '(?ism)'. And the whole point is
416	// that those flags remain active until either disabled
417	// or the end of the pattern or current group.
418	*self.flags.borrow_mut() = oldflags;
419	}
420	if self.char.get() != Some(')') {
421	return Err(Error::new(ERR_UNCLOSED_GROUP));
422	}
423	self.bump();
424	}
425	')' => {
426	if depth == `0` {
427	return Err(Error::new(ERR_UNOPENED_GROUP));
428	}
429	break;
430	}
431	'\|' => {
432	alternates.push(Hir::concat(core::mem::take(&mut concat)));
433	self.bump();
434	}
435	'[' => concat.push(self.parse_class()?),
436	'?' \| '*' \| '+' => {
437	concat = self.parse_uncounted_repetition(concat)?;
438	}
439	'{' => {
440	concat = self.parse_counted_repetition(concat)?;
441	}
442	_ => concat.push(self.parse_primitive()?),
443	}
444	}
445	self.decrement_depth();
446	alternates.push(Hir::concat(concat));
447	// N.B. This strips off the "alternation" if there's only one branch.
448	Ok(Hir::alternation(alternates))
449	}
450
451	/// Parses a "primitive" pattern. A primitive is any expression that does
452	/// not contain any sub-expressions.
453	///
454	/// This assumes the parser is pointing at the beginning of the primitive.
455	fn parse_primitive(&self) -> Result<Hir, Error> {
456	let ch = self.char();
457	self.bump();
458	match ch {
459	'`\\`' => self.parse_escape(),
460	'.' => Ok(self.hir_dot()),
461	'^' => Ok(self.hir_anchor_start()),
462	'$' => Ok(self.hir_anchor_end()),
463	ch => Ok(self.hir_char(ch)),
464	}
465	}
466
467	/// Parse an escape sequence. This always results in a "primitive" HIR,
468	/// that is, an HIR with no sub-expressions.
469	///
470	/// This assumes the parser is positioned at the start of the sequence,
471	/// immediately after* the `\`. It advances the parser to the first*
472	/// position immediately following the escape sequence.
473	fn parse_escape(&self) -> Result<Hir, Error> {
474	if self.is_done() {
475	return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF));
476	}
477	let ch = self.char();
478	// Put some of the more complicated routines into helpers.
479	match ch {
480	'0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)),
481	'p' \| 'P' => {
482	return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED))
483	}
484	'x' \| 'u' \| 'U' => return self.parse_hex(),
485	'd' \| 's' \| 'w' \| 'D' \| 'S' \| 'W' => {
486	return Ok(self.parse_perl_class());
487	}
488	_ => {}
489	}
490
491	// Handle all of the one letter sequences inline.
492	self.bump();
493	if hir::is_meta_character(ch) \|\| hir::is_escapeable_character(ch) {
494	return Ok(self.hir_char(ch));
495	}
496	let special = \|ch\| Ok(self.hir_char(ch));
497	match ch {
498	'a' => special('`\x07`'),
499	'f' => special('`\x0C`'),
500	't' => special('`\t`'),
501	'n' => special('`\n`'),
502	'r' => special('`\r`'),
503	'v' => special('`\x0B`'),
504	'A' => Ok(Hir::look(hir::Look::Start)),
505	'z' => Ok(Hir::look(hir::Look::End)),
506	'b' => {
507	let mut hir = Hir::look(hir::Look::Word);
508	if !self.is_done() && self.char() == '{' {
509	if let Some(special) =
510	self.maybe_parse_special_word_boundary()?
511	{
512	hir = special;
513	}
514	}
515	Ok(hir)
516	}
517	'B' => Ok(Hir::look(hir::Look::WordNegate)),
518	'<' => Ok(Hir::look(hir::Look::WordStart)),
519	'>' => Ok(Hir::look(hir::Look::WordEnd)),
520	_ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
521	}
522	}
523
524	/// Attempt to parse a specialty word boundary. That is, `\b{start}`,
525	/// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
526	///
527	/// This is similar to `maybe_parse_ascii_class` in that, in most cases,
528	/// if it fails it will just return `None` with no error. This is done
529	/// because `\b{5}` is a valid expression and we want to let that be parsed
530	/// by the existing counted repetition parsing code. (I thought about just
531	/// invoking the counted repetition code from here, but it seemed a little
532	/// ham-fisted.)
533	///
534	/// Unlike `maybe_parse_ascii_class` though, this can return an error.
535	/// Namely, if we definitely know it isn't a counted repetition, then we
536	/// return an error specific to the specialty word boundaries.
537	///
538	/// This assumes the parser is positioned at a `{` immediately following
539	/// a `\b`. When `None` is returned, the parser is returned to the position
540	/// at which it started: pointing at a `{`.
541	///
542	/// The position given should correspond to the start of the `\b`.
543	fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
544	assert_eq!(self.char(), '{');
545
546	let is_valid_char = \|c\| match c {
547	'A'..='Z' \| 'a'..='z' \| '-' => `true`,
548	_ => `false`,
549	};
550	let start = self.pos();
551	if !self.bump_and_bump_space() {
552	return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
553	}
554	// This is one of the critical bits: if the first non-whitespace
555	// character isn't in [-A-Za-z] (i.e., this can't be a special word
556	// boundary), then we bail and let the counted repetition parser deal
557	// with this.
558	if !is_valid_char(self.char()) {
559	self.pos.set(start);
560	self.char.set(Some('{'));
561	return Ok(None);
562	}
563
564	// Now collect up our chars until we see a '}'.
565	let mut scratch = String::new();
566	while !self.is_done() && is_valid_char(self.char()) {
567	scratch.push(self.char());
568	self.bump_and_bump_space();
569	}
570	if self.is_done() \|\| self.char() != '}' {
571	return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
572	}
573	self.bump();
574	let kind = match scratch.as_str() {
575	"start" => hir::Look::WordStart,
576	"end" => hir::Look::WordEnd,
577	"start-half" => hir::Look::WordStartHalf,
578	"end-half" => hir::Look::WordEndHalf,
579	_ => {
580	return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
581	}
582	};
583	Ok(Some(Hir::look(kind)))
584	}
585
586	/// Parse a hex representation of a Unicode codepoint. This handles both
587	/// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
588	/// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
589	/// the first character immediately following the hexadecimal literal.
590	fn parse_hex(&self) -> Result<Hir, Error> {
591	let digit_len = match self.char() {
592	'x' => `2`,
593	'u' => `4`,
594	'U' => `8`,
595	unk => unreachable!(
596	"invalid start of fixed length hexadecimal number {}",
597	unk
598	),
599	};
600	if !self.bump_and_bump_space() {
601	return Err(Error::new(ERR_HEX_UNEXPECTED_EOF));
602	}
603	if self.char() == '{' {
604	self.parse_hex_brace()
605	} else {
606	self.parse_hex_digits(digit_len)
607	}
608	}
609
610	/// Parse an N-digit hex representation of a Unicode codepoint. This
611	/// expects the parser to be positioned at the first digit and will advance
612	/// the parser to the first character immediately following the escape
613	/// sequence.
614	///
615	/// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
616	/// or 8 (for `\UNNNNNNNN`).
617	fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> {
618	let mut scratch = String::new();
619	for i in `0`..digit_len {
620	if i > `0` && !self.bump_and_bump_space() {
621	return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF));
622	}
623	if !is_hex(self.char()) {
624	return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT));
625	}
626	scratch.push(self.char());
627	}
628	// The final bump just moves the parser past the literal, which may
629	// be EOF.
630	self.bump_and_bump_space();
631	match u32::from_str_radix(&scratch, `16`).ok().and_then(char::from_u32) {
632	None => Err(Error::new(ERR_HEX_FIXED_INVALID)),
633	Some(ch) => Ok(self.hir_char(ch)),
634	}
635	}
636
637	/// Parse a hex representation of any Unicode scalar value. This expects
638	/// the parser to be positioned at the opening brace `{` and will advance
639	/// the parser to the first character following the closing brace `}`.
640	fn parse_hex_brace(&self) -> Result<Hir, Error> {
641	let mut scratch = String::new();
642	while self.bump_and_bump_space() && self.char() != '}' {
643	if !is_hex(self.char()) {
644	return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT));
645	}
646	scratch.push(self.char());
647	}
648	if self.is_done() {
649	return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF));
650	}
651	assert_eq!(self.char(), '}');
652	self.bump_and_bump_space();
653
654	if scratch.is_empty() {
655	return Err(Error::new(ERR_HEX_BRACE_EMPTY));
656	}
657	match u32::from_str_radix(&scratch, `16`).ok().and_then(char::from_u32) {
658	None => Err(Error::new(ERR_HEX_BRACE_INVALID)),
659	Some(ch) => Ok(self.hir_char(ch)),
660	}
661	}
662
663	/// Parse a decimal number into a u32 while trimming leading and trailing
664	/// whitespace.
665	///
666	/// This expects the parser to be positioned at the first position where
667	/// a decimal digit could occur. This will advance the parser to the byte
668	/// immediately following the last contiguous decimal digit.
669	///
670	/// If no decimal digit could be found or if there was a problem parsing
671	/// the complete set of digits into a u32, then an error is returned.
672	fn parse_decimal(&self) -> Result<u32, Error> {
673	let mut scratch = String::new();
674	while !self.is_done() && self.char().is_whitespace() {
675	self.bump();
676	}
677	while !self.is_done() && '0' <= self.char() && self.char() <= '9' {
678	scratch.push(self.char());
679	self.bump_and_bump_space();
680	}
681	while !self.is_done() && self.char().is_whitespace() {
682	self.bump_and_bump_space();
683	}
684	let digits = scratch.as_str();
685	if digits.is_empty() {
686	return Err(Error::new(ERR_DECIMAL_NO_DIGITS));
687	}
688	match u32::from_str_radix(digits, `10`).ok() {
689	Some(n) => Ok(n),
690	None => Err(Error::new(ERR_DECIMAL_INVALID)),
691	}
692	}
693
694	/// Parses an uncounted repetition operator. An uncounted repetition
695	/// operator includes `?`, `` and `+`, but does not include the `{m,n}`*
696	/// syntax. The current character should be one of `?`, `` or `+`. Any*
697	/// other character will result in a panic.
698	///
699	/// This assumes that the parser is currently positioned at the repetition
700	/// operator and advances the parser to the first character after the
701	/// operator. (Note that the operator may include a single additional `?`,
702	/// which makes the operator ungreedy.)
703	///
704	/// The caller should include the concatenation that is being built. The
705	/// concatenation returned includes the repetition operator applied to the
706	/// last expression in the given concatenation.
707	///
708	/// If the concatenation is empty, then this returns an error.
709	fn parse_uncounted_repetition(
710	&self,
711	mut concat: Vec<Hir>,
712	) -> Result<Vec<Hir>, Error> {
713	let sub = match concat.pop() {
714	Some(hir) => Box::new(hir),
715	None => {
716	return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING));
717	}
718	};
719	let (min, max) = match self.char() {
720	'?' => (`0`, Some(`1`)),
721	'*' => (`0`, None),
722	'+' => (`1`, None),
723	unk => unreachable!("unrecognized repetition operator '{}'", unk),
724	};
725	let mut greedy = `true`;
726	if self.bump() && self.char() == '?' {
727	greedy = `false`;
728	self.bump();
729	}
730	if self.flags().swap_greed {
731	greedy = !greedy;
732	}
733	concat.push(Hir::repetition(hir::Repetition {
734	min,
735	max,
736	greedy,
737	sub,
738	}));
739	Ok(concat)
740	}
741
742	/// Parses a counted repetition operation. A counted repetition operator
743	/// corresponds to the `{m,n}` syntax, and does not include the `?`, `` or*
744	/// `+` operators.
745	///
746	/// This assumes that the parser is currently at the opening `{` and
747	/// advances the parser to the first character after the operator. (Note
748	/// that the operator may include a single additional `?`, which makes the
749	/// operator ungreedy.)
750	///
751	/// The caller should include the concatenation that is being built. The
752	/// concatenation returned includes the repetition operator applied to the
753	/// last expression in the given concatenation.
754	///
755	/// If the concatenation is empty, then this returns an error.
756	fn parse_counted_repetition(
757	&self,
758	mut concat: Vec<Hir>,
759	) -> Result<Vec<Hir>, Error> {
760	assert_eq!(self.char(), '{', "expected opening brace");
761	let sub = match concat.pop() {
762	Some(hir) => Box::new(hir),
763	None => {
764	return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING));
765	}
766	};
767	if !self.bump_and_bump_space() {
768	return Err(Error::new(ERR_COUNTED_REP_UNCLOSED));
769	}
770	let min = self.parse_decimal()?;
771	let mut max = Some(min);
772	if self.is_done() {
773	return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED));
774	}
775	if self.char() == ',' {
776	if !self.bump_and_bump_space() {
777	return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED));
778	}
779	if self.char() != '}' {
780	max = Some(self.parse_decimal()?);
781	} else {
782	max = None;
783	}
784	if self.is_done() {
785	return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED));
786	}
787	}
788	if self.char() != '}' {
789	return Err(Error::new(ERR_COUNTED_REP_INVALID));
790	}
791
792	let mut greedy = `true`;
793	if self.bump_and_bump_space() && self.char() == '?' {
794	greedy = `false`;
795	self.bump();
796	}
797	if self.flags().swap_greed {
798	greedy = !greedy;
799	}
800
801	if max.map_or(`false`, \|max\| min > max) {
802	return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE));
803	}
804	concat.push(Hir::repetition(hir::Repetition {
805	min,
806	max,
807	greedy,
808	sub,
809	}));
810	Ok(concat)
811	}
812
813	/// Parses the part of a pattern that starts with a `(`. This is usually
814	/// a group sub-expression, but might just be a directive that enables
815	/// (or disables) certain flags.
816	///
817	/// This assumes the parser is pointing at the opening `(`.
818	fn parse_group(&self) -> Result<Option<Hir>, Error> {
819	assert_eq!(self.char(), '(');
820	self.bump_and_bump_space();
821	if self.is_lookaround_prefix() {
822	return Err(Error::new(ERR_LOOK_UNSUPPORTED));
823	}
824	if self.bump_if("?P<") \|\| self.bump_if("?<") {
825	let index = self.next_capture_index()?;
826	let name = Some(Box::from(self.parse_capture_name()?));
827	let sub = Box::new(self.parse_inner()?);
828	let cap = hir::Capture { index, name, sub };
829	Ok(Some(Hir::capture(cap)))
830	} else if self.bump_if("?") {
831	if self.is_done() {
832	return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION));
833	}
834	let start = self.pos();
835	// The flags get reset in the top-level 'parse' routine.
836	*self.flags.borrow_mut() = self.parse_flags()?;
837	let consumed = self.pos() - start;
838	if self.char() == ')' {
839	// We don't allow empty flags, e.g., `(?)`.
840	if consumed == `0` {
841	return Err(Error::new(ERR_EMPTY_FLAGS));
842	}
843	Ok(None)
844	} else {
845	assert_eq!(':', self.char());
846	self.bump();
847	self.parse_inner().map(Some)
848	}
849	} else {
850	let index = self.next_capture_index()?;
851	let sub = Box::new(self.parse_inner()?);
852	let cap = hir::Capture { index, name: None, sub };
853	Ok(Some(Hir::capture(cap)))
854	}
855	}
856
857	/// Parses a capture group name. Assumes that the parser is positioned at
858	/// the first character in the name following the opening `<` (and may
859	/// possibly be EOF). This advances the parser to the first character
860	/// following the closing `>`.
861	fn parse_capture_name(&self) -> Result<&str, Error> {
862	if self.is_done() {
863	return Err(Error::new(ERR_MISSING_GROUP_NAME));
864	}
865	let start = self.pos();
866	loop {
867	if self.char() == '>' {
868	break;
869	}
870	if !is_capture_char(self.char(), self.pos() == start) {
871	return Err(Error::new(ERR_INVALID_GROUP_NAME));
872	}
873	if !self.bump() {
874	break;
875	}
876	}
877	let end = self.pos();
878	if self.is_done() {
879	return Err(Error::new(ERR_UNCLOSED_GROUP_NAME));
880	}
881	assert_eq!(self.char(), '>');
882	self.bump();
883	let name = &self.pattern()[start..end];
884	if name.is_empty() {
885	return Err(Error::new(ERR_EMPTY_GROUP_NAME));
886	}
887	self.add_capture_name(name)?;
888	Ok(name)
889	}
890
891	/// Parse a sequence of flags starting at the current character.
892	///
893	/// This advances the parser to the character immediately following the
894	/// flags, which is guaranteed to be either `:` or `)`.
895	///
896	/// # Errors
897	///
898	/// If any flags are duplicated, then an error is returned.
899	///
900	/// If the negation operator is used more than once, then an error is
901	/// returned.
902	///
903	/// If no flags could be found or if the negation operation is not followed
904	/// by any flags, then an error is returned.
905	fn parse_flags(&self) -> Result<Flags, Error> {
906	let mut flags = *self.flags.borrow();
907	let mut negate = `false`;
908	// Keeps track of whether the previous flag item was a '-'. We use this
909	// to detect whether there is a dangling '-', which is invalid.
910	let mut last_was_negation = `false`;
911	// A set to keep track of the flags we've seen. Since all flags are
912	// ASCII, we only need 128 bytes.
913	let mut seen = [`false`; `128`];
914	while self.char() != ':' && self.char() != ')' {
915	if self.char() == '-' {
916	last_was_negation = `true`;
917	if negate {
918	return Err(Error::new(ERR_FLAG_REPEATED_NEGATION));
919	}
920	negate = `true`;
921	} else {
922	last_was_negation = `false`;
923	self.parse_flag(&mut flags, negate)?;
924	// OK because every valid flag is ASCII, and we're only here if
925	// the flag is valid.
926	let flag_byte = u8::try_from(self.char()).unwrap();
927	if seen[usize::from(flag_byte)] {
928	return Err(Error::new(ERR_FLAG_DUPLICATE));
929	}
930	seen[usize::from(flag_byte)] = `true`;
931	}
932	if !self.bump() {
933	return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF));
934	}
935	}
936	if last_was_negation {
937	return Err(Error::new(ERR_FLAG_DANGLING_NEGATION));
938	}
939	Ok(flags)
940	}
941
942	/// Parse the current character as a flag. Do not advance the parser.
943	///
944	/// This sets the appropriate boolean value in place on the set of flags
945	/// given. The boolean is inverted when `negate` is true.
946	///
947	/// # Errors
948	///
949	/// If the flag is not recognized, then an error is returned.
950	fn parse_flag(
951	&self,
952	flags: &mut Flags,
953	negate: bool,
954	) -> Result<(), Error> {
955	let enabled = !negate;
956	match self.char() {
957	'i' => flags.case_insensitive = enabled,
958	'm' => flags.multi_line = enabled,
959	's' => flags.dot_matches_new_line = enabled,
960	'U' => flags.swap_greed = enabled,
961	'R' => flags.crlf = enabled,
962	'x' => flags.ignore_whitespace = enabled,
963	// We make a special exception for this flag where we let it
964	// through as a recognized flag, but treat it as a no-op. This in
965	// practice retains some compatibility with the regex crate. It is
966	// a little suspect to do this, but for example, '(?-u:\b).+' in
967	// the regex crate is equivalent to '\b.+' in regex-lite.
968	'u' => {}
969	_ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)),
970	}
971	Ok(())
972	}
973
974	/// Parse a standard character class consisting primarily of characters or
975	/// character ranges.
976	///
977	/// This assumes the parser is positioned at the opening `[`. If parsing
978	/// is successful, then the parser is advanced to the position immediately
979	/// following the closing `]`.
980	fn parse_class(&self) -> Result<Hir, Error> {
981	assert_eq!(self.char(), '[');
982
983	let mut union = vec![];
984	if !self.bump_and_bump_space() {
985	return Err(Error::new(ERR_CLASS_UNCLOSED));
986	}
987	// Determine whether the class is negated or not.
988	let negate = if self.char() != '^' {
989	`false`
990	} else {
991	if !self.bump_and_bump_space() {
992	return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION));
993	}
994	`true`
995	};
996	// Accept any number of `-` as literal `-`.
997	while self.char() == '-' {
998	union.push(hir::ClassRange { start: '-', end: '-' });
999	if !self.bump_and_bump_space() {
1000	return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1001	}
1002	}
1003	// If `]` is the first* char in a set, then interpret it as a literal*
1004	// `]`. That is, an empty class is impossible to write.
1005	if union.is_empty() && self.char() == ']' {
1006	union.push(hir::ClassRange { start: ']', end: ']' });
1007	if !self.bump_and_bump_space() {
1008	return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING));
1009	}
1010	}
1011	loop {
1012	self.bump_space();
1013	if self.is_done() {
1014	return Err(Error::new(ERR_CLASS_UNCLOSED));
1015	}
1016	match self.char() {
1017	'[' => {
1018	// Attempt to treat this as the beginning of a POSIX class.
1019	// If POSIX class parsing fails, then the parser backs up
1020	// to `[`.
1021	if let Some(class) = self.maybe_parse_posix_class() {
1022	union.extend_from_slice(&class.ranges);
1023	continue;
1024	}
1025	// ... otherwise we don't support nested classes.
1026	return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED));
1027	}
1028	']' => {
1029	self.bump();
1030	let mut class = hir::Class::new(union);
1031	// Note that we must apply case folding before negation!
1032	// Consider `(?i)[^x]`. If we applied negation first, then
1033	// the result would be the character class that matched any
1034	// Unicode scalar value.
1035	if self.flags().case_insensitive {
1036	class.ascii_case_fold();
1037	}
1038	if negate {
1039	class.negate();
1040	}
1041	return Ok(Hir::class(class));
1042	}
1043	'&' if self.peek() == Some('&') => {
1044	return Err(Error::new(
1045	ERR_CLASS_INTERSECTION_UNSUPPORTED,
1046	));
1047	}
1048	'-' if self.peek() == Some('-') => {
1049	return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED));
1050	}
1051	'~' if self.peek() == Some('~') => {
1052	return Err(Error::new(
1053	ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED,
1054	));
1055	}
1056	_ => self.parse_class_range(&mut union)?,
1057	}
1058	}
1059	}
1060
1061	/// Parse a single primitive item in a character class set. The item to
1062	/// be parsed can either be one of a simple literal character, a range
1063	/// between two simple literal characters or a "primitive" character
1064	/// class like `\w`.
1065	///
1066	/// If an invalid escape is found, or if a character class is found where
1067	/// a simple literal is expected (e.g., in a range), then an error is
1068	/// returned.
1069	///
1070	/// Otherwise, the range (or ranges) are appended to the given union of
1071	/// ranges.
1072	fn parse_class_range(
1073	&self,
1074	union: &mut Vec<hir::ClassRange>,
1075	) -> Result<(), Error> {
1076	let prim1 = self.parse_class_item()?;
1077	self.bump_space();
1078	if self.is_done() {
1079	return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM));
1080	}
1081	// If the next char isn't a `-`, then we don't have a range.
1082	// There are two exceptions. If the char after a `-` is a `]`, then
1083	// `-` is interpreted as a literal `-`. Alternatively, if the char
1084	// after a `-` is a `-`, then `--` corresponds to a "difference"
1085	// operation. (Which we don't support in regex-lite, but error about
1086	// specifically in an effort to be loud about differences between the
1087	// main regex crate where possible.)
1088	if self.char() != '-'
1089	\|\| self.peek_space() == Some(']')
1090	\|\| self.peek_space() == Some('-')
1091	{
1092	union.extend_from_slice(&into_class_item_ranges(prim1)?);
1093	return Ok(());
1094	}
1095	// OK, now we're parsing a range, so bump past the `-` and parse the
1096	// second half of the range.
1097	if !self.bump_and_bump_space() {
1098	return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
1099	}
1100	let prim2 = self.parse_class_item()?;
1101	let range = hir::ClassRange {
1102	start: into_class_item_range(prim1)?,
1103	end: into_class_item_range(prim2)?,
1104	};
1105	if range.start > range.end {
1106	return Err(Error::new(ERR_CLASS_INVALID_RANGE));
1107	}
1108	union.push(range);
1109	Ok(())
1110	}
1111
1112	/// Parse a single item in a character class as a primitive, where the
1113	/// primitive either consists of a verbatim literal or a single escape
1114	/// sequence.
1115	///
1116	/// This assumes the parser is positioned at the beginning of a primitive,
1117	/// and advances the parser to the first position after the primitive if
1118	/// successful.
1119	///
1120	/// Note that it is the caller's responsibility to report an error if an
1121	/// illegal primitive was parsed.
1122	fn parse_class_item(&self) -> Result<Hir, Error> {
1123	let ch = self.char();
1124	self.bump();
1125	if ch == '`\\`' {
1126	self.parse_escape()
1127	} else {
1128	Ok(Hir::char(ch))
1129	}
1130	}
1131
1132	/// Attempt to parse a POSIX character class, e.g., `[:alnum:]`.
1133	///
1134	/// This assumes the parser is positioned at the opening `[`.
1135	///
1136	/// If no valid POSIX character class could be found, then this does not
1137	/// advance the parser and `None` is returned. Otherwise, the parser is
1138	/// advanced to the first byte following the closing `]` and the
1139	/// corresponding POSIX class is returned.
1140	fn maybe_parse_posix_class(&self) -> Option<hir::Class> {
1141	// POSIX character classes are interesting from a parsing perspective
1142	// because parsing cannot fail with any interesting error. For example,
1143	// in order to use an POSIX character class, it must be enclosed in
1144	// double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1145	// of it as "POSIX character classes have the syntax `[:NAME:]` which
1146	// can only appear within character brackets." This means that things
1147	// like `[[:lower:]A]` are legal constructs.
1148	//
1149	// However, if one types an incorrect POSIX character class, e.g.,
1150	// `[[:loower:]]`, then we treat that as if it were normal nested
1151	// character class containing the characters `:elorw`. (Which isn't
1152	// supported and results in an error in regex-lite.) One might argue
1153	// that we should return an error instead since the repeated colons
1154	// give away the intent to write an POSIX class. But what if the user
1155	// typed `[[:lower]]` instead? How can we tell that was intended to be
1156	// a POSXI class and not just a normal nested class?
1157	//
1158	// Reasonable people can probably disagree over this, but for better
1159	// or worse, we implement semantics that never fails at the expense of
1160	// better failure modes.
1161	assert_eq!(self.char(), '[');
1162
1163	// If parsing fails, then we back up the parser to this starting point.
1164	let start_pos = self.pos();
1165	let start_char = self.char.get();
1166	let reset = \|\| {
1167	self.pos.set(start_pos);
1168	self.char.set(start_char);
1169	};
1170
1171	let mut negated = `false`;
1172	if !self.bump() \|\| self.char() != ':' {
1173	reset();
1174	return None;
1175	}
1176	if !self.bump() {
1177	reset();
1178	return None;
1179	}
1180	if self.char() == '^' {
1181	negated = `true`;
1182	if !self.bump() {
1183	reset();
1184	return None;
1185	}
1186	}
1187	let name_start = self.pos();
1188	while self.char() != ':' && self.bump() {}
1189	if self.is_done() {
1190	reset();
1191	return None;
1192	}
1193	let name = &self.pattern()[name_start..self.pos()];
1194	if !self.bump_if(":]") {
1195	reset();
1196	return None;
1197	}
1198	if let Ok(ranges) = posix_class(name) {
1199	let mut class = hir::Class::new(ranges);
1200	if negated {
1201	class.negate();
1202	}
1203	return Some(class);
1204	}
1205	reset();
1206	None
1207	}
1208
1209	/// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
1210	/// parser is currently at a valid character class name and will be
1211	/// advanced to the character immediately following the class.
1212	fn parse_perl_class(&self) -> Hir {
1213	let ch = self.char();
1214	self.bump();
1215	let mut class = hir::Class::new(match ch {
1216	'd' \| 'D' => posix_class("digit").unwrap(),
1217	's' \| 'S' => posix_class("space").unwrap(),
1218	'w' \| 'W' => posix_class("word").unwrap(),
1219	unk => unreachable!("invalid Perl class `\\`{}", unk),
1220	});
1221	if ch.is_ascii_uppercase() {
1222	class.negate();
1223	}
1224	Hir::class(class)
1225	}
1226
1227	fn hir_dot(&self) -> Hir {
1228	if self.flags().dot_matches_new_line {
1229	Hir::class(hir::Class::new([hir::ClassRange {
1230	start: '`\x00`',
1231	end: '`\u{10FFFF}`',
1232	}]))
1233	} else if self.flags().crlf {
1234	Hir::class(hir::Class::new([
1235	hir::ClassRange { start: '`\x00`', end: '`\x09`' },
1236	hir::ClassRange { start: '`\x0B`', end: '`\x0C`' },
1237	hir::ClassRange { start: '`\x0E`', end: '`\u{10FFFF}`' },
1238	]))
1239	} else {
1240	Hir::class(hir::Class::new([
1241	hir::ClassRange { start: '`\x00`', end: '`\x09`' },
1242	hir::ClassRange { start: '`\x0B`', end: '`\u{10FFFF}`' },
1243	]))
1244	}
1245	}
1246
1247	fn hir_anchor_start(&self) -> Hir {
1248	let look = if self.flags().multi_line {
1249	if self.flags().crlf {
1250	hir::Look::StartCRLF
1251	} else {
1252	hir::Look::StartLF
1253	}
1254	} else {
1255	hir::Look::Start
1256	};
1257	Hir::look(look)
1258	}
1259
1260	fn hir_anchor_end(&self) -> Hir {
1261	let look = if self.flags().multi_line {
1262	if self.flags().crlf {
1263	hir::Look::EndCRLF
1264	} else {
1265	hir::Look::EndLF
1266	}
1267	} else {
1268	hir::Look::End
1269	};
1270	Hir::look(look)
1271	}
1272
1273	fn hir_char(&self, ch: char) -> Hir {
1274	if self.flags().case_insensitive {
1275	let this = hir::ClassRange { start: ch, end: ch };
1276	if let Some(folded) = this.ascii_case_fold() {
1277	return Hir::class(hir::Class::new([this, folded]));
1278	}
1279	}
1280	Hir::char(ch)
1281	}
1282	}
1283
1284	/// This checks the depth of the given `Hir` value, and if it exceeds the given
1285	/// limit, then an error is returned.
1286	fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
1287	fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
1288	if depth > limit {
1289	return Err(Error::new(ERR_TOO_MUCH_NESTING));
1290	}
1291	let Some(next_depth) = depth.checked_add(`1`) else {
1292	return Err(Error::new(ERR_TOO_MUCH_NESTING));
1293	};
1294	match *hir.kind() {
1295	HirKind::Empty
1296	\| HirKind::Char(_)
1297	\| HirKind::Class(_)
1298	\| HirKind::Look(_) => Ok(()),
1299	HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
1300	recurse(sub, limit, next_depth)
1301	}
1302	HirKind::Capture(hir::Capture { ref sub, .. }) => {
1303	recurse(sub, limit, next_depth)
1304	}
1305	HirKind::Concat(ref subs) \| HirKind::Alternation(ref subs) => {
1306	for sub in subs.iter() {
1307	recurse(sub, limit, next_depth)?;
1308	}
1309	Ok(())
1310	}
1311	}
1312	}
1313	recurse(hir, limit, `0`)
1314	}
1315
1316	/// Converts the given Hir to a literal char if the Hir is just a single
1317	/// character. Otherwise this returns an error.
1318	///
1319	/// This is useful in contexts where you can only accept a single character,
1320	/// but where it is convenient to parse something more general. For example,
1321	/// parsing a single part of a character class range. It's useful to reuse
1322	/// the literal parsing code, but that code can itself return entire classes
1323	/// which can't be used as the start/end of a class range.
1324	fn into_class_item_range(hir: Hir) -> Result<char, Error> {
1325	match hir.kind {
1326	HirKind::Char(ch: char) => Ok(ch),
1327	_ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)),
1328	}
1329	}
1330
1331	fn into_class_item_ranges(
1332	mut hir: Hir,
1333	) -> Result<Vec<hir::ClassRange>, Error> {
1334	match core::mem::replace(&mut hir.kind, src:HirKind::Empty) {
1335	HirKind::Char(ch: char) => Ok(vec![hir::ClassRange { start: ch, end: ch }]),
1336	HirKind::Class(hir::Class { ranges: Vec }) => Ok(ranges),
1337	_ => Err(Error::new(ERR_CLASS_INVALID_ITEM)),
1338	}
1339	}
1340
1341	/// Returns an iterator of character class ranges for the given named POSIX
1342	/// character class. If no such character class exists for the name given, then
1343	/// an error is returned.
1344	fn posix_class(
1345	kind: &str,
1346	) -> Result<impl Iterator<Item = hir::ClassRange>, Error> {
1347	let slice: &'static [(u8, u8)] = match kind {
1348	"alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1349	"alpha" => &[(b'A', b'Z'), (b'a', b'z')],
1350	"ascii" => &[(b'`\x00`', b'`\x7F`')],
1351	"blank" => &[(b'`\t`', b'`\t`'), (b' ', b' ')],
1352	"cntrl" => &[(b'`\x00`', b'`\x1F`'), (b'`\x7F`', b'`\x7F`')],
1353	"digit" => &[(b'0', b'9')],
1354	"graph" => &[(b'!', b'~')],
1355	"lower" => &[(b'a', b'z')],
1356	"print" => &[(b' ', b'~')],
1357	"punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1358	"space" => &[
1359	(b'`\t`', b'`\t`'),
1360	(b'`\n`', b'`\n`'),
1361	(b'`\x0B`', b'`\x0B`'),
1362	(b'`\x0C`', b'`\x0C`'),
1363	(b'`\r`', b'`\r`'),
1364	(b' ', b' '),
1365	],
1366	"upper" => &[(b'A', b'Z')],
1367	"word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1368	"xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1369	_ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)),
1370	};
1371	Ok(slice.iter().map(\|&(start, end)\| hir::ClassRange {
1372	start: char::from(start),
1373	end: char::from(end),
1374	}))
1375	}
1376
1377	/// Returns true if the given character is a hexadecimal digit.
1378	fn is_hex(c: char) -> bool {
1379	('0' <= c && c <= '9') \|\| ('a' <= c && c <= 'f') \|\| ('A' <= c && c <= 'F')
1380	}
1381
1382	/// Returns true if the given character is a valid in a capture group name.
1383	///
1384	/// If `first` is true, then `c` is treated as the first character in the
1385	/// group name (which must be alphabetic or underscore).
1386	fn is_capture_char(c: char, first: bool) -> bool {
1387	if first {
1388	c == '_' \|\| c.is_alphabetic()
1389	} else {
1390	c == '_' \|\| c == '.' \|\| c == '[' \|\| c == ']' \|\| c.is_alphanumeric()
1391	}
1392	}
1393
1394	#[cfg(test)]
1395	mod tests {
1396	use super::*;
1397
1398	fn p(pattern: &str) -> Hir {
1399	Parser::new(Config::default(), pattern).parse_inner().unwrap()
1400	}
1401
1402	fn perr(pattern: &str) -> String {
1403	Parser::new(Config::default(), pattern)
1404	.parse_inner()
1405	.unwrap_err()
1406	.to_string()
1407	}
1408
1409	fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir {
1410	Hir::class(hir::Class::new(
1411	it.into_iter().map(\|(start, end)\| hir::ClassRange { start, end }),
1412	))
1413	}
1414
1415	fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir {
1416	Hir::class(hir::Class::new(
1417	it.into_iter().map(\|ch\| hir::ClassRange { start: ch, end: ch }),
1418	))
1419	}
1420
1421	fn posix(name: &str) -> Hir {
1422	Hir::class(hir::Class::new(posix_class(name).unwrap()))
1423	}
1424
1425	fn cap(index: u32, sub: Hir) -> Hir {
1426	Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) })
1427	}
1428
1429	fn named_cap(index: u32, name: &str, sub: Hir) -> Hir {
1430	Hir::capture(hir::Capture {
1431	index,
1432	name: Some(Box::from(name)),
1433	sub: Box::new(sub),
1434	})
1435	}
1436
1437	#[test]
1438	fn ok_literal() {
1439	assert_eq!(p("a"), Hir::char('a'));
1440	assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')]));
1441	assert_eq!(p("💩"), Hir::char('💩'));
1442	}
1443
1444	#[test]
1445	fn ok_meta_escapes() {
1446	assert_eq!(p(r"\"), Hir::char(''));
1447	assert_eq!(p(r"\+"), Hir::char('+'));
1448	assert_eq!(p(r"\?"), Hir::char('?'));
1449	assert_eq!(p(r"\\|"), Hir::char('\|'));
1450	assert_eq!(p(r"\("), Hir::char('('));
1451	assert_eq!(p(r"\)"), Hir::char(')'));
1452	assert_eq!(p(r"\^"), Hir::char('^'));
1453	assert_eq!(p(r"\$"), Hir::char('$'));
1454	assert_eq!(p(r"\["), Hir::char('['));
1455	assert_eq!(p(r"\]"), Hir::char(']'));
1456	}
1457
1458	#[test]
1459	fn ok_special_escapes() {
1460	assert_eq!(p(r"\a"), Hir::char('`\x07`'));
1461	assert_eq!(p(r"\f"), Hir::char('`\x0C`'));
1462	assert_eq!(p(r"\t"), Hir::char('`\t`'));
1463	assert_eq!(p(r"\n"), Hir::char('`\n`'));
1464	assert_eq!(p(r"\r"), Hir::char('`\r`'));
1465	assert_eq!(p(r"\v"), Hir::char('`\x0B`'));
1466	assert_eq!(p(r"\A"), Hir::look(hir::Look::Start));
1467	assert_eq!(p(r"\z"), Hir::look(hir::Look::End));
1468	assert_eq!(p(r"\b"), Hir::look(hir::Look::Word));
1469	assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate));
1470	}
1471
1472	#[test]
1473	fn ok_hex() {
1474	// fixed length
1475	assert_eq!(p(r"\x41"), Hir::char('A'));
1476	assert_eq!(p(r"\u2603"), Hir::char('☃'));
1477	assert_eq!(p(r"\U0001F4A9"), Hir::char('💩'));
1478	// braces
1479	assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩'));
1480	assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩'));
1481	assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩'));
1482	}
1483
1484	#[test]
1485	fn ok_perl() {
1486	assert_eq!(p(r"\d"), posix("digit"));
1487	assert_eq!(p(r"\s"), posix("space"));
1488	assert_eq!(p(r"\w"), posix("word"));
1489
1490	let negated = \|name\| {
1491	let mut class = hir::Class::new(posix_class(name).unwrap());
1492	class.negate();
1493	Hir::class(class)
1494	};
1495	assert_eq!(p(r"\D"), negated("digit"));
1496	assert_eq!(p(r"\S"), negated("space"));
1497	assert_eq!(p(r"\W"), negated("word"));
1498	}
1499
1500	#[test]
1501	fn ok_flags_and_primitives() {
1502	assert_eq!(p(r"a"), Hir::char('a'));
1503	assert_eq!(p(r"(?i:a)"), singles(['A', 'a']));
1504
1505	assert_eq!(p(r"^"), Hir::look(hir::Look::Start));
1506	assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF));
1507	assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF));
1508
1509	assert_eq!(p(r"$"), Hir::look(hir::Look::End));
1510	assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF));
1511	assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF));
1512
1513	assert_eq!(p(r"."), class([('`\x00`', '`\x09`'), ('`\x0B`', '`\u{10FFFF}`')]));
1514	assert_eq!(
1515	p(r"(?R:.)"),
1516	class([
1517	('`\x00`', '`\x09`'),
1518	('`\x0B`', '`\x0C`'),
1519	('`\x0E`', '`\u{10FFFF}`'),
1520	])
1521	);
1522	assert_eq!(p(r"(?s:.)"), class([('`\x00`', '`\u{10FFFF}`')]));
1523	assert_eq!(p(r"(?sR:.)"), class([('`\x00`', '`\u{10FFFF}`')]));
1524	}
1525
1526	#[test]
1527	fn ok_alternate() {
1528	assert_eq!(
1529	p(r"a\|b"),
1530	Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1531	);
1532	assert_eq!(
1533	p(r"(?:a\|b)"),
1534	Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1535	);
1536
1537	assert_eq!(
1538	p(r"(a\|b)"),
1539	cap(`1`, Hir::alternation(vec![Hir::char('a'), Hir::char('b')]))
1540	);
1541	assert_eq!(
1542	p(r"(?<foo>a\|b)"),
1543	named_cap(
1544	`1`,
1545	"foo",
1546	Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
1547	)
1548	);
1549
1550	assert_eq!(
1551	p(r"a\|b\|c"),
1552	Hir::alternation(vec![
1553	Hir::char('a'),
1554	Hir::char('b'),
1555	Hir::char('c')
1556	])
1557	);
1558
1559	assert_eq!(
1560	p(r"ax\|by\|cz"),
1561	Hir::alternation(vec![
1562	Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1563	Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1564	Hir::concat(vec![Hir::char('c'), Hir::char('z')]),
1565	])
1566	);
1567	assert_eq!(
1568	p(r"(ax\|(by\|(cz)))"),
1569	cap(
1570	`1`,
1571	Hir::alternation(vec![
1572	Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
1573	cap(
1574	`2`,
1575	Hir::alternation(vec![
1576	Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
1577	cap(
1578	`3`,
1579	Hir::concat(vec![
1580	Hir::char('c'),
1581	Hir::char('z')
1582	])
1583	),
1584	])
1585	),
1586	])
1587	)
1588	);
1589
1590	assert_eq!(
1591	p(r"\|"),
1592	Hir::alternation(vec![Hir::empty(), Hir::empty()])
1593	);
1594	assert_eq!(
1595	p(r"\|\|"),
1596	Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()])
1597	);
1598
1599	assert_eq!(
1600	p(r"a\|"),
1601	Hir::alternation(vec![Hir::char('a'), Hir::empty()])
1602	);
1603	assert_eq!(
1604	p(r"\|a"),
1605	Hir::alternation(vec![Hir::empty(), Hir::char('a')])
1606	);
1607
1608	assert_eq!(
1609	p(r"(\|)"),
1610	cap(`1`, Hir::alternation(vec![Hir::empty(), Hir::empty()]))
1611	);
1612	assert_eq!(
1613	p(r"(a\|)"),
1614	cap(`1`, Hir::alternation(vec![Hir::char('a'), Hir::empty()]))
1615	);
1616	assert_eq!(
1617	p(r"(\|a)"),
1618	cap(`1`, Hir::alternation(vec![Hir::empty(), Hir::char('a')]))
1619	);
1620	}
1621
1622	#[test]
1623	fn ok_flag_group() {
1624	assert_eq!(
1625	p("a(?i:b)"),
1626	Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1627	);
1628	}
1629
1630	#[test]
1631	fn ok_flag_directive() {
1632	assert_eq!(p("(?i)a"), singles(['A', 'a']));
1633	assert_eq!(p("a(?i)"), Hir::char('a'));
1634	assert_eq!(
1635	p("a(?i)b"),
1636	Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
1637	);
1638	assert_eq!(
1639	p("a(?i)a(?-i)a"),
1640	Hir::concat(vec![
1641	Hir::char('a'),
1642	singles(['A', 'a']),
1643	Hir::char('a'),
1644	])
1645	);
1646	assert_eq!(
1647	p("a(?:(?i)a)a"),
1648	Hir::concat(vec![
1649	Hir::char('a'),
1650	singles(['A', 'a']),
1651	Hir::char('a'),
1652	])
1653	);
1654	assert_eq!(
1655	p("a((?i)a)a"),
1656	Hir::concat(vec![
1657	Hir::char('a'),
1658	cap(`1`, singles(['A', 'a'])),
1659	Hir::char('a'),
1660	])
1661	);
1662	}
1663
1664	#[test]
1665	fn ok_uncounted_repetition() {
1666	assert_eq!(
1667	p(r"a?"),
1668	Hir::repetition(hir::Repetition {
1669	min: `0`,
1670	max: Some(`1`),
1671	greedy: `true`,
1672	sub: Box::new(Hir::char('a')),
1673	}),
1674	);
1675	assert_eq!(
1676	p(r"a*"),
1677	Hir::repetition(hir::Repetition {
1678	min: `0`,
1679	max: None,
1680	greedy: `true`,
1681	sub: Box::new(Hir::char('a')),
1682	}),
1683	);
1684	assert_eq!(
1685	p(r"a+"),
1686	Hir::repetition(hir::Repetition {
1687	min: `1`,
1688	max: None,
1689	greedy: `true`,
1690	sub: Box::new(Hir::char('a')),
1691	}),
1692	);
1693
1694	assert_eq!(
1695	p(r"a??"),
1696	Hir::repetition(hir::Repetition {
1697	min: `0`,
1698	max: Some(`1`),
1699	greedy: `false`,
1700	sub: Box::new(Hir::char('a')),
1701	}),
1702	);
1703	assert_eq!(
1704	p(r"a*?"),
1705	Hir::repetition(hir::Repetition {
1706	min: `0`,
1707	max: None,
1708	greedy: `false`,
1709	sub: Box::new(Hir::char('a')),
1710	}),
1711	);
1712	assert_eq!(
1713	p(r"a+?"),
1714	Hir::repetition(hir::Repetition {
1715	min: `1`,
1716	max: None,
1717	greedy: `false`,
1718	sub: Box::new(Hir::char('a')),
1719	}),
1720	);
1721
1722	assert_eq!(
1723	p(r"a?b"),
1724	Hir::concat(vec![
1725	Hir::repetition(hir::Repetition {
1726	min: `0`,
1727	max: Some(`1`),
1728	greedy: `true`,
1729	sub: Box::new(Hir::char('a')),
1730	}),
1731	Hir::char('b'),
1732	]),
1733	);
1734
1735	assert_eq!(
1736	p(r"ab?"),
1737	Hir::concat(vec![
1738	Hir::char('a'),
1739	Hir::repetition(hir::Repetition {
1740	min: `0`,
1741	max: Some(`1`),
1742	greedy: `true`,
1743	sub: Box::new(Hir::char('b')),
1744	}),
1745	]),
1746	);
1747
1748	assert_eq!(
1749	p(r"(?:ab)?"),
1750	Hir::repetition(hir::Repetition {
1751	min: `0`,
1752	max: Some(`1`),
1753	greedy: `true`,
1754	sub: Box::new(Hir::concat(vec![
1755	Hir::char('a'),
1756	Hir::char('b')
1757	])),
1758	}),
1759	);
1760
1761	assert_eq!(
1762	p(r"(ab)?"),
1763	Hir::repetition(hir::Repetition {
1764	min: `0`,
1765	max: Some(`1`),
1766	greedy: `true`,
1767	sub: Box::new(cap(
1768	`1`,
1769	Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1770	)),
1771	}),
1772	);
1773
1774	assert_eq!(
1775	p(r"\|a?"),
1776	Hir::alternation(vec![
1777	Hir::empty(),
1778	Hir::repetition(hir::Repetition {
1779	min: `0`,
1780	max: Some(`1`),
1781	greedy: `true`,
1782	sub: Box::new(Hir::char('a')),
1783	})
1784	]),
1785	);
1786	}
1787
1788	#[test]
1789	fn ok_counted_repetition() {
1790	assert_eq!(
1791	p(r"a{5}"),
1792	Hir::repetition(hir::Repetition {
1793	min: `5`,
1794	max: Some(`5`),
1795	greedy: `true`,
1796	sub: Box::new(Hir::char('a')),
1797	}),
1798	);
1799	assert_eq!(
1800	p(r"a{5}?"),
1801	Hir::repetition(hir::Repetition {
1802	min: `5`,
1803	max: Some(`5`),
1804	greedy: `false`,
1805	sub: Box::new(Hir::char('a')),
1806	}),
1807	);
1808
1809	assert_eq!(
1810	p(r"a{5,}"),
1811	Hir::repetition(hir::Repetition {
1812	min: `5`,
1813	max: None,
1814	greedy: `true`,
1815	sub: Box::new(Hir::char('a')),
1816	}),
1817	);
1818
1819	assert_eq!(
1820	p(r"a{5,9}"),
1821	Hir::repetition(hir::Repetition {
1822	min: `5`,
1823	max: Some(`9`),
1824	greedy: `true`,
1825	sub: Box::new(Hir::char('a')),
1826	}),
1827	);
1828
1829	assert_eq!(
1830	p(r"ab{5}c"),
1831	Hir::concat(vec![
1832	Hir::char('a'),
1833	Hir::repetition(hir::Repetition {
1834	min: `5`,
1835	max: Some(`5`),
1836	greedy: `true`,
1837	sub: Box::new(Hir::char('b')),
1838	}),
1839	Hir::char('c'),
1840	]),
1841	);
1842
1843	assert_eq!(
1844	p(r"a{ 5 }"),
1845	Hir::repetition(hir::Repetition {
1846	min: `5`,
1847	max: Some(`5`),
1848	greedy: `true`,
1849	sub: Box::new(Hir::char('a')),
1850	}),
1851	);
1852	assert_eq!(
1853	p(r"a{ 5 , 9 }"),
1854	Hir::repetition(hir::Repetition {
1855	min: `5`,
1856	max: Some(`9`),
1857	greedy: `true`,
1858	sub: Box::new(Hir::char('a')),
1859	}),
1860	);
1861	}
1862
1863	#[test]
1864	fn ok_group_unnamed() {
1865	assert_eq!(p("(a)"), cap(`1`, Hir::char('a')));
1866	assert_eq!(
1867	p("(ab)"),
1868	cap(`1`, Hir::concat(vec![Hir::char('a'), Hir::char('b')]))
1869	);
1870	}
1871
1872	#[test]
1873	fn ok_group_named() {
1874	assert_eq!(p("(?P<foo>a)"), named_cap(`1`, "foo", Hir::char('a')));
1875	assert_eq!(p("(?<foo>a)"), named_cap(`1`, "foo", Hir::char('a')));
1876
1877	assert_eq!(
1878	p("(?P<foo>ab)"),
1879	named_cap(
1880	`1`,
1881	"foo",
1882	Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1883	)
1884	);
1885	assert_eq!(
1886	p("(?<foo>ab)"),
1887	named_cap(
1888	`1`,
1889	"foo",
1890	Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1891	)
1892	);
1893
1894	assert_eq!(p(r"(?<a>z)"), named_cap(`1`, "a", Hir::char('z')));
1895	assert_eq!(p(r"(?P<a>z)"), named_cap(`1`, "a", Hir::char('z')));
1896
1897	assert_eq!(p(r"(?<a_1>z)"), named_cap(`1`, "a_1", Hir::char('z')));
1898	assert_eq!(p(r"(?P<a_1>z)"), named_cap(`1`, "a_1", Hir::char('z')));
1899
1900	assert_eq!(p(r"(?<a.1>z)"), named_cap(`1`, "a.1", Hir::char('z')));
1901	assert_eq!(p(r"(?P<a.1>z)"), named_cap(`1`, "a.1", Hir::char('z')));
1902
1903	assert_eq!(p(r"(?<a[1]>z)"), named_cap(`1`, "a[1]", Hir::char('z')));
1904	assert_eq!(p(r"(?P<a[1]>z)"), named_cap(`1`, "a[1]", Hir::char('z')));
1905
1906	assert_eq!(p(r"(?<a¾>z)"), named_cap(`1`, "a¾", Hir::char('z')));
1907	assert_eq!(p(r"(?P<a¾>z)"), named_cap(`1`, "a¾", Hir::char('z')));
1908
1909	assert_eq!(p(r"(?<名字>z)"), named_cap(`1`, "名字", Hir::char('z')));
1910	assert_eq!(p(r"(?P<名字>z)"), named_cap(`1`, "名字", Hir::char('z')));
1911	}
1912
1913	#[test]
1914	fn ok_class() {
1915	assert_eq!(p(r"[a]"), singles(['a']));
1916	assert_eq!(p(r"[a\]]"), singles(['a', ']']));
1917	assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z']));
1918	assert_eq!(p(r"[ab]"), class([('a', 'b')]));
1919	assert_eq!(p(r"[a-]"), singles(['a', '-']));
1920	assert_eq!(p(r"[-a]"), singles(['a', '-']));
1921	assert_eq!(p(r"[--a]"), singles(['a', '-']));
1922	assert_eq!(p(r"[---a]"), singles(['a', '-']));
1923	assert_eq!(p(r"[[:alnum:]]"), posix("alnum"));
1924	assert_eq!(p(r"[\w]"), posix("word"));
1925	assert_eq!(p(r"[a\wz]"), posix("word"));
1926	assert_eq!(p(r"[\s\S]"), class([('`\x00`', '`\u{10FFFF}`')]));
1927	assert_eq!(p(r"[^\s\S]"), Hir::fail());
1928	assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')]));
1929	assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')]));
1930	assert_eq!(p(r"[]]"), singles([']']));
1931	assert_eq!(p(r"[]a]"), singles([']', 'a']));
1932	assert_eq!(p(r"[]\[]"), singles(['[', ']']));
1933	assert_eq!(p(r"[\[]"), singles(['[']));
1934
1935	assert_eq!(p(r"(?i)[a]"), singles(['A', 'a']));
1936	assert_eq!(p(r"(?i)[A]"), singles(['A', 'a']));
1937	assert_eq!(p(r"(?i)[k]"), singles(['K', 'k']));
1938	assert_eq!(p(r"(?i)[s]"), singles(['S', 's']));
1939	assert_eq!(p(r"(?i)[β]"), singles(['β']));
1940
1941	assert_eq!(p(r"[^^]"), class([('`\x00`', ']'), ('_', '`\u{10FFFF}`')]));
1942	assert_eq!(
1943	p(r"[^-a]"),
1944	class([('`\x00`', ','), ('.', '`'), ('b', '`\u{10FFFF}`')])
1945	);
1946
1947	assert_eq!(
1948	p(r"[-]a]"),
1949	Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')])
1950	);
1951	}
1952
1953	#[test]
1954	fn ok_verbatim() {
1955	assert_eq!(
1956	p(r"(?x)a{5,9} ?"),
1957	Hir::repetition(hir::Repetition {
1958	min: `5`,
1959	max: Some(`9`),
1960	greedy: `false`,
1961	sub: Box::new(Hir::char('a')),
1962	})
1963	);
1964	assert_eq!(p(r"(?x)[ a]"), singles(['a']));
1965	assert_eq!(
1966	p(r"(?x)[ ^ a]"),
1967	class([('`\x00`', '`'), ('b', '`\u{10FFFF}`')])
1968	);
1969	assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-']));
1970	assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a']));
1971
1972	assert_eq!(
1973	p(r"(?x)a b"),
1974	Hir::concat(vec![Hir::char('a'), Hir::char('b')])
1975	);
1976	assert_eq!(
1977	p(r"(?x)a b(?-x)a b"),
1978	Hir::concat(vec![
1979	Hir::char('a'),
1980	Hir::char('b'),
1981	Hir::char('a'),
1982	Hir::char(' '),
1983	Hir::char('b'),
1984	])
1985	);
1986	assert_eq!(
1987	p(r"a (?x:a )a "),
1988	Hir::concat(vec![
1989	Hir::char('a'),
1990	Hir::char(' '),
1991	Hir::char('a'),
1992	Hir::char('a'),
1993	Hir::char(' '),
1994	])
1995	);
1996	assert_eq!(
1997	p(r"(?x)( ?P<foo> a )"),
1998	named_cap(`1`, "foo", Hir::char('a')),
1999	);
2000	assert_eq!(p(r"(?x)( a )"), cap(`1`, Hir::char('a')));
2001	assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a'));
2002	assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('`\x53`'));
2003	assert_eq!(p(r"(?x)\ "), Hir::char(' '));
2004	}
2005
2006	#[test]
2007	fn ok_comments() {
2008	let pat = "(?x)
2009	# This is comment 1.
2010	foo # This is comment 2.
2011	# This is comment 3.
2012	bar
2013	# This is comment 4.";
2014	assert_eq!(
2015	p(pat),
2016	Hir::concat(vec![
2017	Hir::char('f'),
2018	Hir::char('o'),
2019	Hir::char('o'),
2020	Hir::char('b'),
2021	Hir::char('a'),
2022	Hir::char('r'),
2023	])
2024	);
2025	}
2026
2027	#[test]
2028	fn err_standard() {
2029	assert_eq!(
2030	ERR_TOO_MUCH_NESTING,
2031	perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"),
2032	);
2033	// This one is tricky, because the only way it can happen is if the
2034	// number of captures overflows u32. Perhaps we should allow setting a
2035	// lower limit?
2036	// assert_eq!(ERR_TOO_MANY_CAPTURES, perr(""));
2037	assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)"));
2038	assert_eq!(ERR_UNCLOSED_GROUP, perr("("));
2039	assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?"));
2040	assert_eq!(ERR_UNOPENED_GROUP, perr(")"));
2041	assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)"));
2042	assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)"));
2043	assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)"));
2044	assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)"));
2045	assert_eq!(ERR_EMPTY_FLAGS, perr(r"(?)"));
2046	assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?P<"));
2047	assert_eq!(ERR_MISSING_GROUP_NAME, perr(r"(?<"));
2048	assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?P<1abc>z)"));
2049	assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)"));
2050	assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)"));
2051	assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)"));
2052	assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)"));
2053	assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)"));
2054	assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo"));
2055	assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?<foo"));
2056	assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?P<>z)"));
2057	assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)"));
2058	assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)"));
2059	assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)"));
2060	assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)"));
2061	assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)"));
2062	assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is"));
2063	assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)"));
2064	assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}"));
2065	assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{"));
2066	assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A"));
2067	assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}"));
2068	assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}"));
2069	assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA"));
2070	assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ"));
2071	assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA"));
2072	assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ"));
2073	assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800"));
2074	assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF"));
2075	assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x"));
2076	assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\"));
2077	assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0"));
2078	assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1"));
2079	assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8"));
2080	assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
2081	assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
2082	assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
2083	assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
2084	assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
2085	assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
2086	assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)"));
2087	assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"\|?"));
2088	assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?"));
2089	assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}"));
2090	assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})"));
2091	assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}"));
2092	assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{"));
2093	assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5"));
2094	assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,"));
2095	assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6"));
2096	assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z"));
2097	assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}"));
2098	assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}"));
2099	assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}"));
2100	assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}"));
2101	assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a"));
2102	assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]"));
2103	assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]"));
2104	assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]"));
2105	assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-"));
2106	assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^"));
2107	assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]"));
2108	assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]"));
2109	assert_eq!(ERR_CLASS_UNCLOSED, perr(r"["));
2110	assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z"));
2111	assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]"));
2112	assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]"));
2113	assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
2114	assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
2115	assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
2116	assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
2117	assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
2118	assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
2119	assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
2120	assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
2121	}
2122
2123	#[test]
2124	fn err_verbatim() {
2125	// See: https://github.com/rust-lang/regex/issues/792
2126	assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]"));
2127	assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a "));
2128	assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- "));
2129	assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ "));
2130	}
2131
2132	// This tests a bug fix where the nest limit checker wasn't decrementing
2133	// its depth during post-traversal, which causes long regexes to trip
2134	// the default limit too aggressively.
2135	#[test]
2136	fn regression_454_nest_too_big() {
2137	let pattern = r#"
2138	2(?:
2139	[45]\d{3}\|
2140	7(?:
2141	1[0-267]\|
2142	2[0-289]\|
2143	3[0-29]\|
2144	4[01]\|
2145	5[1-3]\|
2146	6[013]\|
2147	7[0178]\|
2148	91
2149	)\|
2150	8(?:
2151	0[125]\|
2152	[139][1-6]\|
2153	2[0157-9]\|
2154	41\|
2155	6[1-35]\|
2156	7[1-5]\|
2157	8[1-8]\|
2158	90
2159	)\|
2160	9(?:
2161	0[0-2]\|
2162	1[0-4]\|
2163	2[568]\|
2164	3[3-6]\|
2165	5[5-7]\|
2166	6[0167]\|
2167	7[15]\|
2168	8[0146-9]
2169	)
2170	)\d{4}
2171	"#;
2172	p(pattern);
2173	}
2174
2175	// This tests that we treat a trailing `-` in a character class as a
2176	// literal `-` even when whitespace mode is enabled and there is whitespace
2177	// after the trailing `-`.
2178	#[test]
2179	fn regression_455_trailing_dash_ignore_whitespace() {
2180	p("(?x)[ / - ]");
2181	p("(?x)[ a - ]");
2182	p("(?x)[
2183	a
2184	- ]
2185	");
2186	p("(?x)[
2187	a # wat
2188	- ]
2189	");
2190
2191	perr("(?x)[ / -");
2192	perr("(?x)[ / - ");
2193	perr(
2194	"(?x)[
2195	/ -
2196	",
2197	);
2198	perr(
2199	"(?x)[
2200	/ - # wat
2201	",
2202	);
2203	}
2204
2205	#[test]
2206	fn regression_capture_indices() {
2207	let got = p(r"(a\|ab\|c\|bcd){4,10}(d*)");
2208	assert_eq!(
2209	got,
2210	Hir::concat(vec![
2211	Hir::repetition(hir::Repetition {
2212	min: `4`,
2213	max: Some(`10`),
2214	greedy: `true`,
2215	sub: Box::new(cap(
2216	`1`,
2217	Hir::alternation(vec![
2218	Hir::char('a'),
2219	Hir::concat(vec![Hir::char('a'), Hir::char('b')]),
2220	Hir::char('c'),
2221	Hir::concat(vec![
2222	Hir::char('b'),
2223	Hir::char('c'),
2224	Hir::char('d')
2225	]),
2226	])
2227	))
2228	}),
2229	cap(
2230	`2`,
2231	Hir::repetition(hir::Repetition {
2232	min: `0`,
2233	max: None,
2234	greedy: `true`,
2235	sub: Box::new(Hir::char('d')),
2236	})
2237	),
2238	])
2239	);
2240	}
2241	}
2242