mod.rs source code [crates/regex-syntax/src/hir/mod.rs]

1	/!*
2	Defines a high-level intermediate (HIR) representation for regular expressions.
3
4	The HIR is represented by the [`Hir`] type, and it principally constructed via
5	[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6	may use the smart constructors defined on `Hir` to build their own by hand. The
7	smart constructors simultaneously simplify and "optimize" the HIR, and are also
8	the same routines used by translation.
9
10	Most regex engines only have an HIR like this, and usually construct it
11	directly from the concrete syntax. This crate however first parses the
12	concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13	as mentioned above. It's done this way to facilitate better error reporting,
14	and to have a structured representation of a regex that faithfully represents
15	its concrete syntax. Namely, while an `Hir` value can be converted back to an
16	equivalent regex pattern string, it is unlikely to look like the original due
17	to its simplified structure.
18	*/
19
20	use core::{char, cmp};
21
22	use alloc::{
23	boxed::Box,
24	format,
25	string::{String, ToString},
26	vec,
27	vec::Vec,
28	};
29
30	use crate::{
31	ast::Span,
32	hir::interval::{Interval, IntervalSet, IntervalSetIter},
33	unicode,
34	};
35
36	pub use crate::{
37	hir::visitor::{visit, Visitor},
38	unicode::CaseFoldError,
39	};
40
41	mod interval;
42	pub mod literal;
43	pub mod print;
44	pub mod translate;
45	mod visitor;
46
47	/// An error that can occur while translating an `Ast` to a `Hir`.
48	#[derive(Clone, Debug, Eq, PartialEq)]
49	pub struct Error {
50	/// The kind of error.
51	kind: ErrorKind,
52	/// The original pattern that the translator's Ast was parsed from. Every
53	/// span in an error is a valid range into this string.
54	pattern: String,
55	/// The span of this error, derived from the Ast given to the translator.
56	span: Span,
57	}
58
59	impl Error {
60	/// Return the type of this error.
61	pub fn kind(&self) -> &ErrorKind {
62	&self.kind
63	}
64
65	/// The original pattern string in which this error occurred.
66	///
67	/// Every span reported by this error is reported in terms of this string.
68	pub fn pattern(&self) -> &str {
69	&self.pattern
70	}
71
72	/// Return the span at which this error occurred.
73	pub fn span(&self) -> &Span {
74	&self.span
75	}
76	}
77
78	/// The type of an error that occurred while building an `Hir`.
79	///
80	/// This error type is marked as `non_exhaustive`. This means that adding a
81	/// new variant is not considered a breaking change.
82	#[non_exhaustive]
83	#[derive(Clone, Debug, Eq, PartialEq)]
84	pub enum ErrorKind {
85	/// This error occurs when a Unicode feature is used when Unicode
86	/// support is disabled. For example `(?-u:\pL)` would trigger this error.
87	UnicodeNotAllowed,
88	/// This error occurs when translating a pattern that could match a byte
89	/// sequence that isn't UTF-8 and `utf8` was enabled.
90	InvalidUtf8,
91	/// This error occurs when one uses a non-ASCII byte for a line terminator,
92	/// but where Unicode mode is enabled and UTF-8 mode is disabled.
93	InvalidLineTerminator,
94	/// This occurs when an unrecognized Unicode property name could not
95	/// be found.
96	UnicodePropertyNotFound,
97	/// This occurs when an unrecognized Unicode property value could not
98	/// be found.
99	UnicodePropertyValueNotFound,
100	/// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101	/// `\d`) could not be found. This can occur when the `unicode-perl`
102	/// crate feature is not enabled.
103	UnicodePerlClassNotFound,
104	/// This occurs when the Unicode simple case mapping tables are not
105	/// available, and the regular expression required Unicode aware case
106	/// insensitivity.
107	UnicodeCaseUnavailable,
108	}
109
110	#[cfg(feature = "std")]
111	impl std::error::Error for Error {}
112
113	impl core::fmt::Display for Error {
114	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115	crate::error::Formatter::from(self).fmt(f)
116	}
117	}
118
119	impl core::fmt::Display for ErrorKind {
120	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121	use self::ErrorKind::*;
122
123	let msg: &str = match *self {
124	UnicodeNotAllowed => "Unicode not allowed here",
125	InvalidUtf8 => "pattern can match invalid UTF-8",
126	InvalidLineTerminator => "invalid line terminator, must be ASCII",
127	UnicodePropertyNotFound => "Unicode property not found",
128	UnicodePropertyValueNotFound => "Unicode property value not found",
129	UnicodePerlClassNotFound => {
130	"Unicode-aware Perl class not found \
131	(make sure the unicode-perl feature is enabled)"
132	}
133	UnicodeCaseUnavailable => {
134	"Unicode-aware case insensitivity matching is not available \
135	(make sure the unicode-case feature is enabled)"
136	}
137	};
138	f.write_str(data:msg)
139	}
140	}
141
142	/// A high-level intermediate representation (HIR) for a regular expression.
143	///
144	/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145	/// An `HirKind` indicates what kind of regular expression it is (a literal,
146	/// a repetition, a look-around assertion, etc.), where as a `Properties`
147	/// describes various facts about the regular expression. For example, whether
148	/// it matches UTF-8 or if it matches the empty string.
149	///
150	/// The HIR of a regular expression represents an intermediate step between
151	/// its abstract syntax (a structured description of the concrete syntax) and
152	/// an actual regex matcher. The purpose of HIR is to make regular expressions
153	/// easier to analyze. In particular, the AST is much more complex than the
154	/// HIR. For example, while an AST supports arbitrarily nested character
155	/// classes, the HIR will flatten all nested classes into a single set. The HIR
156	/// will also "compile away" every flag present in the concrete syntax. For
157	/// example, users of HIR expressions never need to worry about case folding;
158	/// it is handled automatically by the translator (e.g., by translating
159	/// `(?i:A)` to `[aA]`).
160	///
161	/// The specific type of an HIR expression can be accessed via its `kind`
162	/// or `into_kind` methods. This extra level of indirection exists for two
163	/// reasons:
164	///
165	/// 1. Construction of an HIR expression must* use the constructor methods on*
166	/// this `Hir` type instead of building the `HirKind` values directly. This
167	/// permits construction to enforce invariants like "concatenations always
168	/// consist of two or more sub-expressions."
169	/// 2. Every HIR expression contains attributes that are defined inductively,
170	/// and can be computed cheaply during the construction process. For example,
171	/// one such attribute is whether the expression must match at the beginning of
172	/// the haystack.
173	///
174	/// In particular, if you have an `HirKind` value, then there is intentionally
175	/// no way to build an `Hir` value from it. You instead need to do case
176	/// analysis on the `HirKind` value and build the `Hir` value using its smart
177	/// constructors.
178	///
179	/// # UTF-8
180	///
181	/// If the HIR was produced by a translator with
182	/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183	/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184	/// matches.
185	///
186	/// For empty matches, those can occur at any position. It is the
187	/// responsibility of the regex engine to determine whether empty matches are
188	/// permitted between the code units of a single codepoint.
189	///
190	/// # Stack space
191	///
192	/// This type defines its own destructor that uses constant stack space and
193	/// heap space proportional to the size of the HIR.
194	///
195	/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196	/// expression pattern string, and uses constant stack space and heap space
197	/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198	/// be _semantically_ equivalent to the original concrete syntax, but it may
199	/// look very different. (And potentially not practically readable by a human.)
200	///
201	/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202	/// stack space. The implementation will also suppress some details (such as
203	/// the `Properties` inlined into every `Hir` value to make it less noisy).
204	#[derive(Clone, Eq, PartialEq)]
205	pub struct Hir {
206	/// The underlying HIR kind.
207	kind: HirKind,
208	/// Analysis info about this HIR, computed during construction.
209	props: Properties,
210	}
211
212	/// Methods for accessing the underlying `HirKind` and `Properties`.
213	impl Hir {
214	/// Returns a reference to the underlying HIR kind.
215	pub fn kind(&self) -> &HirKind {
216	&self.kind
217	}
218
219	/// Consumes ownership of this HIR expression and returns its underlying
220	/// `HirKind`.
221	pub fn into_kind(mut self) -> HirKind {
222	core::mem::replace(&mut self.kind, HirKind::Empty)
223	}
224
225	/// Returns the properties computed for this `Hir`.
226	pub fn properties(&self) -> &Properties {
227	&self.props
228	}
229
230	/// Splits this HIR into its constituent parts.
231	///
232	/// This is useful because `let Hir { kind, props } = hir;` does not work
233	/// because of `Hir`'s custom `Drop` implementation.
234	fn into_parts(mut self) -> (HirKind, Properties) {
235	(
236	core::mem::replace(&mut self.kind, HirKind::Empty),
237	core::mem::replace(&mut self.props, Properties::empty()),
238	)
239	}
240	}
241
242	/// Smart constructors for HIR values.
243	///
244	/// These constructors are called "smart" because they do inductive work or
245	/// simplifications. For example, calling `Hir::repetition` with a repetition
246	/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247	/// since it is equivalent to an empty regex. Another example is calling
248	/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249	/// just get back the original `expr` since it's precisely equivalent.
250	///
251	/// Smart constructors enable maintaining invariants about the HIR data type
252	/// while also simulanteously keeping the representation as simple as possible.
253	impl Hir {
254	/// Returns an empty HIR expression.
255	///
256	/// An empty HIR expression always matches, including the empty string.
257	#[inline]
258	pub fn empty() -> Hir {
259	let props = Properties::empty();
260	Hir { kind: HirKind::Empty, props }
261	}
262
263	/// Returns an HIR expression that can never match anything. That is,
264	/// the size of the set of strings in the language described by the HIR
265	/// returned is `0`.
266	///
267	/// This is distinct from [`Hir::empty`] in that the empty string matches
268	/// the HIR returned by `Hir::empty`. That is, the set of strings in the
269	/// language describe described by `Hir::empty` is non-empty.
270	///
271	/// Note that currently, the HIR returned uses an empty character class to
272	/// indicate that nothing can match. An equivalent expression that cannot
273	/// match is an empty alternation, but all such "fail" expressions are
274	/// normalized (via smart constructors) to empty character classes. This is
275	/// because empty character classes can be spelled in the concrete syntax
276	/// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277	/// empty alternations cannot.
278	#[inline]
279	pub fn fail() -> Hir {
280	let class = Class::Bytes(ClassBytes::empty());
281	let props = Properties::class(&class);
282	// We can't just call Hir::class here because it defers to Hir::fail
283	// in order to canonicalize the Hir value used to represent "cannot
284	// match."
285	Hir { kind: HirKind::Class(class), props }
286	}
287
288	/// Creates a literal HIR expression.
289	///
290	/// This accepts anything that can be converted into a `Box<[u8]>`.
291	///
292	/// Note that there is no mechanism for storing a `char` or a `Box<str>`
293	/// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294	/// any HIR node) matches valid UTF-8 exclusively can be queried via
295	/// [`Properties::is_utf8`].
296	///
297	/// # Example
298	///
299	/// This example shows that concatenations of `Literal` HIR values will
300	/// automatically get flattened and combined together. So for example, even
301	/// if you concat multiple `Literal` values that are themselves not valid
302	/// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303	/// how "smart" Hir's smart constructors are.
304	///
305	/// ```
306	/// use regex_syntax::hir::{Hir, HirKind, Literal};
307	///
308	/// let literals = vec![
309	/// Hir::literal([`0xE2`]),
310	/// Hir::literal([`0x98`]),
311	/// Hir::literal([`0x83`]),
312	/// ];
313	/// // Each literal, on its own, is invalid UTF-8.
314	/// assert!(literals.iter().all(\|hir\| !hir.properties().is_utf8()));
315	///
316	/// let concat = Hir::concat(literals);
317	/// // But the concatenation is valid UTF-8!
318	/// assert!(concat.properties().is_utf8());
319	///
320	/// // And also notice that the literals have been concatenated into a
321	/// // single `Literal`, to the point where there is no explicit `Concat`!
322	/// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323	/// assert_eq!(&expected, concat.kind());
324	/// ```
325	#[inline]
326	pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
327	let bytes = lit.into();
328	if bytes.is_empty() {
329	return Hir::empty();
330	}
331
332	let lit = Literal(bytes);
333	let props = Properties::literal(&lit);
334	Hir { kind: HirKind::Literal(lit), props }
335	}
336
337	/// Creates a class HIR expression. The class may either be defined over
338	/// ranges of Unicode codepoints or ranges of raw byte values.
339	///
340	/// Note that an empty class is permitted. An empty class is equivalent to
341	/// `Hir::fail()`.
342	#[inline]
343	pub fn class(class: Class) -> Hir {
344	if class.is_empty() {
345	return Hir::fail();
346	} else if let Some(bytes) = class.literal() {
347	return Hir::literal(bytes);
348	}
349	let props = Properties::class(&class);
350	Hir { kind: HirKind::Class(class), props }
351	}
352
353	/// Creates a look-around assertion HIR expression.
354	#[inline]
355	pub fn look(look: Look) -> Hir {
356	let props = Properties::look(look);
357	Hir { kind: HirKind::Look(look), props }
358	}
359
360	/// Creates a repetition HIR expression.
361	#[inline]
362	pub fn repetition(mut rep: Repetition) -> Hir {
363	// If the sub-expression of a repetition can only match the empty
364	// string, then we force its maximum to be at most 1.
365	if rep.sub.properties().maximum_len() == Some(`0`) {
366	rep.min = cmp::min(rep.min, `1`);
367	rep.max = rep.max.map(\|n\| cmp::min(n, `1`)).or(Some(`1`));
368	}
369	// The regex 'a{0}' is always equivalent to the empty regex. This is
370	// true even when 'a' is an expression that never matches anything
371	// (like '\P{any}').
372	//
373	// Additionally, the regex 'a{1}' is always equivalent to 'a'.
374	if rep.min == `0` && rep.max == Some(`0`) {
375	return Hir::empty();
376	} else if rep.min == `1` && rep.max == Some(`1`) {
377	return *rep.sub;
378	}
379	let props = Properties::repetition(&rep);
380	Hir { kind: HirKind::Repetition(rep), props }
381	}
382
383	/// Creates a capture HIR expression.
384	///
385	/// Note that there is no explicit HIR value for a non-capturing group.
386	/// Since a non-capturing group only exists to override precedence in the
387	/// concrete syntax and since an HIR already does its own grouping based on
388	/// what is parsed, there is no need to explicitly represent non-capturing
389	/// groups in the HIR.
390	#[inline]
391	pub fn capture(capture: Capture) -> Hir {
392	let props = Properties::capture(&capture);
393	Hir { kind: HirKind::Capture(capture), props }
394	}
395
396	/// Returns the concatenation of the given expressions.
397	///
398	/// This attempts to flatten and simplify the concatenation as appropriate.
399	///
400	/// # Example
401	///
402	/// This shows a simple example of basic flattening of both concatenations
403	/// and literals.
404	///
405	/// ```
406	/// use regex_syntax::hir::Hir;
407	///
408	/// let hir = Hir::concat(vec![
409	/// Hir::concat(vec![
410	/// Hir::literal([b'a']),
411	/// Hir::literal([b'b']),
412	/// Hir::literal([b'c']),
413	/// ]),
414	/// Hir::concat(vec![
415	/// Hir::literal([b'x']),
416	/// Hir::literal([b'y']),
417	/// Hir::literal([b'z']),
418	/// ]),
419	/// ]);
420	/// let expected = Hir::literal("abcxyz".as_bytes());
421	/// assert_eq!(expected, hir);
422	/// ```
423	pub fn concat(subs: Vec<Hir>) -> Hir {
424	// We rebuild the concatenation by simplifying it. Would be nice to do
425	// it in place, but that seems a little tricky?
426	let mut new = vec![];
427	// This gobbles up any adjacent literals in a concatenation and smushes
428	// them together. Basically, when we see a literal, we add its bytes
429	// to 'prior_lit', and whenever we see anything else, we first take
430	// any bytes in 'prior_lit' and add it to the 'new' concatenation.
431	let mut prior_lit: Option<Vec<u8>> = None;
432	for sub in subs {
433	let (kind, props) = sub.into_parts();
434	match kind {
435	HirKind::Literal(Literal(bytes)) => {
436	if let Some(ref mut prior_bytes) = prior_lit {
437	prior_bytes.extend_from_slice(&bytes);
438	} else {
439	prior_lit = Some(bytes.to_vec());
440	}
441	}
442	// We also flatten concats that are direct children of another
443	// concat. We only need to do this one level deep since
444	// Hir::concat is the only way to build concatenations, and so
445	// flattening happens inductively.
446	HirKind::Concat(subs2) => {
447	for sub2 in subs2 {
448	let (kind2, props2) = sub2.into_parts();
449	match kind2 {
450	HirKind::Literal(Literal(bytes)) => {
451	if let Some(ref mut prior_bytes) = prior_lit {
452	prior_bytes.extend_from_slice(&bytes);
453	} else {
454	prior_lit = Some(bytes.to_vec());
455	}
456	}
457	kind2 => {
458	if let Some(prior_bytes) = prior_lit.take() {
459	new.push(Hir::literal(prior_bytes));
460	}
461	new.push(Hir { kind: kind2, props: props2 });
462	}
463	}
464	}
465	}
466	// We can just skip empty HIRs.
467	HirKind::Empty => {}
468	kind => {
469	if let Some(prior_bytes) = prior_lit.take() {
470	new.push(Hir::literal(prior_bytes));
471	}
472	new.push(Hir { kind, props });
473	}
474	}
475	}
476	if let Some(prior_bytes) = prior_lit.take() {
477	new.push(Hir::literal(prior_bytes));
478	}
479	if new.is_empty() {
480	return Hir::empty();
481	} else if new.len() == `1` {
482	return new.pop().unwrap();
483	}
484	let props = Properties::concat(&new);
485	Hir { kind: HirKind::Concat(new), props }
486	}
487
488	/// Returns the alternation of the given expressions.
489	///
490	/// This flattens and simplifies the alternation as appropriate. This may
491	/// include factoring out common prefixes or even rewriting the alternation
492	/// as a character class.
493	///
494	/// Note that an empty alternation is equivalent to `Hir::fail()`. (It
495	/// is not possible for one to write an empty alternation, or even an
496	/// alternation with a single sub-expression, in the concrete syntax of a
497	/// regex.)
498	///
499	/// # Example
500	///
501	/// This is a simple example showing how an alternation might get
502	/// simplified.
503	///
504	/// ```
505	/// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
506	///
507	/// let hir = Hir::alternation(vec![
508	/// Hir::literal([b'a']),
509	/// Hir::literal([b'b']),
510	/// Hir::literal([b'c']),
511	/// Hir::literal([b'd']),
512	/// Hir::literal([b'e']),
513	/// Hir::literal([b'f']),
514	/// ]);
515	/// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
516	/// ClassUnicodeRange::new('a', 'f'),
517	/// ])));
518	/// assert_eq!(expected, hir);
519	/// ```
520	///
521	/// And another example showing how common prefixes might get factored
522	/// out.
523	///
524	/// ```
525	/// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
526	///
527	/// let hir = Hir::alternation(vec![
528	/// Hir::concat(vec![
529	/// Hir::literal("abc".as_bytes()),
530	/// Hir::class(Class::Unicode(ClassUnicode::new([
531	/// ClassUnicodeRange::new('A', 'Z'),
532	/// ]))),
533	/// ]),
534	/// Hir::concat(vec![
535	/// Hir::literal("abc".as_bytes()),
536	/// Hir::class(Class::Unicode(ClassUnicode::new([
537	/// ClassUnicodeRange::new('a', 'z'),
538	/// ]))),
539	/// ]),
540	/// ]);
541	/// let expected = Hir::concat(vec![
542	/// Hir::literal("abc".as_bytes()),
543	/// Hir::alternation(vec![
544	/// Hir::class(Class::Unicode(ClassUnicode::new([
545	/// ClassUnicodeRange::new('A', 'Z'),
546	/// ]))),
547	/// Hir::class(Class::Unicode(ClassUnicode::new([
548	/// ClassUnicodeRange::new('a', 'z'),
549	/// ]))),
550	/// ]),
551	/// ]);
552	/// assert_eq!(expected, hir);
553	/// ```
554	///
555	/// Note that these sorts of simplifications are not guaranteed.
556	pub fn alternation(subs: Vec<Hir>) -> Hir {
557	// We rebuild the alternation by simplifying it. We proceed similarly
558	// as the concatenation case. But in this case, there's no literal
559	// simplification happening. We're just flattening alternations.
560	let mut new = Vec::with_capacity(subs.len());
561	for sub in subs {
562	let (kind, props) = sub.into_parts();
563	match kind {
564	HirKind::Alternation(subs2) => {
565	new.extend(subs2);
566	}
567	kind => {
568	new.push(Hir { kind, props });
569	}
570	}
571	}
572	if new.is_empty() {
573	return Hir::fail();
574	} else if new.len() == `1` {
575	return new.pop().unwrap();
576	}
577	// Now that it's completely flattened, look for the special case of
578	// 'char1\|char2\|...\|charN' and collapse that into a class. Note that
579	// we look for 'char' first and then bytes. The issue here is that if
580	// we find both non-ASCII codepoints and non-ASCII singleton bytes,
581	// then it isn't actually possible to smush them into a single class.
582	// (Because classes are either "all codepoints" or "all bytes." You
583	// can have a class that both matches non-ASCII but valid UTF-8 and
584	// invalid UTF-8.) So we look for all chars and then all bytes, and
585	// don't handle anything else.
586	if let Some(singletons) = singleton_chars(&new) {
587	let it = singletons
588	.into_iter()
589	.map(\|ch\| ClassUnicodeRange { start: ch, end: ch });
590	return Hir::class(Class::Unicode(ClassUnicode::new(it)));
591	}
592	if let Some(singletons) = singleton_bytes(&new) {
593	let it = singletons
594	.into_iter()
595	.map(\|b\| ClassBytesRange { start: b, end: b });
596	return Hir::class(Class::Bytes(ClassBytes::new(it)));
597	}
598	// Similar to singleton chars, we can also look for alternations of
599	// classes. Those can be smushed into a single class.
600	if let Some(cls) = class_chars(&new) {
601	return Hir::class(cls);
602	}
603	if let Some(cls) = class_bytes(&new) {
604	return Hir::class(cls);
605	}
606	// Factor out a common prefix if we can, which might potentially
607	// simplify the expression and unlock other optimizations downstream.
608	// It also might generally make NFA matching and DFA construction
609	// faster by reducing the scope of branching in the regex.
610	new = match lift_common_prefix(new) {
611	Ok(hir) => return hir,
612	Err(unchanged) => unchanged,
613	};
614	let props = Properties::alternation(&new);
615	Hir { kind: HirKind::Alternation(new), props }
616	}
617
618	/// Returns an HIR expression for `.`.
619	///
620	/// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
621	/// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
622	/// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
623	/// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
624	/// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
625	/// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
626	///
627	/// # Example
628	///
629	/// Note that this is a convenience routine for constructing the correct
630	/// character class based on the value of `Dot`. There is no explicit "dot"
631	/// HIR value. It is just an abbreviation for a common character class.
632	///
633	/// ```
634	/// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
635	///
636	/// let hir = Hir::dot(Dot::AnyByte);
637	/// let expected = Hir::class(Class::Bytes(ClassBytes::new([
638	/// ClassBytesRange::new(`0x00`, `0xFF`),
639	/// ])));
640	/// assert_eq!(expected, hir);
641	/// ```
642	#[inline]
643	pub fn dot(dot: Dot) -> Hir {
644	match dot {
645	Dot::AnyChar => {
646	let mut cls = ClassUnicode::empty();
647	cls.push(ClassUnicodeRange::new('`\0`', '`\u{10FFFF}`'));
648	Hir::class(Class::Unicode(cls))
649	}
650	Dot::AnyByte => {
651	let mut cls = ClassBytes::empty();
652	cls.push(ClassBytesRange::new(b'`\0`', b'`\xFF`'));
653	Hir::class(Class::Bytes(cls))
654	}
655	Dot::AnyCharExcept(ch) => {
656	let mut cls =
657	ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
658	cls.negate();
659	Hir::class(Class::Unicode(cls))
660	}
661	Dot::AnyCharExceptLF => {
662	let mut cls = ClassUnicode::empty();
663	cls.push(ClassUnicodeRange::new('`\0`', '`\x09`'));
664	cls.push(ClassUnicodeRange::new('`\x0B`', '`\u{10FFFF}`'));
665	Hir::class(Class::Unicode(cls))
666	}
667	Dot::AnyCharExceptCRLF => {
668	let mut cls = ClassUnicode::empty();
669	cls.push(ClassUnicodeRange::new('`\0`', '`\x09`'));
670	cls.push(ClassUnicodeRange::new('`\x0B`', '`\x0C`'));
671	cls.push(ClassUnicodeRange::new('`\x0E`', '`\u{10FFFF}`'));
672	Hir::class(Class::Unicode(cls))
673	}
674	Dot::AnyByteExcept(byte) => {
675	let mut cls =
676	ClassBytes::new([ClassBytesRange::new(byte, byte)]);
677	cls.negate();
678	Hir::class(Class::Bytes(cls))
679	}
680	Dot::AnyByteExceptLF => {
681	let mut cls = ClassBytes::empty();
682	cls.push(ClassBytesRange::new(b'`\0`', b'`\x09`'));
683	cls.push(ClassBytesRange::new(b'`\x0B`', b'`\xFF`'));
684	Hir::class(Class::Bytes(cls))
685	}
686	Dot::AnyByteExceptCRLF => {
687	let mut cls = ClassBytes::empty();
688	cls.push(ClassBytesRange::new(b'`\0`', b'`\x09`'));
689	cls.push(ClassBytesRange::new(b'`\x0B`', b'`\x0C`'));
690	cls.push(ClassBytesRange::new(b'`\x0E`', b'`\xFF`'));
691	Hir::class(Class::Bytes(cls))
692	}
693	}
694	}
695	}
696
697	/// The underlying kind of an arbitrary [`Hir`] expression.
698	///
699	/// An `HirKind` is principally useful for doing case analysis on the type
700	/// of a regular expression. If you're looking to build new `Hir` values,
701	/// then you _must_ use the smart constructors defined on `Hir`, like
702	/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
703	/// not expose any way of building an `Hir` directly from an `HirKind`.
704	#[derive(Clone, Debug, Eq, PartialEq)]
705	pub enum HirKind {
706	/// The empty regular expression, which matches everything, including the
707	/// empty string.
708	Empty,
709	/// A literalstring that matches exactly these bytes.
710	Literal(Literal),
711	/// A single character class that matches any of the characters in the
712	/// class. A class can either consist of Unicode scalar values as
713	/// characters, or it can use bytes.
714	///
715	/// A class may be empty. In which case, it matches nothing.
716	Class(Class),
717	/// A look-around assertion. A look-around match always has zero length.
718	Look(Look),
719	/// A repetition operation applied to a sub-expression.
720	Repetition(Repetition),
721	/// A capturing group, which contains a sub-expression.
722	Capture(Capture),
723	/// A concatenation of expressions.
724	///
725	/// A concatenation matches only if each of its sub-expressions match one
726	/// after the other.
727	///
728	/// Concatenations are guaranteed by `Hir`'s smart constructors to always
729	/// have at least two sub-expressions.
730	Concat(Vec<Hir>),
731	/// An alternation of expressions.
732	///
733	/// An alternation matches only if at least one of its sub-expressions
734	/// match. If multiple sub-expressions match, then the leftmost is
735	/// preferred.
736	///
737	/// Alternations are guaranteed by `Hir`'s smart constructors to always
738	/// have at least two sub-expressions.
739	Alternation(Vec<Hir>),
740	}
741
742	impl HirKind {
743	/// Returns a slice of this kind's sub-expressions, if any.
744	pub fn subs(&self) -> &[Hir] {
745	use core::slice::from_ref;
746
747	match *self {
748	HirKind::Empty
749	\| HirKind::Literal(_)
750	\| HirKind::Class(_)
751	\| HirKind::Look(_) => &[],
752	HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub),
753	HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub),
754	HirKind::Concat(ref subs: &Vec) => subs,
755	HirKind::Alternation(ref subs: &Vec) => subs,
756	}
757	}
758	}
759
760	impl core::fmt::Debug for Hir {
761	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
762	self.kind.fmt(f)
763	}
764	}
765
766	/// Print a display representation of this Hir.
767	///
768	/// The result of this is a valid regular expression pattern string.
769	///
770	/// This implementation uses constant stack space and heap space proportional
771	/// to the size of the `Hir`.
772	impl core::fmt::Display for Hir {
773	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
774	crate::hir::print::Printer::new().print(self, wtr:f)
775	}
776	}
777
778	/// The high-level intermediate representation of a literal.
779	///
780	/// A literal corresponds to `0` or more bytes that should be matched
781	/// literally. The smart constructors defined on `Hir` will automatically
782	/// concatenate adjacent literals into one literal, and will even automatically
783	/// replace empty literals with `Hir::empty()`.
784	///
785	/// Note that despite a literal being represented by a sequence of bytes, its
786	/// `Debug` implementation will attempt to print it as a normal string. (That
787	/// is, not a sequence of decimal numbers.)
788	#[derive(Clone, Eq, PartialEq)]
789	pub struct Literal(pub Box<[u8]>);
790
791	impl core::fmt::Debug for Literal {
792	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
793	crate::debug::Bytes(&self.0).fmt(f)
794	}
795	}
796
797	/// The high-level intermediate representation of a character class.
798	///
799	/// A character class corresponds to a set of characters. A character is either
800	/// defined by a Unicode scalar value or a byte.
801	///
802	/// A character class, regardless of its character type, is represented by a
803	/// sequence of non-overlapping non-adjacent ranges of characters.
804	///
805	/// There are no guarantees about which class variant is used. Generally
806	/// speaking, the Unicode variat is used whenever a class needs to contain
807	/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
808	/// when Unicode mode is disabled. For example, at the time of writing, the
809	/// regex `(?-u:a\|\xc2\xa0)` will compile down to HIR for the Unicode class
810	/// `[a\u00A0]` due to optimizations.
811	///
812	/// Note that `Bytes` variant may be produced even when it exclusively matches
813	/// valid UTF-8. This is because a `Bytes` variant represents an intention by
814	/// the author of the regular expression to disable Unicode mode, which in turn
815	/// impacts the semantics of case insensitive matching. For example, `(?i)k`
816	/// and `(?i-u)k` will not match the same set of strings.
817	#[derive(Clone, Eq, PartialEq)]
818	pub enum Class {
819	/// A set of characters represented by Unicode scalar values.
820	Unicode(ClassUnicode),
821	/// A set of characters represented by arbitrary bytes (one byte per
822	/// character).
823	Bytes(ClassBytes),
824	}
825
826	impl Class {
827	/// Apply Unicode simple case folding to this character class, in place.
828	/// The character class will be expanded to include all simple case folded
829	/// character variants.
830	///
831	/// If this is a byte oriented character class, then this will be limited
832	/// to the ASCII ranges `A-Z` and `a-z`.
833	///
834	/// # Panics
835	///
836	/// This routine panics when the case mapping data necessary for this
837	/// routine to complete is unavailable. This occurs when the `unicode-case`
838	/// feature is not enabled and the underlying class is Unicode oriented.
839	///
840	/// Callers should prefer using `try_case_fold_simple` instead, which will
841	/// return an error instead of panicking.
842	pub fn case_fold_simple(&mut self) {
843	match *self {
844	Class::Unicode(ref mut x) => x.case_fold_simple(),
845	Class::Bytes(ref mut x) => x.case_fold_simple(),
846	}
847	}
848
849	/// Apply Unicode simple case folding to this character class, in place.
850	/// The character class will be expanded to include all simple case folded
851	/// character variants.
852	///
853	/// If this is a byte oriented character class, then this will be limited
854	/// to the ASCII ranges `A-Z` and `a-z`.
855	///
856	/// # Error
857	///
858	/// This routine returns an error when the case mapping data necessary
859	/// for this routine to complete is unavailable. This occurs when the
860	/// `unicode-case` feature is not enabled and the underlying class is
861	/// Unicode oriented.
862	pub fn try_case_fold_simple(
863	&mut self,
864	) -> core::result::Result<(), CaseFoldError> {
865	match *self {
866	Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
867	Class::Bytes(ref mut x) => x.case_fold_simple(),
868	}
869	Ok(())
870	}
871
872	/// Negate this character class in place.
873	///
874	/// After completion, this character class will contain precisely the
875	/// characters that weren't previously in the class.
876	pub fn negate(&mut self) {
877	match *self {
878	Class::Unicode(ref mut x) => x.negate(),
879	Class::Bytes(ref mut x) => x.negate(),
880	}
881	}
882
883	/// Returns true if and only if this character class will only ever match
884	/// valid UTF-8.
885	///
886	/// A character class can match invalid UTF-8 only when the following
887	/// conditions are met:
888	///
889	/// 1. The translator was configured to permit generating an expression
890	/// that can match invalid UTF-8. (By default, this is disabled.)
891	/// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
892	/// syntax or in the parser builder. By default, Unicode mode is
893	/// enabled.
894	pub fn is_utf8(&self) -> bool {
895	match *self {
896	Class::Unicode(_) => `true`,
897	Class::Bytes(ref x) => x.is_ascii(),
898	}
899	}
900
901	/// Returns the length, in bytes, of the smallest string matched by this
902	/// character class.
903	///
904	/// For non-empty byte oriented classes, this always returns `1`. For
905	/// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
906	/// `4`. For empty classes, `None` is returned. It is impossible for `0` to
907	/// be returned.
908	///
909	/// # Example
910	///
911	/// This example shows some examples of regexes and their corresponding
912	/// minimum length, if any.
913	///
914	/// ```
915	/// use regex_syntax::{hir::Properties, parse};
916	///
917	/// // The empty string has a min length of 0.
918	/// let hir = parse(r"")?;
919	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
920	/// // As do other types of regexes that only match the empty string.
921	/// let hir = parse(r"^$\b\B")?;
922	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
923	/// // A regex that can match the empty string but match more is still 0.
924	/// let hir = parse(r"a*")?;
925	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
926	/// // A regex that matches nothing has no minimum defined.
927	/// let hir = parse(r"[a&&b]")?;
928	/// assert_eq!(None, hir.properties().minimum_len());
929	/// // Character classes usually have a minimum length of 1.
930	/// let hir = parse(r"\w")?;
931	/// assert_eq!(Some(`1`), hir.properties().minimum_len());
932	/// // But sometimes Unicode classes might be bigger!
933	/// let hir = parse(r"\p{Cyrillic}")?;
934	/// assert_eq!(Some(`2`), hir.properties().minimum_len());
935	///
936	/// # Ok::<(), Box<dyn std::error::Error>>(())
937	/// ```
938	pub fn minimum_len(&self) -> Option<usize> {
939	match *self {
940	Class::Unicode(ref x) => x.minimum_len(),
941	Class::Bytes(ref x) => x.minimum_len(),
942	}
943	}
944
945	/// Returns the length, in bytes, of the longest string matched by this
946	/// character class.
947	///
948	/// For non-empty byte oriented classes, this always returns `1`. For
949	/// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
950	/// `4`. For empty classes, `None` is returned. It is impossible for `0` to
951	/// be returned.
952	///
953	/// # Example
954	///
955	/// This example shows some examples of regexes and their corresponding
956	/// maximum length, if any.
957	///
958	/// ```
959	/// use regex_syntax::{hir::Properties, parse};
960	///
961	/// // The empty string has a max length of 0.
962	/// let hir = parse(r"")?;
963	/// assert_eq!(Some(`0`), hir.properties().maximum_len());
964	/// // As do other types of regexes that only match the empty string.
965	/// let hir = parse(r"^$\b\B")?;
966	/// assert_eq!(Some(`0`), hir.properties().maximum_len());
967	/// // A regex that matches nothing has no maximum defined.
968	/// let hir = parse(r"[a&&b]")?;
969	/// assert_eq!(None, hir.properties().maximum_len());
970	/// // Bounded repeats work as you expect.
971	/// let hir = parse(r"x{2,10}")?;
972	/// assert_eq!(Some(`10`), hir.properties().maximum_len());
973	/// // An unbounded repeat means there is no maximum.
974	/// let hir = parse(r"x{2,}")?;
975	/// assert_eq!(None, hir.properties().maximum_len());
976	/// // With Unicode enabled, \w can match up to 4 bytes!
977	/// let hir = parse(r"\w")?;
978	/// assert_eq!(Some(`4`), hir.properties().maximum_len());
979	/// // Without Unicode enabled, \w matches at most 1 byte.
980	/// let hir = parse(r"(?-u)\w")?;
981	/// assert_eq!(Some(`1`), hir.properties().maximum_len());
982	///
983	/// # Ok::<(), Box<dyn std::error::Error>>(())
984	/// ```
985	pub fn maximum_len(&self) -> Option<usize> {
986	match *self {
987	Class::Unicode(ref x) => x.maximum_len(),
988	Class::Bytes(ref x) => x.maximum_len(),
989	}
990	}
991
992	/// Returns true if and only if this character class is empty. That is,
993	/// it has no elements.
994	///
995	/// An empty character can never match anything, including an empty string.
996	pub fn is_empty(&self) -> bool {
997	match *self {
998	Class::Unicode(ref x) => x.ranges().is_empty(),
999	Class::Bytes(ref x) => x.ranges().is_empty(),
1000	}
1001	}
1002
1003	/// If this class consists of exactly one element (whether a codepoint or a
1004	/// byte), then return it as a literal byte string.
1005	///
1006	/// If this class is empty or contains more than one element, then `None`
1007	/// is returned.
1008	pub fn literal(&self) -> Option<Vec<u8>> {
1009	match *self {
1010	Class::Unicode(ref x) => x.literal(),
1011	Class::Bytes(ref x) => x.literal(),
1012	}
1013	}
1014	}
1015
1016	impl core::fmt::Debug for Class {
1017	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1018	use crate::debug::Byte;
1019
1020	let mut fmter: DebugSet<'_, '_> = f.debug_set();
1021	match *self {
1022	Class::Unicode(ref cls: &ClassUnicode) => {
1023	for r: &ClassUnicodeRange in cls.ranges().iter() {
1024	fmter.entry(&(r.start..=r.end));
1025	}
1026	}
1027	Class::Bytes(ref cls: &ClassBytes) => {
1028	for r: &ClassBytesRange in cls.ranges().iter() {
1029	fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1030	}
1031	}
1032	}
1033	fmter.finish()
1034	}
1035	}
1036
1037	/// A set of characters represented by Unicode scalar values.
1038	#[derive(Clone, Debug, Eq, PartialEq)]
1039	pub struct ClassUnicode {
1040	set: IntervalSet<ClassUnicodeRange>,
1041	}
1042
1043	impl ClassUnicode {
1044	/// Create a new class from a sequence of ranges.
1045	///
1046	/// The given ranges do not need to be in any specific order, and ranges
1047	/// may overlap. Ranges will automatically be sorted into a canonical
1048	/// non-overlapping order.
1049	pub fn new<I>(ranges: I) -> ClassUnicode
1050	where
1051	I: IntoIterator<Item = ClassUnicodeRange>,
1052	{
1053	ClassUnicode { set: IntervalSet::new(ranges) }
1054	}
1055
1056	/// Create a new class with no ranges.
1057	///
1058	/// An empty class matches nothing. That is, it is equivalent to
1059	/// [`Hir::fail`].
1060	pub fn empty() -> ClassUnicode {
1061	ClassUnicode::new(vec![])
1062	}
1063
1064	/// Add a new range to this set.
1065	pub fn push(&mut self, range: ClassUnicodeRange) {
1066	self.set.push(range);
1067	}
1068
1069	/// Return an iterator over all ranges in this class.
1070	///
1071	/// The iterator yields ranges in ascending order.
1072	pub fn iter(&self) -> ClassUnicodeIter<'_> {
1073	ClassUnicodeIter(self.set.iter())
1074	}
1075
1076	/// Return the underlying ranges as a slice.
1077	pub fn ranges(&self) -> &[ClassUnicodeRange] {
1078	self.set.intervals()
1079	}
1080
1081	/// Expand this character class such that it contains all case folded
1082	/// characters, according to Unicode's "simple" mapping. For example, if
1083	/// this class consists of the range `a-z`, then applying case folding will
1084	/// result in the class containing both the ranges `a-z` and `A-Z`.
1085	///
1086	/// # Panics
1087	///
1088	/// This routine panics when the case mapping data necessary for this
1089	/// routine to complete is unavailable. This occurs when the `unicode-case`
1090	/// feature is not enabled.
1091	///
1092	/// Callers should prefer using `try_case_fold_simple` instead, which will
1093	/// return an error instead of panicking.
1094	pub fn case_fold_simple(&mut self) {
1095	self.set
1096	.case_fold_simple()
1097	.expect("unicode-case feature must be enabled");
1098	}
1099
1100	/// Expand this character class such that it contains all case folded
1101	/// characters, according to Unicode's "simple" mapping. For example, if
1102	/// this class consists of the range `a-z`, then applying case folding will
1103	/// result in the class containing both the ranges `a-z` and `A-Z`.
1104	///
1105	/// # Error
1106	///
1107	/// This routine returns an error when the case mapping data necessary
1108	/// for this routine to complete is unavailable. This occurs when the
1109	/// `unicode-case` feature is not enabled.
1110	pub fn try_case_fold_simple(
1111	&mut self,
1112	) -> core::result::Result<(), CaseFoldError> {
1113	self.set.case_fold_simple()
1114	}
1115
1116	/// Negate this character class.
1117	///
1118	/// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1119	/// set, then it will not be in this set after negation.
1120	pub fn negate(&mut self) {
1121	self.set.negate();
1122	}
1123
1124	/// Union this character class with the given character class, in place.
1125	pub fn union(&mut self, other: &ClassUnicode) {
1126	self.set.union(&other.set);
1127	}
1128
1129	/// Intersect this character class with the given character class, in
1130	/// place.
1131	pub fn intersect(&mut self, other: &ClassUnicode) {
1132	self.set.intersect(&other.set);
1133	}
1134
1135	/// Subtract the given character class from this character class, in place.
1136	pub fn difference(&mut self, other: &ClassUnicode) {
1137	self.set.difference(&other.set);
1138	}
1139
1140	/// Compute the symmetric difference of the given character classes, in
1141	/// place.
1142	///
1143	/// This computes the symmetric difference of two character classes. This
1144	/// removes all elements in this class that are also in the given class,
1145	/// but all adds all elements from the given class that aren't in this
1146	/// class. That is, the class will contain all elements in either class,
1147	/// but will not contain any elements that are in both classes.
1148	pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1149	self.set.symmetric_difference(&other.set);
1150	}
1151
1152	/// Returns true if and only if this character class will either match
1153	/// nothing or only ASCII bytes. Stated differently, this returns false
1154	/// if and only if this class contains a non-ASCII codepoint.
1155	pub fn is_ascii(&self) -> bool {
1156	self.set.intervals().last().map_or(`true`, \|r\| r.end <= '`\x7F`')
1157	}
1158
1159	/// Returns the length, in bytes, of the smallest string matched by this
1160	/// character class.
1161	///
1162	/// Returns `None` when the class is empty.
1163	pub fn minimum_len(&self) -> Option<usize> {
1164	let first = self.ranges().get(`0`)?;
1165	// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1166	Some(first.start.len_utf8())
1167	}
1168
1169	/// Returns the length, in bytes, of the longest string matched by this
1170	/// character class.
1171	///
1172	/// Returns `None` when the class is empty.
1173	pub fn maximum_len(&self) -> Option<usize> {
1174	let last = self.ranges().last()?;
1175	// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1176	Some(last.end.len_utf8())
1177	}
1178
1179	/// If this class consists of exactly one codepoint, then return it as
1180	/// a literal byte string.
1181	///
1182	/// If this class is empty or contains more than one codepoint, then `None`
1183	/// is returned.
1184	pub fn literal(&self) -> Option<Vec<u8>> {
1185	let rs = self.ranges();
1186	if rs.len() == `1` && rs[`0`].start == rs[`0`].end {
1187	Some(rs[`0`].start.encode_utf8(&mut [`0`; `4`]).to_string().into_bytes())
1188	} else {
1189	None
1190	}
1191	}
1192
1193	/// If this class consists of only ASCII ranges, then return its
1194	/// corresponding and equivalent byte class.
1195	pub fn to_byte_class(&self) -> Option<ClassBytes> {
1196	if !self.is_ascii() {
1197	return None;
1198	}
1199	Some(ClassBytes::new(self.ranges().iter().map(\|r\| {
1200	// Since we are guaranteed that our codepoint range is ASCII, the
1201	// 'u8::try_from' calls below are guaranteed to be correct.
1202	ClassBytesRange {
1203	start: u8::try_from(r.start).unwrap(),
1204	end: u8::try_from(r.end).unwrap(),
1205	}
1206	})))
1207	}
1208	}
1209
1210	/// An iterator over all ranges in a Unicode character class.
1211	///
1212	/// The lifetime `'a` refers to the lifetime of the underlying class.
1213	#[derive(Debug)]
1214	pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1215
1216	impl<'a> Iterator for ClassUnicodeIter<'a> {
1217	type Item = &'a ClassUnicodeRange;
1218
1219	fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1220	self.0.next()
1221	}
1222	}
1223
1224	/// A single range of characters represented by Unicode scalar values.
1225	///
1226	/// The range is closed. That is, the start and end of the range are included
1227	/// in the range.
1228	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1229	pub struct ClassUnicodeRange {
1230	start: char,
1231	end: char,
1232	}
1233
1234	impl core::fmt::Debug for ClassUnicodeRange {
1235	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1236	let start: String = if !self.start.is_whitespace() && !self.start.is_control()
1237	{
1238	self.start.to_string()
1239	} else {
1240	format!("0x{:X}", u32::from(self.start))
1241	};
1242	let end: String = if !self.end.is_whitespace() && !self.end.is_control() {
1243	self.end.to_string()
1244	} else {
1245	format!("0x{:X}", u32::from(self.end))
1246	};
1247	f&mut DebugStruct<'_, '_>.debug_struct("ClassUnicodeRange")
1248	.field("start", &start)
1249	.field(name:"end", &end)
1250	.finish()
1251	}
1252	}
1253
1254	impl Interval for ClassUnicodeRange {
1255	type Bound = char;
1256
1257	#[inline]
1258	fn lower(&self) -> char {
1259	self.start
1260	}
1261	#[inline]
1262	fn upper(&self) -> char {
1263	self.end
1264	}
1265	#[inline]
1266	fn set_lower(&mut self, bound: char) {
1267	self.start = bound;
1268	}
1269	#[inline]
1270	fn set_upper(&mut self, bound: char) {
1271	self.end = bound;
1272	}
1273
1274	/// Apply simple case folding to this Unicode scalar value range.
1275	///
1276	/// Additional ranges are appended to the given vector. Canonical ordering
1277	/// is not* maintained in the given vector.*
1278	fn case_fold_simple(
1279	&self,
1280	ranges: &mut Vec<ClassUnicodeRange>,
1281	) -> Result<(), unicode::CaseFoldError> {
1282	let mut folder = unicode::SimpleCaseFolder::new()?;
1283	if !folder.overlaps(self.start, self.end) {
1284	return Ok(());
1285	}
1286	let (start, end) = (u32::from(self.start), u32::from(self.end));
1287	for cp in (start..=end).filter_map(char::from_u32) {
1288	for &cp_folded in folder.mapping(cp) {
1289	ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1290	}
1291	}
1292	Ok(())
1293	}
1294	}
1295
1296	impl ClassUnicodeRange {
1297	/// Create a new Unicode scalar value range for a character class.
1298	///
1299	/// The returned range is always in a canonical form. That is, the range
1300	/// returned always satisfies the invariant that `start <= end`.
1301	pub fn new(start: char, end: char) -> ClassUnicodeRange {
1302	ClassUnicodeRange::create(start, end)
1303	}
1304
1305	/// Return the start of this range.
1306	///
1307	/// The start of a range is always less than or equal to the end of the
1308	/// range.
1309	pub fn start(&self) -> char {
1310	self.start
1311	}
1312
1313	/// Return the end of this range.
1314	///
1315	/// The end of a range is always greater than or equal to the start of the
1316	/// range.
1317	pub fn end(&self) -> char {
1318	self.end
1319	}
1320
1321	/// Returns the number of codepoints in this range.
1322	pub fn len(&self) -> usize {
1323	let diff = `1` + u32::from(self.end) - u32::from(self.start);
1324	// This is likely to panic in 16-bit targets since a usize can only fit
1325	// 2^16. It's not clear what to do here, other than to return an error
1326	// when building a Unicode class that contains a range whose length
1327	// overflows usize. (Which, to be honest, is probably quite common on
1328	// 16-bit targets. For example, this would imply that '.' and '\p{any}'
1329	// would be impossible to build.)
1330	usize::try_from(diff).expect("char class len fits in usize")
1331	}
1332	}
1333
1334	/// A set of characters represented by arbitrary bytes.
1335	///
1336	/// Each byte corresponds to one character.
1337	#[derive(Clone, Debug, Eq, PartialEq)]
1338	pub struct ClassBytes {
1339	set: IntervalSet<ClassBytesRange>,
1340	}
1341
1342	impl ClassBytes {
1343	/// Create a new class from a sequence of ranges.
1344	///
1345	/// The given ranges do not need to be in any specific order, and ranges
1346	/// may overlap. Ranges will automatically be sorted into a canonical
1347	/// non-overlapping order.
1348	pub fn new<I>(ranges: I) -> ClassBytes
1349	where
1350	I: IntoIterator<Item = ClassBytesRange>,
1351	{
1352	ClassBytes { set: IntervalSet::new(ranges) }
1353	}
1354
1355	/// Create a new class with no ranges.
1356	///
1357	/// An empty class matches nothing. That is, it is equivalent to
1358	/// [`Hir::fail`].
1359	pub fn empty() -> ClassBytes {
1360	ClassBytes::new(vec![])
1361	}
1362
1363	/// Add a new range to this set.
1364	pub fn push(&mut self, range: ClassBytesRange) {
1365	self.set.push(range);
1366	}
1367
1368	/// Return an iterator over all ranges in this class.
1369	///
1370	/// The iterator yields ranges in ascending order.
1371	pub fn iter(&self) -> ClassBytesIter<'_> {
1372	ClassBytesIter(self.set.iter())
1373	}
1374
1375	/// Return the underlying ranges as a slice.
1376	pub fn ranges(&self) -> &[ClassBytesRange] {
1377	self.set.intervals()
1378	}
1379
1380	/// Expand this character class such that it contains all case folded
1381	/// characters. For example, if this class consists of the range `a-z`,
1382	/// then applying case folding will result in the class containing both the
1383	/// ranges `a-z` and `A-Z`.
1384	///
1385	/// Note that this only applies ASCII case folding, which is limited to the
1386	/// characters `a-z` and `A-Z`.
1387	pub fn case_fold_simple(&mut self) {
1388	self.set.case_fold_simple().expect("ASCII case folding never fails");
1389	}
1390
1391	/// Negate this byte class.
1392	///
1393	/// For all `b` where `b` is a any byte, if `b` was in this set, then it
1394	/// will not be in this set after negation.
1395	pub fn negate(&mut self) {
1396	self.set.negate();
1397	}
1398
1399	/// Union this byte class with the given byte class, in place.
1400	pub fn union(&mut self, other: &ClassBytes) {
1401	self.set.union(&other.set);
1402	}
1403
1404	/// Intersect this byte class with the given byte class, in place.
1405	pub fn intersect(&mut self, other: &ClassBytes) {
1406	self.set.intersect(&other.set);
1407	}
1408
1409	/// Subtract the given byte class from this byte class, in place.
1410	pub fn difference(&mut self, other: &ClassBytes) {
1411	self.set.difference(&other.set);
1412	}
1413
1414	/// Compute the symmetric difference of the given byte classes, in place.
1415	///
1416	/// This computes the symmetric difference of two byte classes. This
1417	/// removes all elements in this class that are also in the given class,
1418	/// but all adds all elements from the given class that aren't in this
1419	/// class. That is, the class will contain all elements in either class,
1420	/// but will not contain any elements that are in both classes.
1421	pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1422	self.set.symmetric_difference(&other.set);
1423	}
1424
1425	/// Returns true if and only if this character class will either match
1426	/// nothing or only ASCII bytes. Stated differently, this returns false
1427	/// if and only if this class contains a non-ASCII byte.
1428	pub fn is_ascii(&self) -> bool {
1429	self.set.intervals().last().map_or(`true`, \|r\| r.end <= `0x7F`)
1430	}
1431
1432	/// Returns the length, in bytes, of the smallest string matched by this
1433	/// character class.
1434	///
1435	/// Returns `None` when the class is empty.
1436	pub fn minimum_len(&self) -> Option<usize> {
1437	if self.ranges().is_empty() {
1438	None
1439	} else {
1440	Some(`1`)
1441	}
1442	}
1443
1444	/// Returns the length, in bytes, of the longest string matched by this
1445	/// character class.
1446	///
1447	/// Returns `None` when the class is empty.
1448	pub fn maximum_len(&self) -> Option<usize> {
1449	if self.ranges().is_empty() {
1450	None
1451	} else {
1452	Some(`1`)
1453	}
1454	}
1455
1456	/// If this class consists of exactly one byte, then return it as
1457	/// a literal byte string.
1458	///
1459	/// If this class is empty or contains more than one byte, then `None`
1460	/// is returned.
1461	pub fn literal(&self) -> Option<Vec<u8>> {
1462	let rs = self.ranges();
1463	if rs.len() == `1` && rs[`0`].start == rs[`0`].end {
1464	Some(vec![rs[`0`].start])
1465	} else {
1466	None
1467	}
1468	}
1469
1470	/// If this class consists of only ASCII ranges, then return its
1471	/// corresponding and equivalent Unicode class.
1472	pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1473	if !self.is_ascii() {
1474	return None;
1475	}
1476	Some(ClassUnicode::new(self.ranges().iter().map(\|r\| {
1477	// Since we are guaranteed that our byte range is ASCII, the
1478	// 'char::from' calls below are correct and will not erroneously
1479	// convert a raw byte value into its corresponding codepoint.
1480	ClassUnicodeRange {
1481	start: char::from(r.start),
1482	end: char::from(r.end),
1483	}
1484	})))
1485	}
1486	}
1487
1488	/// An iterator over all ranges in a byte character class.
1489	///
1490	/// The lifetime `'a` refers to the lifetime of the underlying class.
1491	#[derive(Debug)]
1492	pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1493
1494	impl<'a> Iterator for ClassBytesIter<'a> {
1495	type Item = &'a ClassBytesRange;
1496
1497	fn next(&mut self) -> Option<&'a ClassBytesRange> {
1498	self.0.next()
1499	}
1500	}
1501
1502	/// A single range of characters represented by arbitrary bytes.
1503	///
1504	/// The range is closed. That is, the start and end of the range are included
1505	/// in the range.
1506	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1507	pub struct ClassBytesRange {
1508	start: u8,
1509	end: u8,
1510	}
1511
1512	impl Interval for ClassBytesRange {
1513	type Bound = u8;
1514
1515	#[inline]
1516	fn lower(&self) -> u8 {
1517	self.start
1518	}
1519	#[inline]
1520	fn upper(&self) -> u8 {
1521	self.end
1522	}
1523	#[inline]
1524	fn set_lower(&mut self, bound: u8) {
1525	self.start = bound;
1526	}
1527	#[inline]
1528	fn set_upper(&mut self, bound: u8) {
1529	self.end = bound;
1530	}
1531
1532	/// Apply simple case folding to this byte range. Only ASCII case mappings
1533	/// (for a-z) are applied.
1534	///
1535	/// Additional ranges are appended to the given vector. Canonical ordering
1536	/// is not* maintained in the given vector.*
1537	fn case_fold_simple(
1538	&self,
1539	ranges: &mut Vec<ClassBytesRange>,
1540	) -> Result<(), unicode::CaseFoldError> {
1541	if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1542	let lower = cmp::max(self.start, b'a');
1543	let upper = cmp::min(self.end, b'z');
1544	ranges.push(ClassBytesRange::new(lower - `32`, upper - `32`));
1545	}
1546	if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1547	let lower = cmp::max(self.start, b'A');
1548	let upper = cmp::min(self.end, b'Z');
1549	ranges.push(ClassBytesRange::new(lower + `32`, upper + `32`));
1550	}
1551	Ok(())
1552	}
1553	}
1554
1555	impl ClassBytesRange {
1556	/// Create a new byte range for a character class.
1557	///
1558	/// The returned range is always in a canonical form. That is, the range
1559	/// returned always satisfies the invariant that `start <= end`.
1560	pub fn new(start: u8, end: u8) -> ClassBytesRange {
1561	ClassBytesRange::create(start, end)
1562	}
1563
1564	/// Return the start of this range.
1565	///
1566	/// The start of a range is always less than or equal to the end of the
1567	/// range.
1568	pub fn start(&self) -> u8 {
1569	self.start
1570	}
1571
1572	/// Return the end of this range.
1573	///
1574	/// The end of a range is always greater than or equal to the start of the
1575	/// range.
1576	pub fn end(&self) -> u8 {
1577	self.end
1578	}
1579
1580	/// Returns the number of bytes in this range.
1581	pub fn len(&self) -> usize {
1582	usize::from(self.end.checked_sub(self.start).unwrap())
1583	.checked_add(`1`)
1584	.unwrap()
1585	}
1586	}
1587
1588	impl core::fmt::Debug for ClassBytesRange {
1589	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1590	f&mut DebugStruct<'_, '_>.debug_struct("ClassBytesRange")
1591	.field("start", &crate::debug::Byte(self.start))
1592	.field(name:"end", &crate::debug::Byte(self.end))
1593	.finish()
1594	}
1595	}
1596
1597	/// The high-level intermediate representation for a look-around assertion.
1598	///
1599	/// An assertion match is always zero-length. Also called an "empty match."
1600	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1601	pub enum Look {
1602	/// Match the beginning of text. Specifically, this matches at the starting
1603	/// position of the input.
1604	Start = `1` << `0`,
1605	/// Match the end of text. Specifically, this matches at the ending
1606	/// position of the input.
1607	End = `1` << `1`,
1608	/// Match the beginning of a line or the beginning of text. Specifically,
1609	/// this matches at the starting position of the input, or at the position
1610	/// immediately following a `\n` character.
1611	StartLF = `1` << `2`,
1612	/// Match the end of a line or the end of text. Specifically, this matches
1613	/// at the end position of the input, or at the position immediately
1614	/// preceding a `\n` character.
1615	EndLF = `1` << `3`,
1616	/// Match the beginning of a line or the beginning of text. Specifically,
1617	/// this matches at the starting position of the input, or at the position
1618	/// immediately following either a `\r` or `\n` character, but never after
1619	/// a `\r` when a `\n` follows.
1620	StartCRLF = `1` << `4`,
1621	/// Match the end of a line or the end of text. Specifically, this matches
1622	/// at the end position of the input, or at the position immediately
1623	/// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1624	/// precedes it.
1625	EndCRLF = `1` << `5`,
1626	/// Match an ASCII-only word boundary. That is, this matches a position
1627	/// where the left adjacent character and right adjacent character
1628	/// correspond to a word and non-word or a non-word and word character.
1629	WordAscii = `1` << `6`,
1630	/// Match an ASCII-only negation of a word boundary.
1631	WordAsciiNegate = `1` << `7`,
1632	/// Match a Unicode-aware word boundary. That is, this matches a position
1633	/// where the left adjacent character and right adjacent character
1634	/// correspond to a word and non-word or a non-word and word character.
1635	WordUnicode = `1` << `8`,
1636	/// Match a Unicode-aware negation of a word boundary.
1637	WordUnicodeNegate = `1` << `9`,
1638	/// Match the start of an ASCII-only word boundary. That is, this matches a
1639	/// position at either the beginning of the haystack or where the previous
1640	/// character is not a word character and the following character is a word
1641	/// character.
1642	WordStartAscii = `1` << `10`,
1643	/// Match the end of an ASCII-only word boundary. That is, this matches
1644	/// a position at either the end of the haystack or where the previous
1645	/// character is a word character and the following character is not a word
1646	/// character.
1647	WordEndAscii = `1` << `11`,
1648	/// Match the start of a Unicode word boundary. That is, this matches a
1649	/// position at either the beginning of the haystack or where the previous
1650	/// character is not a word character and the following character is a word
1651	/// character.
1652	WordStartUnicode = `1` << `12`,
1653	/// Match the end of a Unicode word boundary. That is, this matches a
1654	/// position at either the end of the haystack or where the previous
1655	/// character is a word character and the following character is not a word
1656	/// character.
1657	WordEndUnicode = `1` << `13`,
1658	/// Match the start half of an ASCII-only word boundary. That is, this
1659	/// matches a position at either the beginning of the haystack or where the
1660	/// previous character is not a word character.
1661	WordStartHalfAscii = `1` << `14`,
1662	/// Match the end half of an ASCII-only word boundary. That is, this
1663	/// matches a position at either the end of the haystack or where the
1664	/// following character is not a word character.
1665	WordEndHalfAscii = `1` << `15`,
1666	/// Match the start half of a Unicode word boundary. That is, this matches
1667	/// a position at either the beginning of the haystack or where the
1668	/// previous character is not a word character.
1669	WordStartHalfUnicode = `1` << `16`,
1670	/// Match the end half of a Unicode word boundary. That is, this matches
1671	/// a position at either the end of the haystack or where the following
1672	/// character is not a word character.
1673	WordEndHalfUnicode = `1` << `17`,
1674	}
1675
1676	impl Look {
1677	/// Flip the look-around assertion to its equivalent for reverse searches.
1678	/// For example, `StartLF` gets translated to `EndLF`.
1679	///
1680	/// Some assertions, such as `WordUnicode`, remain the same since they
1681	/// match the same positions regardless of the direction of the search.
1682	#[inline]
1683	pub const fn reversed(self) -> Look {
1684	match self {
1685	Look::Start => Look::End,
1686	Look::End => Look::Start,
1687	Look::StartLF => Look::EndLF,
1688	Look::EndLF => Look::StartLF,
1689	Look::StartCRLF => Look::EndCRLF,
1690	Look::EndCRLF => Look::StartCRLF,
1691	Look::WordAscii => Look::WordAscii,
1692	Look::WordAsciiNegate => Look::WordAsciiNegate,
1693	Look::WordUnicode => Look::WordUnicode,
1694	Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1695	Look::WordStartAscii => Look::WordEndAscii,
1696	Look::WordEndAscii => Look::WordStartAscii,
1697	Look::WordStartUnicode => Look::WordEndUnicode,
1698	Look::WordEndUnicode => Look::WordStartUnicode,
1699	Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1700	Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1701	Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1702	Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1703	}
1704	}
1705
1706	/// Return the underlying representation of this look-around enumeration
1707	/// as an integer. Giving the return value to the [`Look::from_repr`]
1708	/// constructor is guaranteed to return the same look-around variant that
1709	/// one started with within a semver compatible release of this crate.
1710	#[inline]
1711	pub const fn as_repr(self) -> u32 {
1712	// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1713	// actual int.
1714	self as u32
1715	}
1716
1717	/// Given the underlying representation of a `Look` value, return the
1718	/// corresponding `Look` value if the representation is valid. Otherwise
1719	/// `None` is returned.
1720	#[inline]
1721	pub const fn from_repr(repr: u32) -> Option<Look> {
1722	match repr {
1723	`0b00_0000_0000_0000_0001` => Some(Look::Start),
1724	`0b00_0000_0000_0000_0010` => Some(Look::End),
1725	`0b00_0000_0000_0000_0100` => Some(Look::StartLF),
1726	`0b00_0000_0000_0000_1000` => Some(Look::EndLF),
1727	`0b00_0000_0000_0001_0000` => Some(Look::StartCRLF),
1728	`0b00_0000_0000_0010_0000` => Some(Look::EndCRLF),
1729	`0b00_0000_0000_0100_0000` => Some(Look::WordAscii),
1730	`0b00_0000_0000_1000_0000` => Some(Look::WordAsciiNegate),
1731	`0b00_0000_0001_0000_0000` => Some(Look::WordUnicode),
1732	`0b00_0000_0010_0000_0000` => Some(Look::WordUnicodeNegate),
1733	`0b00_0000_0100_0000_0000` => Some(Look::WordStartAscii),
1734	`0b00_0000_1000_0000_0000` => Some(Look::WordEndAscii),
1735	`0b00_0001_0000_0000_0000` => Some(Look::WordStartUnicode),
1736	`0b00_0010_0000_0000_0000` => Some(Look::WordEndUnicode),
1737	`0b00_0100_0000_0000_0000` => Some(Look::WordStartHalfAscii),
1738	`0b00_1000_0000_0000_0000` => Some(Look::WordEndHalfAscii),
1739	`0b01_0000_0000_0000_0000` => Some(Look::WordStartHalfUnicode),
1740	`0b10_0000_0000_0000_0000` => Some(Look::WordEndHalfUnicode),
1741	_ => None,
1742	}
1743	}
1744
1745	/// Returns a convenient single codepoint representation of this
1746	/// look-around assertion. Each assertion is guaranteed to be represented
1747	/// by a distinct character.
1748	///
1749	/// This is useful for succinctly representing a look-around assertion in
1750	/// human friendly but succinct output intended for a programmer working on
1751	/// regex internals.
1752	#[inline]
1753	pub const fn as_char(self) -> char {
1754	match self {
1755	Look::Start => 'A',
1756	Look::End => 'z',
1757	Look::StartLF => '^',
1758	Look::EndLF => '$',
1759	Look::StartCRLF => 'r',
1760	Look::EndCRLF => 'R',
1761	Look::WordAscii => 'b',
1762	Look::WordAsciiNegate => 'B',
1763	Look::WordUnicode => '𝛃',
1764	Look::WordUnicodeNegate => '𝚩',
1765	Look::WordStartAscii => '<',
1766	Look::WordEndAscii => '>',
1767	Look::WordStartUnicode => '〈',
1768	Look::WordEndUnicode => '〉',
1769	Look::WordStartHalfAscii => '◁',
1770	Look::WordEndHalfAscii => '▷',
1771	Look::WordStartHalfUnicode => '◀',
1772	Look::WordEndHalfUnicode => '▶',
1773	}
1774	}
1775	}
1776
1777	/// The high-level intermediate representation for a capturing group.
1778	///
1779	/// A capturing group always has an index and a child expression. It may
1780	/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1781	/// necessary.
1782	///
1783	/// Note that there is no explicit representation of a non-capturing group
1784	/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1785	/// the recursive structure of the `Hir` itself.
1786	#[derive(Clone, Debug, Eq, PartialEq)]
1787	pub struct Capture {
1788	/// The capture index of the capture.
1789	pub index: u32,
1790	/// The name of the capture, if it exists.
1791	pub name: Option<Box<str>>,
1792	/// The expression inside the capturing group, which may be empty.
1793	pub sub: Box<Hir>,
1794	}
1795
1796	/// The high-level intermediate representation of a repetition operator.
1797	///
1798	/// A repetition operator permits the repetition of an arbitrary
1799	/// sub-expression.
1800	#[derive(Clone, Debug, Eq, PartialEq)]
1801	pub struct Repetition {
1802	/// The minimum range of the repetition.
1803	///
1804	/// Note that special cases like `?`, `+` and `` all get translated into*
1805	/// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1806	///
1807	/// When `min` is zero, this expression can match the empty string
1808	/// regardless of what its sub-expression is.
1809	pub min: u32,
1810	/// The maximum range of the repetition.
1811	///
1812	/// Note that when `max` is `None`, `min` acts as a lower bound but where
1813	/// there is no upper bound. For something like `x{5}` where the min and
1814	/// max are equivalent, `min` will be set to `5` and `max` will be set to
1815	/// `Some(5)`.
1816	pub max: Option<u32>,
1817	/// Whether this repetition operator is greedy or not. A greedy operator
1818	/// will match as much as it can. A non-greedy operator will match as
1819	/// little as it can.
1820	///
1821	/// Typically, operators are greedy by default and are only non-greedy when
1822	/// a `?` suffix is used, e.g., `(expr)` is greedy while `(expr)?` is
1823	/// not. However, this can be inverted via the `U` "ungreedy" flag.
1824	pub greedy: bool,
1825	/// The expression being repeated.
1826	pub sub: Box<Hir>,
1827	}
1828
1829	impl Repetition {
1830	/// Returns a new repetition with the same `min`, `max` and `greedy`
1831	/// values, but with its sub-expression replaced with the one given.
1832	pub fn with(&self, sub: Hir) -> Repetition {
1833	Repetition {
1834	min: self.min,
1835	max: self.max,
1836	greedy: self.greedy,
1837	sub: Box::new(sub),
1838	}
1839	}
1840	}
1841
1842	/// A type describing the different flavors of `.`.
1843	///
1844	/// This type is meant to be used with [`Hir::dot`], which is a convenience
1845	/// routine for building HIR values derived from the `.` regex.
1846	#[non_exhaustive]
1847	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1848	pub enum Dot {
1849	/// Matches the UTF-8 encoding of any Unicode scalar value.
1850	///
1851	/// This is equivalent to `(?su:.)` and also `\p{any}`.
1852	AnyChar,
1853	/// Matches any byte value.
1854	///
1855	/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1856	AnyByte,
1857	/// Matches the UTF-8 encoding of any Unicode scalar value except for the
1858	/// `char` given.
1859	///
1860	/// This is equivalent to using `(?u-s:.)` with the line terminator set
1861	/// to a particular ASCII byte. (Because of peculiarities in the regex
1862	/// engines, a line terminator must be a single byte. It follows that when
1863	/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1864	/// value. That is, ti must be ASCII.)
1865	///
1866	/// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1867	/// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1868	AnyCharExcept(char),
1869	/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1870	///
1871	/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1872	AnyCharExceptLF,
1873	/// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1874	/// and `\n`.
1875	///
1876	/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1877	AnyCharExceptCRLF,
1878	/// Matches any byte value except for the `u8` given.
1879	///
1880	/// This is equivalent to using `(?-us:.)` with the line terminator set
1881	/// to a particular ASCII byte. (Because of peculiarities in the regex
1882	/// engines, a line terminator must be a single byte. It follows that when
1883	/// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1884	/// value. That is, ti must be ASCII.)
1885	///
1886	/// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1887	/// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1888	AnyByteExcept(u8),
1889	/// Matches any byte value except for `\n`.
1890	///
1891	/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1892	AnyByteExceptLF,
1893	/// Matches any byte value except for `\r` and `\n`.
1894	///
1895	/// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1896	AnyByteExceptCRLF,
1897	}
1898
1899	/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1900	/// space but heap space proportional to the depth of the total `Hir`.
1901	impl Drop for Hir {
1902	fn drop(&mut self) {
1903	use core::mem;
1904
1905	match *self.kind() {
1906	HirKind::Empty
1907	\| HirKind::Literal(_)
1908	\| HirKind::Class(_)
1909	\| HirKind::Look(_) => return,
1910	HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1911	HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1912	return
1913	}
1914	HirKind::Concat(ref x) if x.is_empty() => return,
1915	HirKind::Alternation(ref x) if x.is_empty() => return,
1916	_ => {}
1917	}
1918
1919	let mut stack = vec![mem::replace(self, Hir::empty())];
1920	while let Some(mut expr) = stack.pop() {
1921	match expr.kind {
1922	HirKind::Empty
1923	\| HirKind::Literal(_)
1924	\| HirKind::Class(_)
1925	\| HirKind::Look(_) => {}
1926	HirKind::Capture(ref mut x) => {
1927	stack.push(mem::replace(&mut x.sub, Hir::empty()));
1928	}
1929	HirKind::Repetition(ref mut x) => {
1930	stack.push(mem::replace(&mut x.sub, Hir::empty()));
1931	}
1932	HirKind::Concat(ref mut x) => {
1933	stack.extend(x.drain(..));
1934	}
1935	HirKind::Alternation(ref mut x) => {
1936	stack.extend(x.drain(..));
1937	}
1938	}
1939	}
1940	}
1941	}
1942
1943	/// A type that collects various properties of an HIR value.
1944	///
1945	/// Properties are always scalar values and represent meta data that is
1946	/// computed inductively on an HIR value. Properties are defined for all
1947	/// HIR values.
1948	///
1949	/// All methods on a `Properties` value take constant time and are meant to
1950	/// be cheap to call.
1951	#[derive(Clone, Debug, Eq, PartialEq)]
1952	pub struct Properties(Box<PropertiesI>);
1953
1954	/// The property definition. It is split out so that we can box it, and
1955	/// there by make `Properties` use less stack size. This is kind-of important
1956	/// because every HIR value has a `Properties` attached to it.
1957	///
1958	/// This does have the unfortunate consequence that creating any HIR value
1959	/// always leads to at least one alloc for properties, but this is generally
1960	/// true anyway (for pretty much all HirKinds except for look-arounds).
1961	#[derive(Clone, Debug, Eq, PartialEq)]
1962	struct PropertiesI {
1963	minimum_len: Option<usize>,
1964	maximum_len: Option<usize>,
1965	look_set: LookSet,
1966	look_set_prefix: LookSet,
1967	look_set_suffix: LookSet,
1968	look_set_prefix_any: LookSet,
1969	look_set_suffix_any: LookSet,
1970	utf8: bool,
1971	explicit_captures_len: usize,
1972	static_explicit_captures_len: Option<usize>,
1973	literal: bool,
1974	alternation_literal: bool,
1975	}
1976
1977	impl Properties {
1978	/// Returns the length (in bytes) of the smallest string matched by this
1979	/// HIR.
1980	///
1981	/// A return value of `0` is possible and occurs when the HIR can match an
1982	/// empty string.
1983	///
1984	/// `None` is returned when there is no minimum length. This occurs in
1985	/// precisely the cases where the HIR matches nothing. i.e., The language
1986	/// the regex matches is empty. An example of such a regex is `\P{any}`.
1987	#[inline]
1988	pub fn minimum_len(&self) -> Option<usize> {
1989	self.0.minimum_len
1990	}
1991
1992	/// Returns the length (in bytes) of the longest string matched by this
1993	/// HIR.
1994	///
1995	/// A return value of `0` is possible and occurs when nothing longer than
1996	/// the empty string is in the language described by this HIR.
1997	///
1998	/// `None` is returned when there is no longest matching string. This
1999	/// occurs when the HIR matches nothing or when there is no upper bound on
2000	/// the length of matching strings. Example of such regexes are `\P{any}`
2001	/// (matches nothing) and `a+` (has no upper bound).
2002	#[inline]
2003	pub fn maximum_len(&self) -> Option<usize> {
2004	self.0.maximum_len
2005	}
2006
2007	/// Returns a set of all look-around assertions that appear at least once
2008	/// in this HIR value.
2009	#[inline]
2010	pub fn look_set(&self) -> LookSet {
2011	self.0.look_set
2012	}
2013
2014	/// Returns a set of all look-around assertions that appear as a prefix for
2015	/// this HIR value. That is, the set returned corresponds to the set of
2016	/// assertions that must be passed before matching any bytes in a haystack.
2017	///
2018	/// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2019	/// if and only if the HIR is fully anchored at the start.
2020	#[inline]
2021	pub fn look_set_prefix(&self) -> LookSet {
2022	self.0.look_set_prefix
2023	}
2024
2025	/// Returns a set of all look-around assertions that appear as a _possible_
2026	/// prefix for this HIR value. That is, the set returned corresponds to the
2027	/// set of assertions that _may_ be passed before matching any bytes in a
2028	/// haystack.
2029	///
2030	/// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2031	/// true if and only if it's possible for the regex to match through a
2032	/// anchored assertion before consuming any input.
2033	#[inline]
2034	pub fn look_set_prefix_any(&self) -> LookSet {
2035	self.0.look_set_prefix_any
2036	}
2037
2038	/// Returns a set of all look-around assertions that appear as a suffix for
2039	/// this HIR value. That is, the set returned corresponds to the set of
2040	/// assertions that must be passed in order to be considered a match after
2041	/// all other consuming HIR expressions.
2042	///
2043	/// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2044	/// if and only if the HIR is fully anchored at the end.
2045	#[inline]
2046	pub fn look_set_suffix(&self) -> LookSet {
2047	self.0.look_set_suffix
2048	}
2049
2050	/// Returns a set of all look-around assertions that appear as a _possible_
2051	/// suffix for this HIR value. That is, the set returned corresponds to the
2052	/// set of assertions that _may_ be passed before matching any bytes in a
2053	/// haystack.
2054	///
2055	/// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2056	/// true if and only if it's possible for the regex to match through a
2057	/// anchored assertion at the end of a match without consuming any input.
2058	#[inline]
2059	pub fn look_set_suffix_any(&self) -> LookSet {
2060	self.0.look_set_suffix_any
2061	}
2062
2063	/// Return true if and only if the corresponding HIR will always match
2064	/// valid UTF-8.
2065	///
2066	/// When this returns false, then it is possible for this HIR expression to
2067	/// match invalid UTF-8, including by matching between the code units of
2068	/// a single UTF-8 encoded codepoint.
2069	///
2070	/// Note that this returns true even when the corresponding HIR can match
2071	/// the empty string. Since an empty string can technically appear between
2072	/// UTF-8 code units, it is possible for a match to be reported that splits
2073	/// a codepoint which could in turn be considered matching invalid UTF-8.
2074	/// However, it is generally assumed that such empty matches are handled
2075	/// specially by the search routine if it is absolutely required that
2076	/// matches not split a codepoint.
2077	///
2078	/// # Example
2079	///
2080	/// This code example shows the UTF-8 property of a variety of patterns.
2081	///
2082	/// ```
2083	/// use regex_syntax::{ParserBuilder, parse};
2084	///
2085	/// // Examples of 'is_utf8() == true'.
2086	/// assert!(parse(r"a")?.properties().is_utf8());
2087	/// assert!(parse(r"[^a]")?.properties().is_utf8());
2088	/// assert!(parse(r".")?.properties().is_utf8());
2089	/// assert!(parse(r"\W")?.properties().is_utf8());
2090	/// assert!(parse(r"\b")?.properties().is_utf8());
2091	/// assert!(parse(r"\B")?.properties().is_utf8());
2092	/// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2093	/// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2094	/// // Unicode mode is enabled by default, and in
2095	/// // that mode, all \x hex escapes are treated as
2096	/// // codepoints. So this actually matches the UTF-8
2097	/// // encoding of U+00FF.
2098	/// assert!(parse(r"\xFF")?.properties().is_utf8());
2099	///
2100	/// // Now we show examples of 'is_utf8() == false'.
2101	/// // The only way to do this is to force the parser
2102	/// // to permit invalid UTF-8, otherwise all of these
2103	/// // would fail to parse!
2104	/// let parse = \|pattern\| {
2105	/// ParserBuilder::new().utf8(`false`).build().parse(pattern)
2106	/// };
2107	/// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2108	/// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2109	/// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2110	/// // Conversely to the equivalent example above,
2111	/// // when Unicode mode is disabled, \x hex escapes
2112	/// // are treated as their raw byte values.
2113	/// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2114	/// // Note that just because we disabled UTF-8 in the
2115	/// // parser doesn't mean we still can't use Unicode.
2116	/// // It is enabled by default, so \xFF is still
2117	/// // equivalent to matching the UTF-8 encoding of
2118	/// // U+00FF by default.
2119	/// assert!(parse(r"\xFF")?.properties().is_utf8());
2120	/// // Even though we use raw bytes that individually
2121	/// // are not valid UTF-8, when combined together, the
2122	/// // overall expression does* match valid UTF-8!*
2123	/// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2124	///
2125	/// # Ok::<(), Box<dyn std::error::Error>>(())
2126	/// ```
2127	#[inline]
2128	pub fn is_utf8(&self) -> bool {
2129	self.0.utf8
2130	}
2131
2132	/// Returns the total number of explicit capturing groups in the
2133	/// corresponding HIR.
2134	///
2135	/// Note that this does not include the implicit capturing group
2136	/// corresponding to the entire match that is typically included by regex
2137	/// engines.
2138	///
2139	/// # Example
2140	///
2141	/// This method will return `0` for `a` and `1` for `(a)`:
2142	///
2143	/// ```
2144	/// use regex_syntax::parse;
2145	///
2146	/// assert_eq!(`0`, parse("a")?.properties().explicit_captures_len());
2147	/// assert_eq!(`1`, parse("(a)")?.properties().explicit_captures_len());
2148	///
2149	/// # Ok::<(), Box<dyn std::error::Error>>(())
2150	/// ```
2151	#[inline]
2152	pub fn explicit_captures_len(&self) -> usize {
2153	self.0.explicit_captures_len
2154	}
2155
2156	/// Returns the total number of explicit capturing groups that appear in
2157	/// every possible match.
2158	///
2159	/// If the number of capture groups can vary depending on the match, then
2160	/// this returns `None`. That is, a value is only returned when the number
2161	/// of matching groups is invariant or "static."
2162	///
2163	/// Note that this does not include the implicit capturing group
2164	/// corresponding to the entire match.
2165	///
2166	/// # Example
2167	///
2168	/// This shows a few cases where a static number of capture groups is
2169	/// available and a few cases where it is not.
2170	///
2171	/// ```
2172	/// use regex_syntax::parse;
2173	///
2174	/// let len = \|pattern\| {
2175	/// parse(pattern).map(\|h\| {
2176	/// h.properties().static_explicit_captures_len()
2177	/// })
2178	/// };
2179	///
2180	/// assert_eq!(Some(`0`), len("a")?);
2181	/// assert_eq!(Some(`1`), len("(a)")?);
2182	/// assert_eq!(Some(`1`), len("(a)\|(b)")?);
2183	/// assert_eq!(Some(`2`), len("(a)(b)\|(c)(d)")?);
2184	/// assert_eq!(None, len("(a)\|b")?);
2185	/// assert_eq!(None, len("a\|(b)")?);
2186	/// assert_eq!(None, len("(b)*")?);
2187	/// assert_eq!(Some(`1`), len("(b)+")?);
2188	///
2189	/// # Ok::<(), Box<dyn std::error::Error>>(())
2190	/// ```
2191	#[inline]
2192	pub fn static_explicit_captures_len(&self) -> Option<usize> {
2193	self.0.static_explicit_captures_len
2194	}
2195
2196	/// Return true if and only if this HIR is a simple literal. This is
2197	/// only true when this HIR expression is either itself a `Literal` or a
2198	/// concatenation of only `Literal`s.
2199	///
2200	/// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2201	/// the empty string are not (even though they contain sub-expressions that
2202	/// are literals).
2203	#[inline]
2204	pub fn is_literal(&self) -> bool {
2205	self.0.literal
2206	}
2207
2208	/// Return true if and only if this HIR is either a simple literal or an
2209	/// alternation of simple literals. This is only
2210	/// true when this HIR expression is either itself a `Literal` or a
2211	/// concatenation of only `Literal`s or an alternation of only `Literal`s.
2212	///
2213	/// For example, `f`, `foo`, `a\|b\|c`, and `foo\|bar\|baz` are alternation
2214	/// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2215	/// (even though that contain sub-expressions that are literals).
2216	#[inline]
2217	pub fn is_alternation_literal(&self) -> bool {
2218	self.0.alternation_literal
2219	}
2220
2221	/// Returns the total amount of heap memory usage, in bytes, used by this
2222	/// `Properties` value.
2223	#[inline]
2224	pub fn memory_usage(&self) -> usize {
2225	core::mem::size_of::<PropertiesI>()
2226	}
2227
2228	/// Returns a new set of properties that corresponds to the union of the
2229	/// iterator of properties given.
2230	///
2231	/// This is useful when one has multiple `Hir` expressions and wants
2232	/// to combine them into a single alternation without constructing the
2233	/// corresponding `Hir`. This routine provides a way of combining the
2234	/// properties of each `Hir` expression into one set of properties
2235	/// representing the union of those expressions.
2236	///
2237	/// # Example: union with HIRs that never match
2238	///
2239	/// This example shows that unioning properties together with one that
2240	/// represents a regex that never matches will "poison" certain attributes,
2241	/// like the minimum and maximum lengths.
2242	///
2243	/// ```
2244	/// use regex_syntax::{hir::Properties, parse};
2245	///
2246	/// let hir1 = parse("ab?c?")?;
2247	/// assert_eq!(Some(`1`), hir1.properties().minimum_len());
2248	/// assert_eq!(Some(`3`), hir1.properties().maximum_len());
2249	///
2250	/// let hir2 = parse(r"[a&&b]")?;
2251	/// assert_eq!(None, hir2.properties().minimum_len());
2252	/// assert_eq!(None, hir2.properties().maximum_len());
2253	///
2254	/// let hir3 = parse(r"wxy?z?")?;
2255	/// assert_eq!(Some(`2`), hir3.properties().minimum_len());
2256	/// assert_eq!(Some(`4`), hir3.properties().maximum_len());
2257	///
2258	/// let unioned = Properties::union([
2259	/// hir1.properties(),
2260	/// hir2.properties(),
2261	/// hir3.properties(),
2262	/// ]);
2263	/// assert_eq!(None, unioned.minimum_len());
2264	/// assert_eq!(None, unioned.maximum_len());
2265	///
2266	/// # Ok::<(), Box<dyn std::error::Error>>(())
2267	/// ```
2268	///
2269	/// The maximum length can also be "poisoned" by a pattern that has no
2270	/// upper bound on the length of a match. The minimum length remains
2271	/// unaffected:
2272	///
2273	/// ```
2274	/// use regex_syntax::{hir::Properties, parse};
2275	///
2276	/// let hir1 = parse("ab?c?")?;
2277	/// assert_eq!(Some(`1`), hir1.properties().minimum_len());
2278	/// assert_eq!(Some(`3`), hir1.properties().maximum_len());
2279	///
2280	/// let hir2 = parse(r"a+")?;
2281	/// assert_eq!(Some(`1`), hir2.properties().minimum_len());
2282	/// assert_eq!(None, hir2.properties().maximum_len());
2283	///
2284	/// let hir3 = parse(r"wxy?z?")?;
2285	/// assert_eq!(Some(`2`), hir3.properties().minimum_len());
2286	/// assert_eq!(Some(`4`), hir3.properties().maximum_len());
2287	///
2288	/// let unioned = Properties::union([
2289	/// hir1.properties(),
2290	/// hir2.properties(),
2291	/// hir3.properties(),
2292	/// ]);
2293	/// assert_eq!(Some(`1`), unioned.minimum_len());
2294	/// assert_eq!(None, unioned.maximum_len());
2295	///
2296	/// # Ok::<(), Box<dyn std::error::Error>>(())
2297	/// ```
2298	pub fn union<I, P>(props: I) -> Properties
2299	where
2300	I: IntoIterator<Item = P>,
2301	P: core::borrow::Borrow<Properties>,
2302	{
2303	let mut it = props.into_iter().peekable();
2304	// While empty alternations aren't possible, we still behave as if they
2305	// are. When we have an empty alternate, then clearly the look-around
2306	// prefix and suffix is empty. Otherwise, it is the intersection of all
2307	// prefixes and suffixes (respectively) of the branches.
2308	let fix = if it.peek().is_none() {
2309	LookSet::empty()
2310	} else {
2311	LookSet::full()
2312	};
2313	// And also, an empty alternate means we have 0 static capture groups,
2314	// but we otherwise start with the number corresponding to the first
2315	// alternate. If any subsequent alternate has a different number of
2316	// static capture groups, then we overall have a variation and not a
2317	// static number of groups.
2318	let static_explicit_captures_len =
2319	it.peek().and_then(\|p\| p.borrow().static_explicit_captures_len());
2320	// The base case is an empty alternation, which matches nothing.
2321	// Note though that empty alternations aren't possible, because the
2322	// Hir::alternation smart constructor rewrites those as empty character
2323	// classes.
2324	let mut props = PropertiesI {
2325	minimum_len: None,
2326	maximum_len: None,
2327	look_set: LookSet::empty(),
2328	look_set_prefix: fix,
2329	look_set_suffix: fix,
2330	look_set_prefix_any: LookSet::empty(),
2331	look_set_suffix_any: LookSet::empty(),
2332	utf8: `true`,
2333	explicit_captures_len: `0`,
2334	static_explicit_captures_len,
2335	literal: `false`,
2336	alternation_literal: `true`,
2337	};
2338	let (mut min_poisoned, mut max_poisoned) = (`false`, `false`);
2339	// Handle properties that need to visit every child hir.
2340	for prop in it {
2341	let p = prop.borrow();
2342	props.look_set.set_union(p.look_set());
2343	props.look_set_prefix.set_intersect(p.look_set_prefix());
2344	props.look_set_suffix.set_intersect(p.look_set_suffix());
2345	props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2346	props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2347	props.utf8 = props.utf8 && p.is_utf8();
2348	props.explicit_captures_len = props
2349	.explicit_captures_len
2350	.saturating_add(p.explicit_captures_len());
2351	if props.static_explicit_captures_len
2352	!= p.static_explicit_captures_len()
2353	{
2354	props.static_explicit_captures_len = None;
2355	}
2356	props.alternation_literal =
2357	props.alternation_literal && p.is_literal();
2358	if !min_poisoned {
2359	if let Some(xmin) = p.minimum_len() {
2360	if props.minimum_len.map_or(`true`, \|pmin\| xmin < pmin) {
2361	props.minimum_len = Some(xmin);
2362	}
2363	} else {
2364	props.minimum_len = None;
2365	min_poisoned = `true`;
2366	}
2367	}
2368	if !max_poisoned {
2369	if let Some(xmax) = p.maximum_len() {
2370	if props.maximum_len.map_or(`true`, \|pmax\| xmax > pmax) {
2371	props.maximum_len = Some(xmax);
2372	}
2373	} else {
2374	props.maximum_len = None;
2375	max_poisoned = `true`;
2376	}
2377	}
2378	}
2379	Properties(Box::new(props))
2380	}
2381	}
2382
2383	impl Properties {
2384	/// Create a new set of HIR properties for an empty regex.
2385	fn empty() -> Properties {
2386	let inner = PropertiesI {
2387	minimum_len: Some(`0`),
2388	maximum_len: Some(`0`),
2389	look_set: LookSet::empty(),
2390	look_set_prefix: LookSet::empty(),
2391	look_set_suffix: LookSet::empty(),
2392	look_set_prefix_any: LookSet::empty(),
2393	look_set_suffix_any: LookSet::empty(),
2394	// It is debatable whether an empty regex always matches at valid
2395	// UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2396	// it is clearly false. There are, for example, many empty strings
2397	// between the bytes encoding a '☃'.
2398	//
2399	// However, when Unicode mode is enabled, the fundamental atom
2400	// of matching is really a codepoint. And in that scenario, an
2401	// empty regex is defined to only match at valid UTF-8 boundaries
2402	// and to never split a codepoint. It just so happens that this
2403	// enforcement is somewhat tricky to do for regexes that match
2404	// the empty string inside regex engines themselves. It usually
2405	// requires some layer above the regex engine to filter out such
2406	// matches.
2407	//
2408	// In any case, 'true' is really the only coherent option. If it
2409	// were false, for example, then 'a' would also need to be false*
2410	// since it too can match the empty string.
2411	utf8: `true`,
2412	explicit_captures_len: `0`,
2413	static_explicit_captures_len: Some(`0`),
2414	literal: `false`,
2415	alternation_literal: `false`,
2416	};
2417	Properties(Box::new(inner))
2418	}
2419
2420	/// Create a new set of HIR properties for a literal regex.
2421	fn literal(lit: &Literal) -> Properties {
2422	let inner = PropertiesI {
2423	minimum_len: Some(lit.0.len()),
2424	maximum_len: Some(lit.0.len()),
2425	look_set: LookSet::empty(),
2426	look_set_prefix: LookSet::empty(),
2427	look_set_suffix: LookSet::empty(),
2428	look_set_prefix_any: LookSet::empty(),
2429	look_set_suffix_any: LookSet::empty(),
2430	utf8: core::str::from_utf8(&lit.0).is_ok(),
2431	explicit_captures_len: `0`,
2432	static_explicit_captures_len: Some(`0`),
2433	literal: `true`,
2434	alternation_literal: `true`,
2435	};
2436	Properties(Box::new(inner))
2437	}
2438
2439	/// Create a new set of HIR properties for a character class.
2440	fn class(class: &Class) -> Properties {
2441	let inner = PropertiesI {
2442	minimum_len: class.minimum_len(),
2443	maximum_len: class.maximum_len(),
2444	look_set: LookSet::empty(),
2445	look_set_prefix: LookSet::empty(),
2446	look_set_suffix: LookSet::empty(),
2447	look_set_prefix_any: LookSet::empty(),
2448	look_set_suffix_any: LookSet::empty(),
2449	utf8: class.is_utf8(),
2450	explicit_captures_len: `0`,
2451	static_explicit_captures_len: Some(`0`),
2452	literal: `false`,
2453	alternation_literal: `false`,
2454	};
2455	Properties(Box::new(inner))
2456	}
2457
2458	/// Create a new set of HIR properties for a look-around assertion.
2459	fn look(look: Look) -> Properties {
2460	let inner = PropertiesI {
2461	minimum_len: Some(`0`),
2462	maximum_len: Some(`0`),
2463	look_set: LookSet::singleton(look),
2464	look_set_prefix: LookSet::singleton(look),
2465	look_set_suffix: LookSet::singleton(look),
2466	look_set_prefix_any: LookSet::singleton(look),
2467	look_set_suffix_any: LookSet::singleton(look),
2468	// This requires a little explanation. Basically, we don't consider
2469	// matching an empty string to be equivalent to matching invalid
2470	// UTF-8, even though technically matching every empty string will
2471	// split the UTF-8 encoding of a single codepoint when treating a
2472	// UTF-8 encoded string as a sequence of bytes. Our defense here is
2473	// that in such a case, a codepoint should logically be treated as
2474	// the fundamental atom for matching, and thus the only valid match
2475	// points are between codepoints and not bytes.
2476	//
2477	// More practically, this is true here because it's also true
2478	// for 'Hir::empty()', otherwise something like 'a' would be*
2479	// considered to match invalid UTF-8. That in turn makes this
2480	// property borderline useless.
2481	utf8: `true`,
2482	explicit_captures_len: `0`,
2483	static_explicit_captures_len: Some(`0`),
2484	literal: `false`,
2485	alternation_literal: `false`,
2486	};
2487	Properties(Box::new(inner))
2488	}
2489
2490	/// Create a new set of HIR properties for a repetition.
2491	fn repetition(rep: &Repetition) -> Properties {
2492	let p = rep.sub.properties();
2493	let minimum_len = p.minimum_len().map(\|child_min\| {
2494	let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2495	child_min.saturating_mul(rep_min)
2496	});
2497	let maximum_len = rep.max.and_then(\|rep_max\| {
2498	let rep_max = usize::try_from(rep_max).ok()?;
2499	let child_max = p.maximum_len()?;
2500	child_max.checked_mul(rep_max)
2501	});
2502
2503	let mut inner = PropertiesI {
2504	minimum_len,
2505	maximum_len,
2506	look_set: p.look_set(),
2507	look_set_prefix: LookSet::empty(),
2508	look_set_suffix: LookSet::empty(),
2509	look_set_prefix_any: p.look_set_prefix_any(),
2510	look_set_suffix_any: p.look_set_suffix_any(),
2511	utf8: p.is_utf8(),
2512	explicit_captures_len: p.explicit_captures_len(),
2513	static_explicit_captures_len: p.static_explicit_captures_len(),
2514	literal: `false`,
2515	alternation_literal: `false`,
2516	};
2517	// If the repetition operator can match the empty string, then its
2518	// lookset prefix and suffixes themselves remain empty since they are
2519	// no longer required to match.
2520	if rep.min > `0` {
2521	inner.look_set_prefix = p.look_set_prefix();
2522	inner.look_set_suffix = p.look_set_suffix();
2523	}
2524	// If the static captures len of the sub-expression is not known or
2525	// is greater than zero, then it automatically propagates to the
2526	// repetition, regardless of the repetition. Otherwise, it might
2527	// change, but only when the repetition can match 0 times.
2528	if rep.min == `0`
2529	&& inner.static_explicit_captures_len.map_or(`false`, \|len\| len > `0`)
2530	{
2531	// If we require a match 0 times, then our captures len is
2532	// guaranteed to be zero. Otherwise, if we can* match the empty*
2533	// string, then it's impossible to know how many captures will be
2534	// in the resulting match.
2535	if rep.max == Some(`0`) {
2536	inner.static_explicit_captures_len = Some(`0`);
2537	} else {
2538	inner.static_explicit_captures_len = None;
2539	}
2540	}
2541	Properties(Box::new(inner))
2542	}
2543
2544	/// Create a new set of HIR properties for a capture.
2545	fn capture(capture: &Capture) -> Properties {
2546	let p = capture.sub.properties();
2547	Properties(Box::new(PropertiesI {
2548	explicit_captures_len: p.explicit_captures_len().saturating_add(`1`),
2549	static_explicit_captures_len: p
2550	.static_explicit_captures_len()
2551	.map(\|len\| len.saturating_add(`1`)),
2552	literal: `false`,
2553	alternation_literal: `false`,
2554	..*p.0.clone()
2555	}))
2556	}
2557
2558	/// Create a new set of HIR properties for a concatenation.
2559	fn concat(concat: &[Hir]) -> Properties {
2560	// The base case is an empty concatenation, which matches the empty
2561	// string. Note though that empty concatenations aren't possible,
2562	// because the Hir::concat smart constructor rewrites those as
2563	// Hir::empty.
2564	let mut props = PropertiesI {
2565	minimum_len: Some(`0`),
2566	maximum_len: Some(`0`),
2567	look_set: LookSet::empty(),
2568	look_set_prefix: LookSet::empty(),
2569	look_set_suffix: LookSet::empty(),
2570	look_set_prefix_any: LookSet::empty(),
2571	look_set_suffix_any: LookSet::empty(),
2572	utf8: `true`,
2573	explicit_captures_len: `0`,
2574	static_explicit_captures_len: Some(`0`),
2575	literal: `true`,
2576	alternation_literal: `true`,
2577	};
2578	// Handle properties that need to visit every child hir.
2579	for x in concat.iter() {
2580	let p = x.properties();
2581	props.look_set.set_union(p.look_set());
2582	props.utf8 = props.utf8 && p.is_utf8();
2583	props.explicit_captures_len = props
2584	.explicit_captures_len
2585	.saturating_add(p.explicit_captures_len());
2586	props.static_explicit_captures_len = p
2587	.static_explicit_captures_len()
2588	.and_then(\|len1\| {
2589	Some((len1, props.static_explicit_captures_len?))
2590	})
2591	.and_then(\|(len1, len2)\| Some(len1.saturating_add(len2)));
2592	props.literal = props.literal && p.is_literal();
2593	props.alternation_literal =
2594	props.alternation_literal && p.is_alternation_literal();
2595	if let Some(minimum_len) = props.minimum_len {
2596	match p.minimum_len() {
2597	None => props.minimum_len = None,
2598	Some(len) => {
2599	// We use saturating arithmetic here because the
2600	// minimum is just a lower bound. We can't go any
2601	// higher than what our number types permit.
2602	props.minimum_len =
2603	Some(minimum_len.saturating_add(len));
2604	}
2605	}
2606	}
2607	if let Some(maximum_len) = props.maximum_len {
2608	match p.maximum_len() {
2609	None => props.maximum_len = None,
2610	Some(len) => {
2611	props.maximum_len = maximum_len.checked_add(len)
2612	}
2613	}
2614	}
2615	}
2616	// Handle the prefix properties, which only requires visiting
2617	// child exprs until one matches more than the empty string.
2618	let mut it = concat.iter();
2619	while let Some(x) = it.next() {
2620	props.look_set_prefix.set_union(x.properties().look_set_prefix());
2621	props
2622	.look_set_prefix_any
2623	.set_union(x.properties().look_set_prefix_any());
2624	if x.properties().maximum_len().map_or(`true`, \|x\| x > `0`) {
2625	break;
2626	}
2627	}
2628	// Same thing for the suffix properties, but in reverse.
2629	let mut it = concat.iter().rev();
2630	while let Some(x) = it.next() {
2631	props.look_set_suffix.set_union(x.properties().look_set_suffix());
2632	props
2633	.look_set_suffix_any
2634	.set_union(x.properties().look_set_suffix_any());
2635	if x.properties().maximum_len().map_or(`true`, \|x\| x > `0`) {
2636	break;
2637	}
2638	}
2639	Properties(Box::new(props))
2640	}
2641
2642	/// Create a new set of HIR properties for a concatenation.
2643	fn alternation(alts: &[Hir]) -> Properties {
2644	Properties::union(alts.iter().map(\|hir\| hir.properties()))
2645	}
2646	}
2647
2648	/// A set of look-around assertions.
2649	///
2650	/// This is useful for efficiently tracking look-around assertions. For
2651	/// example, an [`Hir`] provides properties that return `LookSet`s.
2652	#[derive(Clone, Copy, Default, Eq, PartialEq)]
2653	pub struct LookSet {
2654	/// The underlying representation this set is exposed to make it possible
2655	/// to store it somewhere efficiently. The representation is that
2656	/// of a bitset, where each assertion occupies bit `i` where `i =
2657	/// Look::as_repr()`.
2658	///
2659	/// Note that users of this internal representation must permit the full
2660	/// range of `u16` values to be represented. For example, even if the
2661	/// current implementation only makes use of the 10 least significant bits,
2662	/// it may use more bits in a future semver compatible release.
2663	pub bits: u32,
2664	}
2665
2666	impl LookSet {
2667	/// Create an empty set of look-around assertions.
2668	#[inline]
2669	pub fn empty() -> LookSet {
2670	LookSet { bits: `0` }
2671	}
2672
2673	/// Create a full set of look-around assertions.
2674	///
2675	/// This set contains all possible look-around assertions.
2676	#[inline]
2677	pub fn full() -> LookSet {
2678	LookSet { bits: !`0` }
2679	}
2680
2681	/// Create a look-around set containing the look-around assertion given.
2682	///
2683	/// This is a convenience routine for creating an empty set and inserting
2684	/// one look-around assertions.
2685	#[inline]
2686	pub fn singleton(look: Look) -> LookSet {
2687	LookSet::empty().insert(look)
2688	}
2689
2690	/// Returns the total number of look-around assertions in this set.
2691	#[inline]
2692	pub fn len(self) -> usize {
2693	// OK because max value always fits in a u8, which in turn always
2694	// fits in a usize, regardless of target.
2695	usize::try_from(self.bits.count_ones()).unwrap()
2696	}
2697
2698	/// Returns true if and only if this set is empty.
2699	#[inline]
2700	pub fn is_empty(self) -> bool {
2701	self.len() == `0`
2702	}
2703
2704	/// Returns true if and only if the given look-around assertion is in this
2705	/// set.
2706	#[inline]
2707	pub fn contains(self, look: Look) -> bool {
2708	self.bits & look.as_repr() != `0`
2709	}
2710
2711	/// Returns true if and only if this set contains any anchor assertions.
2712	/// This includes both "start/end of haystack" and "start/end of line."
2713	#[inline]
2714	pub fn contains_anchor(&self) -> bool {
2715	self.contains_anchor_haystack() \|\| self.contains_anchor_line()
2716	}
2717
2718	/// Returns true if and only if this set contains any "start/end of
2719	/// haystack" anchors. This doesn't include "start/end of line" anchors.
2720	#[inline]
2721	pub fn contains_anchor_haystack(&self) -> bool {
2722	self.contains(Look::Start) \|\| self.contains(Look::End)
2723	}
2724
2725	/// Returns true if and only if this set contains any "start/end of line"
2726	/// anchors. This doesn't include "start/end of haystack" anchors. This
2727	/// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2728	#[inline]
2729	pub fn contains_anchor_line(&self) -> bool {
2730	self.contains(Look::StartLF)
2731	\|\| self.contains(Look::EndLF)
2732	\|\| self.contains(Look::StartCRLF)
2733	\|\| self.contains(Look::EndCRLF)
2734	}
2735
2736	/// Returns true if and only if this set contains any "start/end of line"
2737	/// anchors that only treat `\n` as line terminators. This does not include
2738	/// haystack anchors or CRLF aware line anchors.
2739	#[inline]
2740	pub fn contains_anchor_lf(&self) -> bool {
2741	self.contains(Look::StartLF) \|\| self.contains(Look::EndLF)
2742	}
2743
2744	/// Returns true if and only if this set contains any "start/end of line"
2745	/// anchors that are CRLF-aware. This doesn't include "start/end of
2746	/// haystack" or "start/end of line-feed" anchors.
2747	#[inline]
2748	pub fn contains_anchor_crlf(&self) -> bool {
2749	self.contains(Look::StartCRLF) \|\| self.contains(Look::EndCRLF)
2750	}
2751
2752	/// Returns true if and only if this set contains any word boundary or
2753	/// negated word boundary assertions. This include both Unicode and ASCII
2754	/// word boundaries.
2755	#[inline]
2756	pub fn contains_word(self) -> bool {
2757	self.contains_word_unicode() \|\| self.contains_word_ascii()
2758	}
2759
2760	/// Returns true if and only if this set contains any Unicode word boundary
2761	/// or negated Unicode word boundary assertions.
2762	#[inline]
2763	pub fn contains_word_unicode(self) -> bool {
2764	self.contains(Look::WordUnicode)
2765	\|\| self.contains(Look::WordUnicodeNegate)
2766	\|\| self.contains(Look::WordStartUnicode)
2767	\|\| self.contains(Look::WordEndUnicode)
2768	\|\| self.contains(Look::WordStartHalfUnicode)
2769	\|\| self.contains(Look::WordEndHalfUnicode)
2770	}
2771
2772	/// Returns true if and only if this set contains any ASCII word boundary
2773	/// or negated ASCII word boundary assertions.
2774	#[inline]
2775	pub fn contains_word_ascii(self) -> bool {
2776	self.contains(Look::WordAscii)
2777	\|\| self.contains(Look::WordAsciiNegate)
2778	\|\| self.contains(Look::WordStartAscii)
2779	\|\| self.contains(Look::WordEndAscii)
2780	\|\| self.contains(Look::WordStartHalfAscii)
2781	\|\| self.contains(Look::WordEndHalfAscii)
2782	}
2783
2784	/// Returns an iterator over all of the look-around assertions in this set.
2785	#[inline]
2786	pub fn iter(self) -> LookSetIter {
2787	LookSetIter { set: self }
2788	}
2789
2790	/// Return a new set that is equivalent to the original, but with the given
2791	/// assertion added to it. If the assertion is already in the set, then the
2792	/// returned set is equivalent to the original.
2793	#[inline]
2794	pub fn insert(self, look: Look) -> LookSet {
2795	LookSet { bits: self.bits \| look.as_repr() }
2796	}
2797
2798	/// Updates this set in place with the result of inserting the given
2799	/// assertion into this set.
2800	#[inline]
2801	pub fn set_insert(&mut self, look: Look) {
2802	*self = self.insert(look);
2803	}
2804
2805	/// Return a new set that is equivalent to the original, but with the given
2806	/// assertion removed from it. If the assertion is not in the set, then the
2807	/// returned set is equivalent to the original.
2808	#[inline]
2809	pub fn remove(self, look: Look) -> LookSet {
2810	LookSet { bits: self.bits & !look.as_repr() }
2811	}
2812
2813	/// Updates this set in place with the result of removing the given
2814	/// assertion from this set.
2815	#[inline]
2816	pub fn set_remove(&mut self, look: Look) {
2817	*self = self.remove(look);
2818	}
2819
2820	/// Returns a new set that is the result of subtracting the given set from
2821	/// this set.
2822	#[inline]
2823	pub fn subtract(self, other: LookSet) -> LookSet {
2824	LookSet { bits: self.bits & !other.bits }
2825	}
2826
2827	/// Updates this set in place with the result of subtracting the given set
2828	/// from this set.
2829	#[inline]
2830	pub fn set_subtract(&mut self, other: LookSet) {
2831	*self = self.subtract(other);
2832	}
2833
2834	/// Returns a new set that is the union of this and the one given.
2835	#[inline]
2836	pub fn union(self, other: LookSet) -> LookSet {
2837	LookSet { bits: self.bits \| other.bits }
2838	}
2839
2840	/// Updates this set in place with the result of unioning it with the one
2841	/// given.
2842	#[inline]
2843	pub fn set_union(&mut self, other: LookSet) {
2844	*self = self.union(other);
2845	}
2846
2847	/// Returns a new set that is the intersection of this and the one given.
2848	#[inline]
2849	pub fn intersect(self, other: LookSet) -> LookSet {
2850	LookSet { bits: self.bits & other.bits }
2851	}
2852
2853	/// Updates this set in place with the result of intersecting it with the
2854	/// one given.
2855	#[inline]
2856	pub fn set_intersect(&mut self, other: LookSet) {
2857	*self = self.intersect(other);
2858	}
2859
2860	/// Return a `LookSet` from the slice given as a native endian 32-bit
2861	/// integer.
2862	///
2863	/// # Panics
2864	///
2865	/// This panics if `slice.len() < 4`.
2866	#[inline]
2867	pub fn read_repr(slice: &[u8]) -> LookSet {
2868	let bits = u32::from_ne_bytes(slice[..`4`].try_into().unwrap());
2869	LookSet { bits }
2870	}
2871
2872	/// Write a `LookSet` as a native endian 32-bit integer to the beginning
2873	/// of the slice given.
2874	///
2875	/// # Panics
2876	///
2877	/// This panics if `slice.len() < 4`.
2878	#[inline]
2879	pub fn write_repr(self, slice: &mut [u8]) {
2880	let raw = self.bits.to_ne_bytes();
2881	slice[`0`] = raw[`0`];
2882	slice[`1`] = raw[`1`];
2883	slice[`2`] = raw[`2`];
2884	slice[`3`] = raw[`3`];
2885	}
2886	}
2887
2888	impl core::fmt::Debug for LookSet {
2889	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2890	if self.is_empty() {
2891	return write!(f, "∅");
2892	}
2893	for look: Look in self.iter() {
2894	write!(f, "{}", look.as_char())?;
2895	}
2896	Ok(())
2897	}
2898	}
2899
2900	/// An iterator over all look-around assertions in a [`LookSet`].
2901	///
2902	/// This iterator is created by [`LookSet::iter`].
2903	#[derive(Clone, Debug)]
2904	pub struct LookSetIter {
2905	set: LookSet,
2906	}
2907
2908	impl Iterator for LookSetIter {
2909	type Item = Look;
2910
2911	#[inline]
2912	fn next(&mut self) -> Option<Look> {
2913	if self.set.is_empty() {
2914	return None;
2915	}
2916	// We'll never have more than u8::MAX distinct look-around assertions,
2917	// so 'bit' will always fit into a u16.
2918	let bit: u16 = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2919	let look: Look = Look::from_repr(`1` << bit)?;
2920	self.set = self.set.remove(look);
2921	Some(look)
2922	}
2923	}
2924
2925	/// Given a sequence of HIR values where each value corresponds to a Unicode
2926	/// class (or an all-ASCII byte class), return a single Unicode class
2927	/// corresponding to the union of the classes found.
2928	fn class_chars(hirs: &[Hir]) -> Option<Class> {
2929	let mut cls: ClassUnicode = ClassUnicode::new(ranges:vec![]);
2930	for hir: &Hir in hirs.iter() {
2931	match *hir.kind() {
2932	HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2933	cls.union(cls2);
2934	}
2935	HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2936	cls.union(&cls2.to_unicode_class()?);
2937	}
2938	_ => return None,
2939	};
2940	}
2941	Some(Class::Unicode(cls))
2942	}
2943
2944	/// Given a sequence of HIR values where each value corresponds to a byte class
2945	/// (or an all-ASCII Unicode class), return a single byte class corresponding
2946	/// to the union of the classes found.
2947	fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2948	let mut cls: ClassBytes = ClassBytes::new(ranges:vec![]);
2949	for hir: &Hir in hirs.iter() {
2950	match *hir.kind() {
2951	HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2952	cls.union(&cls2.to_byte_class()?);
2953	}
2954	HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2955	cls.union(cls2);
2956	}
2957	_ => return None,
2958	};
2959	}
2960	Some(Class::Bytes(cls))
2961	}
2962
2963	/// Given a sequence of HIR values where each value corresponds to a literal
2964	/// that is a single `char`, return that sequence of `char`s. Otherwise return
2965	/// None. No deduplication is done.
2966	fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2967	let mut singletons: Vec = vec![];
2968	for hir: &Hir in hirs.iter() {
2969	let literal: &Box<[u8]> = match *hir.kind() {
2970	HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2971	_ => return None,
2972	};
2973	let ch: char = match crate::debug::utf8_decode(bytes:literal) {
2974	None => return None,
2975	Some(Err(_)) => return None,
2976	Some(Ok(ch: char)) => ch,
2977	};
2978	if literal.len() != ch.len_utf8() {
2979	return None;
2980	}
2981	singletons.push(ch);
2982	}
2983	Some(singletons)
2984	}
2985
2986	/// Given a sequence of HIR values where each value corresponds to a literal
2987	/// that is a single byte, return that sequence of bytes. Otherwise return
2988	/// None. No deduplication is done.
2989	fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
2990	let mut singletons: Vec = vec![];
2991	for hir: &Hir in hirs.iter() {
2992	let literal: &Box<[u8]> = match *hir.kind() {
2993	HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2994	_ => return None,
2995	};
2996	if literal.len() != `1` {
2997	return None;
2998	}
2999	singletons.push(literal[`0`]);
3000	}
3001	Some(singletons)
3002	}
3003
3004	/// Looks for a common prefix in the list of alternation branches given. If one
3005	/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3006	/// Otherwise, the original given list of branches is returned unmodified.
3007	///
3008	/// This is not quite as good as it could be. Right now, it requires that
3009	/// all branches are 'Concat' expressions. It also doesn't do well with
3010	/// literals. For example, given 'foofoo\|foobar', it will not refactor it to
3011	/// 'foo(?:foo\|bar)' because literals are flattened into their own special
3012	/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3013	/// instead of a string of bytes because of this. Otherwise, handling the
3014	/// current representation in this routine will be pretty gnarly. Sigh.)
3015	fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3016	if hirs.len() <= `1` {
3017	return Err(hirs);
3018	}
3019	let mut prefix = match hirs[`0`].kind() {
3020	HirKind::Concat(ref xs) => &**xs,
3021	_ => return Err(hirs),
3022	};
3023	if prefix.is_empty() {
3024	return Err(hirs);
3025	}
3026	for h in hirs.iter().skip(`1`) {
3027	let concat = match h.kind() {
3028	HirKind::Concat(ref xs) => xs,
3029	_ => return Err(hirs),
3030	};
3031	let common_len = prefix
3032	.iter()
3033	.zip(concat.iter())
3034	.take_while(\|(x, y)\| x == y)
3035	.count();
3036	prefix = &prefix[..common_len];
3037	if prefix.is_empty() {
3038	return Err(hirs);
3039	}
3040	}
3041	let len = prefix.len();
3042	assert_ne!(`0`, len);
3043	let mut prefix_concat = vec![];
3044	let mut suffix_alts = vec![];
3045	for h in hirs {
3046	let mut concat = match h.into_kind() {
3047	HirKind::Concat(xs) => xs,
3048	// We required all sub-expressions to be
3049	// concats above, so we're only here if we
3050	// have a concat.
3051	_ => unreachable!(),
3052	};
3053	suffix_alts.push(Hir::concat(concat.split_off(len)));
3054	if prefix_concat.is_empty() {
3055	prefix_concat = concat;
3056	}
3057	}
3058	let mut concat = prefix_concat;
3059	concat.push(Hir::alternation(suffix_alts));
3060	Ok(Hir::concat(concat))
3061	}
3062
3063	#[cfg(test)]
3064	mod tests {
3065	use super::*;
3066
3067	fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3068	let ranges: Vec<ClassUnicodeRange> = ranges
3069	.iter()
3070	.map(\|&(s, e)\| ClassUnicodeRange::new(s, e))
3071	.collect();
3072	ClassUnicode::new(ranges)
3073	}
3074
3075	fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3076	let ranges: Vec<ClassBytesRange> =
3077	ranges.iter().map(\|&(s, e)\| ClassBytesRange::new(s, e)).collect();
3078	ClassBytes::new(ranges)
3079	}
3080
3081	fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3082	cls.iter().map(\|x\| (x.start(), x.end())).collect()
3083	}
3084
3085	#[cfg(feature = "unicode-case")]
3086	fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3087	let mut cls_ = cls.clone();
3088	cls_.case_fold_simple();
3089	cls_
3090	}
3091
3092	fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3093	let mut cls_ = cls1.clone();
3094	cls_.union(cls2);
3095	cls_
3096	}
3097
3098	fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3099	let mut cls_ = cls1.clone();
3100	cls_.intersect(cls2);
3101	cls_
3102	}
3103
3104	fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3105	let mut cls_ = cls1.clone();
3106	cls_.difference(cls2);
3107	cls_
3108	}
3109
3110	fn usymdifference(
3111	cls1: &ClassUnicode,
3112	cls2: &ClassUnicode,
3113	) -> ClassUnicode {
3114	let mut cls_ = cls1.clone();
3115	cls_.symmetric_difference(cls2);
3116	cls_
3117	}
3118
3119	fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3120	let mut cls_ = cls.clone();
3121	cls_.negate();
3122	cls_
3123	}
3124
3125	fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3126	cls.iter().map(\|x\| (x.start(), x.end())).collect()
3127	}
3128
3129	fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3130	let mut cls_ = cls.clone();
3131	cls_.case_fold_simple();
3132	cls_
3133	}
3134
3135	fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3136	let mut cls_ = cls1.clone();
3137	cls_.union(cls2);
3138	cls_
3139	}
3140
3141	fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3142	let mut cls_ = cls1.clone();
3143	cls_.intersect(cls2);
3144	cls_
3145	}
3146
3147	fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3148	let mut cls_ = cls1.clone();
3149	cls_.difference(cls2);
3150	cls_
3151	}
3152
3153	fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3154	let mut cls_ = cls1.clone();
3155	cls_.symmetric_difference(cls2);
3156	cls_
3157	}
3158
3159	fn bnegate(cls: &ClassBytes) -> ClassBytes {
3160	let mut cls_ = cls.clone();
3161	cls_.negate();
3162	cls_
3163	}
3164
3165	#[test]
3166	fn class_range_canonical_unicode() {
3167	let range = ClassUnicodeRange::new('`\u{00FF}`', '`\0`');
3168	assert_eq!('`\0`', range.start());
3169	assert_eq!('`\u{00FF}`', range.end());
3170	}
3171
3172	#[test]
3173	fn class_range_canonical_bytes() {
3174	let range = ClassBytesRange::new(b'`\xFF`', b'`\0`');
3175	assert_eq!(b'`\0`', range.start());
3176	assert_eq!(b'`\xFF`', range.end());
3177	}
3178
3179	#[test]
3180	fn class_canonicalize_unicode() {
3181	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3182	let expected = vec![('a', 'c'), ('x', 'z')];
3183	assert_eq!(expected, uranges(&cls));
3184
3185	let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3186	let expected = vec![('a', 'c'), ('x', 'z')];
3187	assert_eq!(expected, uranges(&cls));
3188
3189	let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3190	let expected = vec![('w', 'z')];
3191	assert_eq!(expected, uranges(&cls));
3192
3193	let cls = uclass(&[
3194	('c', 'f'),
3195	('a', 'g'),
3196	('d', 'j'),
3197	('a', 'c'),
3198	('m', 'p'),
3199	('l', 's'),
3200	]);
3201	let expected = vec![('a', 'j'), ('l', 's')];
3202	assert_eq!(expected, uranges(&cls));
3203
3204	let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3205	let expected = vec![('u', 'z')];
3206	assert_eq!(expected, uranges(&cls));
3207
3208	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`'), ('`\x00`', '`\u{10FFFF}`')]);
3209	let expected = vec![('`\x00`', '`\u{10FFFF}`')];
3210	assert_eq!(expected, uranges(&cls));
3211
3212	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3213	let expected = vec![('a', 'b')];
3214	assert_eq!(expected, uranges(&cls));
3215	}
3216
3217	#[test]
3218	fn class_canonicalize_bytes() {
3219	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3220	let expected = vec![(b'a', b'c'), (b'x', b'z')];
3221	assert_eq!(expected, branges(&cls));
3222
3223	let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3224	let expected = vec![(b'a', b'c'), (b'x', b'z')];
3225	assert_eq!(expected, branges(&cls));
3226
3227	let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3228	let expected = vec![(b'w', b'z')];
3229	assert_eq!(expected, branges(&cls));
3230
3231	let cls = bclass(&[
3232	(b'c', b'f'),
3233	(b'a', b'g'),
3234	(b'd', b'j'),
3235	(b'a', b'c'),
3236	(b'm', b'p'),
3237	(b'l', b's'),
3238	]);
3239	let expected = vec![(b'a', b'j'), (b'l', b's')];
3240	assert_eq!(expected, branges(&cls));
3241
3242	let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3243	let expected = vec![(b'u', b'z')];
3244	assert_eq!(expected, branges(&cls));
3245
3246	let cls = bclass(&[(b'`\x00`', b'`\xFF`'), (b'`\x00`', b'`\xFF`')]);
3247	let expected = vec![(b'`\x00`', b'`\xFF`')];
3248	assert_eq!(expected, branges(&cls));
3249
3250	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3251	let expected = vec![(b'a', b'b')];
3252	assert_eq!(expected, branges(&cls));
3253	}
3254
3255	#[test]
3256	#[cfg(feature = "unicode-case")]
3257	fn class_case_fold_unicode() {
3258	let cls = uclass(&[
3259	('C', 'F'),
3260	('A', 'G'),
3261	('D', 'J'),
3262	('A', 'C'),
3263	('M', 'P'),
3264	('L', 'S'),
3265	('c', 'f'),
3266	]);
3267	let expected = uclass(&[
3268	('A', 'J'),
3269	('L', 'S'),
3270	('a', 'j'),
3271	('l', 's'),
3272	('`\u{17F}`', '`\u{17F}`'),
3273	]);
3274	assert_eq!(expected, ucasefold(&cls));
3275
3276	let cls = uclass(&[('A', 'Z')]);
3277	let expected = uclass(&[
3278	('A', 'Z'),
3279	('a', 'z'),
3280	('`\u{17F}`', '`\u{17F}`'),
3281	('`\u{212A}`', '`\u{212A}`'),
3282	]);
3283	assert_eq!(expected, ucasefold(&cls));
3284
3285	let cls = uclass(&[('a', 'z')]);
3286	let expected = uclass(&[
3287	('A', 'Z'),
3288	('a', 'z'),
3289	('`\u{17F}`', '`\u{17F}`'),
3290	('`\u{212A}`', '`\u{212A}`'),
3291	]);
3292	assert_eq!(expected, ucasefold(&cls));
3293
3294	let cls = uclass(&[('A', 'A'), ('_', '_')]);
3295	let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3296	assert_eq!(expected, ucasefold(&cls));
3297
3298	let cls = uclass(&[('A', 'A'), ('=', '=')]);
3299	let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3300	assert_eq!(expected, ucasefold(&cls));
3301
3302	let cls = uclass(&[('`\x00`', '`\x10`')]);
3303	assert_eq!(cls, ucasefold(&cls));
3304
3305	let cls = uclass(&[('k', 'k')]);
3306	let expected =
3307	uclass(&[('K', 'K'), ('k', 'k'), ('`\u{212A}`', '`\u{212A}`')]);
3308	assert_eq!(expected, ucasefold(&cls));
3309
3310	let cls = uclass(&[('@', '@')]);
3311	assert_eq!(cls, ucasefold(&cls));
3312	}
3313
3314	#[test]
3315	#[cfg(not(feature = "unicode-case"))]
3316	fn class_case_fold_unicode_disabled() {
3317	let mut cls = uclass(&[
3318	('C', 'F'),
3319	('A', 'G'),
3320	('D', 'J'),
3321	('A', 'C'),
3322	('M', 'P'),
3323	('L', 'S'),
3324	('c', 'f'),
3325	]);
3326	assert!(cls.try_case_fold_simple().is_err());
3327	}
3328
3329	#[test]
3330	#[should_panic]
3331	#[cfg(not(feature = "unicode-case"))]
3332	fn class_case_fold_unicode_disabled_panics() {
3333	let mut cls = uclass(&[
3334	('C', 'F'),
3335	('A', 'G'),
3336	('D', 'J'),
3337	('A', 'C'),
3338	('M', 'P'),
3339	('L', 'S'),
3340	('c', 'f'),
3341	]);
3342	cls.case_fold_simple();
3343	}
3344
3345	#[test]
3346	fn class_case_fold_bytes() {
3347	let cls = bclass(&[
3348	(b'C', b'F'),
3349	(b'A', b'G'),
3350	(b'D', b'J'),
3351	(b'A', b'C'),
3352	(b'M', b'P'),
3353	(b'L', b'S'),
3354	(b'c', b'f'),
3355	]);
3356	let expected =
3357	bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3358	assert_eq!(expected, bcasefold(&cls));
3359
3360	let cls = bclass(&[(b'A', b'Z')]);
3361	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3362	assert_eq!(expected, bcasefold(&cls));
3363
3364	let cls = bclass(&[(b'a', b'z')]);
3365	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3366	assert_eq!(expected, bcasefold(&cls));
3367
3368	let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3369	let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3370	assert_eq!(expected, bcasefold(&cls));
3371
3372	let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3373	let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3374	assert_eq!(expected, bcasefold(&cls));
3375
3376	let cls = bclass(&[(b'`\x00`', b'`\x10`')]);
3377	assert_eq!(cls, bcasefold(&cls));
3378
3379	let cls = bclass(&[(b'k', b'k')]);
3380	let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3381	assert_eq!(expected, bcasefold(&cls));
3382
3383	let cls = bclass(&[(b'@', b'@')]);
3384	assert_eq!(cls, bcasefold(&cls));
3385	}
3386
3387	#[test]
3388	fn class_negate_unicode() {
3389	let cls = uclass(&[('a', 'a')]);
3390	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x62`', '`\u{10FFFF}`')]);
3391	assert_eq!(expected, unegate(&cls));
3392
3393	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3394	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x63`', '`\u{10FFFF}`')]);
3395	assert_eq!(expected, unegate(&cls));
3396
3397	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3398	let expected = uclass(&[
3399	('`\x00`', '`\x60`'),
3400	('`\x64`', '`\x77`'),
3401	('`\x7B`', '`\u{10FFFF}`'),
3402	]);
3403	assert_eq!(expected, unegate(&cls));
3404
3405	let cls = uclass(&[('`\x00`', 'a')]);
3406	let expected = uclass(&[('`\x62`', '`\u{10FFFF}`')]);
3407	assert_eq!(expected, unegate(&cls));
3408
3409	let cls = uclass(&[('a', '`\u{10FFFF}`')]);
3410	let expected = uclass(&[('`\x00`', '`\x60`')]);
3411	assert_eq!(expected, unegate(&cls));
3412
3413	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
3414	let expected = uclass(&[]);
3415	assert_eq!(expected, unegate(&cls));
3416
3417	let cls = uclass(&[]);
3418	let expected = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
3419	assert_eq!(expected, unegate(&cls));
3420
3421	let cls =
3422	uclass(&[('`\x00`', '`\u{10FFFD}`'), ('`\u{10FFFF}`', '`\u{10FFFF}`')]);
3423	let expected = uclass(&[('`\u{10FFFE}`', '`\u{10FFFE}`')]);
3424	assert_eq!(expected, unegate(&cls));
3425
3426	let cls = uclass(&[('`\x00`', '`\u{D7FF}`')]);
3427	let expected = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
3428	assert_eq!(expected, unegate(&cls));
3429
3430	let cls = uclass(&[('`\x00`', '`\u{D7FE}`')]);
3431	let expected = uclass(&[('`\u{D7FF}`', '`\u{10FFFF}`')]);
3432	assert_eq!(expected, unegate(&cls));
3433
3434	let cls = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
3435	let expected = uclass(&[('`\x00`', '`\u{D7FF}`')]);
3436	assert_eq!(expected, unegate(&cls));
3437
3438	let cls = uclass(&[('`\u{E001}`', '`\u{10FFFF}`')]);
3439	let expected = uclass(&[('`\x00`', '`\u{E000}`')]);
3440	assert_eq!(expected, unegate(&cls));
3441	}
3442
3443	#[test]
3444	fn class_negate_bytes() {
3445	let cls = bclass(&[(b'a', b'a')]);
3446	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x62`', b'`\xFF`')]);
3447	assert_eq!(expected, bnegate(&cls));
3448
3449	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3450	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x63`', b'`\xFF`')]);
3451	assert_eq!(expected, bnegate(&cls));
3452
3453	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3454	let expected = bclass(&[
3455	(b'`\x00`', b'`\x60`'),
3456	(b'`\x64`', b'`\x77`'),
3457	(b'`\x7B`', b'`\xFF`'),
3458	]);
3459	assert_eq!(expected, bnegate(&cls));
3460
3461	let cls = bclass(&[(b'`\x00`', b'a')]);
3462	let expected = bclass(&[(b'`\x62`', b'`\xFF`')]);
3463	assert_eq!(expected, bnegate(&cls));
3464
3465	let cls = bclass(&[(b'a', b'`\xFF`')]);
3466	let expected = bclass(&[(b'`\x00`', b'`\x60`')]);
3467	assert_eq!(expected, bnegate(&cls));
3468
3469	let cls = bclass(&[(b'`\x00`', b'`\xFF`')]);
3470	let expected = bclass(&[]);
3471	assert_eq!(expected, bnegate(&cls));
3472
3473	let cls = bclass(&[]);
3474	let expected = bclass(&[(b'`\x00`', b'`\xFF`')]);
3475	assert_eq!(expected, bnegate(&cls));
3476
3477	let cls = bclass(&[(b'`\x00`', b'`\xFD`'), (b'`\xFF`', b'`\xFF`')]);
3478	let expected = bclass(&[(b'`\xFE`', b'`\xFE`')]);
3479	assert_eq!(expected, bnegate(&cls));
3480	}
3481
3482	#[test]
3483	fn class_union_unicode() {
3484	let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3485	let cls2 = uclass(&[('a', 'z')]);
3486	let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3487	assert_eq!(expected, uunion(&cls1, &cls2));
3488	}
3489
3490	#[test]
3491	fn class_union_bytes() {
3492	let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3493	let cls2 = bclass(&[(b'a', b'z')]);
3494	let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3495	assert_eq!(expected, bunion(&cls1, &cls2));
3496	}
3497
3498	#[test]
3499	fn class_intersect_unicode() {
3500	let cls1 = uclass(&[]);
3501	let cls2 = uclass(&[('a', 'a')]);
3502	let expected = uclass(&[]);
3503	assert_eq!(expected, uintersect(&cls1, &cls2));
3504
3505	let cls1 = uclass(&[('a', 'a')]);
3506	let cls2 = uclass(&[('a', 'a')]);
3507	let expected = uclass(&[('a', 'a')]);
3508	assert_eq!(expected, uintersect(&cls1, &cls2));
3509
3510	let cls1 = uclass(&[('a', 'a')]);
3511	let cls2 = uclass(&[('b', 'b')]);
3512	let expected = uclass(&[]);
3513	assert_eq!(expected, uintersect(&cls1, &cls2));
3514
3515	let cls1 = uclass(&[('a', 'a')]);
3516	let cls2 = uclass(&[('a', 'c')]);
3517	let expected = uclass(&[('a', 'a')]);
3518	assert_eq!(expected, uintersect(&cls1, &cls2));
3519
3520	let cls1 = uclass(&[('a', 'b')]);
3521	let cls2 = uclass(&[('a', 'c')]);
3522	let expected = uclass(&[('a', 'b')]);
3523	assert_eq!(expected, uintersect(&cls1, &cls2));
3524
3525	let cls1 = uclass(&[('a', 'b')]);
3526	let cls2 = uclass(&[('b', 'c')]);
3527	let expected = uclass(&[('b', 'b')]);
3528	assert_eq!(expected, uintersect(&cls1, &cls2));
3529
3530	let cls1 = uclass(&[('a', 'b')]);
3531	let cls2 = uclass(&[('c', 'd')]);
3532	let expected = uclass(&[]);
3533	assert_eq!(expected, uintersect(&cls1, &cls2));
3534
3535	let cls1 = uclass(&[('b', 'c')]);
3536	let cls2 = uclass(&[('a', 'd')]);
3537	let expected = uclass(&[('b', 'c')]);
3538	assert_eq!(expected, uintersect(&cls1, &cls2));
3539
3540	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3541	let cls2 = uclass(&[('a', 'h')]);
3542	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3543	assert_eq!(expected, uintersect(&cls1, &cls2));
3544
3545	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3546	let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3547	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3548	assert_eq!(expected, uintersect(&cls1, &cls2));
3549
3550	let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3551	let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3552	let expected = uclass(&[]);
3553	assert_eq!(expected, uintersect(&cls1, &cls2));
3554
3555	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3556	let cls2 = uclass(&[('h', 'h')]);
3557	let expected = uclass(&[('h', 'h')]);
3558	assert_eq!(expected, uintersect(&cls1, &cls2));
3559
3560	let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3561	let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3562	let expected = uclass(&[]);
3563	assert_eq!(expected, uintersect(&cls1, &cls2));
3564
3565	let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3566	let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3567	let expected = uclass(&[('b', 'f')]);
3568	assert_eq!(expected, uintersect(&cls1, &cls2));
3569	}
3570
3571	#[test]
3572	fn class_intersect_bytes() {
3573	let cls1 = bclass(&[]);
3574	let cls2 = bclass(&[(b'a', b'a')]);
3575	let expected = bclass(&[]);
3576	assert_eq!(expected, bintersect(&cls1, &cls2));
3577
3578	let cls1 = bclass(&[(b'a', b'a')]);
3579	let cls2 = bclass(&[(b'a', b'a')]);
3580	let expected = bclass(&[(b'a', b'a')]);
3581	assert_eq!(expected, bintersect(&cls1, &cls2));
3582
3583	let cls1 = bclass(&[(b'a', b'a')]);
3584	let cls2 = bclass(&[(b'b', b'b')]);
3585	let expected = bclass(&[]);
3586	assert_eq!(expected, bintersect(&cls1, &cls2));
3587
3588	let cls1 = bclass(&[(b'a', b'a')]);
3589	let cls2 = bclass(&[(b'a', b'c')]);
3590	let expected = bclass(&[(b'a', b'a')]);
3591	assert_eq!(expected, bintersect(&cls1, &cls2));
3592
3593	let cls1 = bclass(&[(b'a', b'b')]);
3594	let cls2 = bclass(&[(b'a', b'c')]);
3595	let expected = bclass(&[(b'a', b'b')]);
3596	assert_eq!(expected, bintersect(&cls1, &cls2));
3597
3598	let cls1 = bclass(&[(b'a', b'b')]);
3599	let cls2 = bclass(&[(b'b', b'c')]);
3600	let expected = bclass(&[(b'b', b'b')]);
3601	assert_eq!(expected, bintersect(&cls1, &cls2));
3602
3603	let cls1 = bclass(&[(b'a', b'b')]);
3604	let cls2 = bclass(&[(b'c', b'd')]);
3605	let expected = bclass(&[]);
3606	assert_eq!(expected, bintersect(&cls1, &cls2));
3607
3608	let cls1 = bclass(&[(b'b', b'c')]);
3609	let cls2 = bclass(&[(b'a', b'd')]);
3610	let expected = bclass(&[(b'b', b'c')]);
3611	assert_eq!(expected, bintersect(&cls1, &cls2));
3612
3613	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3614	let cls2 = bclass(&[(b'a', b'h')]);
3615	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3616	assert_eq!(expected, bintersect(&cls1, &cls2));
3617
3618	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3619	let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3620	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3621	assert_eq!(expected, bintersect(&cls1, &cls2));
3622
3623	let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3624	let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3625	let expected = bclass(&[]);
3626	assert_eq!(expected, bintersect(&cls1, &cls2));
3627
3628	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3629	let cls2 = bclass(&[(b'h', b'h')]);
3630	let expected = bclass(&[(b'h', b'h')]);
3631	assert_eq!(expected, bintersect(&cls1, &cls2));
3632
3633	let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3634	let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3635	let expected = bclass(&[]);
3636	assert_eq!(expected, bintersect(&cls1, &cls2));
3637
3638	let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3639	let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3640	let expected = bclass(&[(b'b', b'f')]);
3641	assert_eq!(expected, bintersect(&cls1, &cls2));
3642	}
3643
3644	#[test]
3645	fn class_difference_unicode() {
3646	let cls1 = uclass(&[('a', 'a')]);
3647	let cls2 = uclass(&[('a', 'a')]);
3648	let expected = uclass(&[]);
3649	assert_eq!(expected, udifference(&cls1, &cls2));
3650
3651	let cls1 = uclass(&[('a', 'a')]);
3652	let cls2 = uclass(&[]);
3653	let expected = uclass(&[('a', 'a')]);
3654	assert_eq!(expected, udifference(&cls1, &cls2));
3655
3656	let cls1 = uclass(&[]);
3657	let cls2 = uclass(&[('a', 'a')]);
3658	let expected = uclass(&[]);
3659	assert_eq!(expected, udifference(&cls1, &cls2));
3660
3661	let cls1 = uclass(&[('a', 'z')]);
3662	let cls2 = uclass(&[('a', 'a')]);
3663	let expected = uclass(&[('b', 'z')]);
3664	assert_eq!(expected, udifference(&cls1, &cls2));
3665
3666	let cls1 = uclass(&[('a', 'z')]);
3667	let cls2 = uclass(&[('z', 'z')]);
3668	let expected = uclass(&[('a', 'y')]);
3669	assert_eq!(expected, udifference(&cls1, &cls2));
3670
3671	let cls1 = uclass(&[('a', 'z')]);
3672	let cls2 = uclass(&[('m', 'm')]);
3673	let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3674	assert_eq!(expected, udifference(&cls1, &cls2));
3675
3676	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3677	let cls2 = uclass(&[('a', 'z')]);
3678	let expected = uclass(&[]);
3679	assert_eq!(expected, udifference(&cls1, &cls2));
3680
3681	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3682	let cls2 = uclass(&[('d', 'v')]);
3683	let expected = uclass(&[('a', 'c')]);
3684	assert_eq!(expected, udifference(&cls1, &cls2));
3685
3686	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3687	let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3688	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3689	assert_eq!(expected, udifference(&cls1, &cls2));
3690
3691	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3692	let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3693	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3694	assert_eq!(expected, udifference(&cls1, &cls2));
3695
3696	let cls1 = uclass(&[('x', 'z')]);
3697	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3698	let expected = uclass(&[('x', 'z')]);
3699	assert_eq!(expected, udifference(&cls1, &cls2));
3700
3701	let cls1 = uclass(&[('a', 'z')]);
3702	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3703	let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3704	assert_eq!(expected, udifference(&cls1, &cls2));
3705	}
3706
3707	#[test]
3708	fn class_difference_bytes() {
3709	let cls1 = bclass(&[(b'a', b'a')]);
3710	let cls2 = bclass(&[(b'a', b'a')]);
3711	let expected = bclass(&[]);
3712	assert_eq!(expected, bdifference(&cls1, &cls2));
3713
3714	let cls1 = bclass(&[(b'a', b'a')]);
3715	let cls2 = bclass(&[]);
3716	let expected = bclass(&[(b'a', b'a')]);
3717	assert_eq!(expected, bdifference(&cls1, &cls2));
3718
3719	let cls1 = bclass(&[]);
3720	let cls2 = bclass(&[(b'a', b'a')]);
3721	let expected = bclass(&[]);
3722	assert_eq!(expected, bdifference(&cls1, &cls2));
3723
3724	let cls1 = bclass(&[(b'a', b'z')]);
3725	let cls2 = bclass(&[(b'a', b'a')]);
3726	let expected = bclass(&[(b'b', b'z')]);
3727	assert_eq!(expected, bdifference(&cls1, &cls2));
3728
3729	let cls1 = bclass(&[(b'a', b'z')]);
3730	let cls2 = bclass(&[(b'z', b'z')]);
3731	let expected = bclass(&[(b'a', b'y')]);
3732	assert_eq!(expected, bdifference(&cls1, &cls2));
3733
3734	let cls1 = bclass(&[(b'a', b'z')]);
3735	let cls2 = bclass(&[(b'm', b'm')]);
3736	let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3737	assert_eq!(expected, bdifference(&cls1, &cls2));
3738
3739	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3740	let cls2 = bclass(&[(b'a', b'z')]);
3741	let expected = bclass(&[]);
3742	assert_eq!(expected, bdifference(&cls1, &cls2));
3743
3744	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3745	let cls2 = bclass(&[(b'd', b'v')]);
3746	let expected = bclass(&[(b'a', b'c')]);
3747	assert_eq!(expected, bdifference(&cls1, &cls2));
3748
3749	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3750	let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3751	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3752	assert_eq!(expected, bdifference(&cls1, &cls2));
3753
3754	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3755	let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3756	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3757	assert_eq!(expected, bdifference(&cls1, &cls2));
3758
3759	let cls1 = bclass(&[(b'x', b'z')]);
3760	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3761	let expected = bclass(&[(b'x', b'z')]);
3762	assert_eq!(expected, bdifference(&cls1, &cls2));
3763
3764	let cls1 = bclass(&[(b'a', b'z')]);
3765	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3766	let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3767	assert_eq!(expected, bdifference(&cls1, &cls2));
3768	}
3769
3770	#[test]
3771	fn class_symmetric_difference_unicode() {
3772	let cls1 = uclass(&[('a', 'm')]);
3773	let cls2 = uclass(&[('g', 't')]);
3774	let expected = uclass(&[('a', 'f'), ('n', 't')]);
3775	assert_eq!(expected, usymdifference(&cls1, &cls2));
3776	}
3777
3778	#[test]
3779	fn class_symmetric_difference_bytes() {
3780	let cls1 = bclass(&[(b'a', b'm')]);
3781	let cls2 = bclass(&[(b'g', b't')]);
3782	let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3783	assert_eq!(expected, bsymdifference(&cls1, &cls2));
3784	}
3785
3786	// We use a thread with an explicit stack size to test that our destructor
3787	// for Hir can handle arbitrarily sized expressions in constant stack
3788	// space. In case we run on a platform without threads (WASM?), we limit
3789	// this test to Windows/Unix.
3790	#[test]
3791	#[cfg(any(unix, windows))]
3792	fn no_stack_overflow_on_drop() {
3793	use std::thread;
3794
3795	let run = \|\| {
3796	let mut expr = Hir::empty();
3797	for _ in `0`..`100` {
3798	expr = Hir::capture(Capture {
3799	index: `1`,
3800	name: None,
3801	sub: Box::new(expr),
3802	});
3803	expr = Hir::repetition(Repetition {
3804	min: `0`,
3805	max: Some(`1`),
3806	greedy: `true`,
3807	sub: Box::new(expr),
3808	});
3809
3810	expr = Hir {
3811	kind: HirKind::Concat(vec![expr]),
3812	props: Properties::empty(),
3813	};
3814	expr = Hir {
3815	kind: HirKind::Alternation(vec![expr]),
3816	props: Properties::empty(),
3817	};
3818	}
3819	assert!(!matches!(*expr.kind(), HirKind::Empty));
3820	};
3821
3822	// We run our test on a thread with a small stack size so we can
3823	// force the issue more easily.
3824	//
3825	// NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3826	// for context on the specific stack size chosen here.
3827	thread::Builder::new()
3828	.stack_size(`16` << `10`)
3829	.spawn(run)
3830	.unwrap()
3831	.join()
3832	.unwrap();
3833	}
3834
3835	#[test]
3836	fn look_set_iter() {
3837	let set = LookSet::empty();
3838	assert_eq!(`0`, set.iter().count());
3839
3840	let set = LookSet::full();
3841	assert_eq!(`18`, set.iter().count());
3842
3843	let set =
3844	LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3845	assert_eq!(`2`, set.iter().count());
3846
3847	let set = LookSet::empty().insert(Look::StartLF);
3848	assert_eq!(`1`, set.iter().count());
3849
3850	let set = LookSet::empty().insert(Look::WordAsciiNegate);
3851	assert_eq!(`1`, set.iter().count());
3852	}
3853
3854	#[test]
3855	fn look_set_debug() {
3856	let res = format!("{:?}", LookSet::empty());
3857	assert_eq!("∅", res);
3858	let res = format!("{:?}", LookSet::full());
3859	assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3860	}
3861	}
3862