mod.rs source code [crates/regex-syntax-0.7.2/src/hir/mod.rs]

1	/!*
2	Defines a high-level intermediate (HIR) representation for regular expressions.
3
4	The HIR is represented by the [`Hir`] type, and it principally constructed via
5	[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6	may use the smart constructors defined on `Hir` to build their own by hand. The
7	smart constructors simultaneously simplify and "optimize" the HIR, and are also
8	the same routines used by translation.
9
10	Most regex engines only have an HIR like this, and usually construct it
11	directly from the concrete syntax. This crate however first parses the
12	concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13	as mentioned above. It's done this way to facilitate better error reporting,
14	and to have a structured representation of a regex that faithfully represents
15	its concrete syntax. Namely, while an `Hir` value can be converted back to an
16	equivalent regex pattern string, it is unlikely to look like the original due
17	to its simplified structure.
18	*/
19
20	use core::{char, cmp};
21
22	use alloc::{
23	boxed::Box,
24	format,
25	string::{String, ToString},
26	vec,
27	vec::Vec,
28	};
29
30	use crate::{
31	ast::Span,
32	hir::interval::{Interval, IntervalSet, IntervalSetIter},
33	unicode,
34	};
35
36	pub use crate::{
37	hir::visitor::{visit, Visitor},
38	unicode::CaseFoldError,
39	};
40
41	mod interval;
42	pub mod literal;
43	pub mod print;
44	pub mod translate;
45	mod visitor;
46
47	/// An error that can occur while translating an `Ast` to a `Hir`.
48	#[derive(Clone, Debug, Eq, PartialEq)]
49	pub struct Error {
50	/// The kind of error.
51	kind: ErrorKind,
52	/// The original pattern that the translator's Ast was parsed from. Every
53	/// span in an error is a valid range into this string.
54	pattern: String,
55	/// The span of this error, derived from the Ast given to the translator.
56	span: Span,
57	}
58
59	impl Error {
60	/// Return the type of this error.
61	pub fn kind(&self) -> &ErrorKind {
62	&self.kind
63	}
64
65	/// The original pattern string in which this error occurred.
66	///
67	/// Every span reported by this error is reported in terms of this string.
68	pub fn pattern(&self) -> &str {
69	&self.pattern
70	}
71
72	/// Return the span at which this error occurred.
73	pub fn span(&self) -> &Span {
74	&self.span
75	}
76	}
77
78	/// The type of an error that occurred while building an `Hir`.
79	///
80	/// This error type is marked as `non_exhaustive`. This means that adding a
81	/// new variant is not considered a breaking change.
82	#[non_exhaustive]
83	#[derive(Clone, Debug, Eq, PartialEq)]
84	pub enum ErrorKind {
85	/// This error occurs when a Unicode feature is used when Unicode
86	/// support is disabled. For example `(?-u:\pL)` would trigger this error.
87	UnicodeNotAllowed,
88	/// This error occurs when translating a pattern that could match a byte
89	/// sequence that isn't UTF-8 and `utf8` was enabled.
90	InvalidUtf8,
91	/// This occurs when an unrecognized Unicode property name could not
92	/// be found.
93	UnicodePropertyNotFound,
94	/// This occurs when an unrecognized Unicode property value could not
95	/// be found.
96	UnicodePropertyValueNotFound,
97	/// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
98	/// `\d`) could not be found. This can occur when the `unicode-perl`
99	/// crate feature is not enabled.
100	UnicodePerlClassNotFound,
101	/// This occurs when the Unicode simple case mapping tables are not
102	/// available, and the regular expression required Unicode aware case
103	/// insensitivity.
104	UnicodeCaseUnavailable,
105	}
106
107	#[cfg(feature = "std")]
108	impl std::error::Error for Error {}
109
110	impl core::fmt::Display for Error {
111	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
112	crate::error::Formatter::from(self).fmt(f)
113	}
114	}
115
116	impl core::fmt::Display for ErrorKind {
117	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
118	use self::ErrorKind::*;
119
120	let msg: &str = match *self {
121	UnicodeNotAllowed => "Unicode not allowed here",
122	InvalidUtf8 => "pattern can match invalid UTF-8",
123	UnicodePropertyNotFound => "Unicode property not found",
124	UnicodePropertyValueNotFound => "Unicode property value not found",
125	UnicodePerlClassNotFound => {
126	"Unicode-aware Perl class not found \
127	(make sure the unicode-perl feature is enabled)"
128	}
129	UnicodeCaseUnavailable => {
130	"Unicode-aware case insensitivity matching is not available \
131	(make sure the unicode-case feature is enabled)"
132	}
133	};
134	f.write_str(data:msg)
135	}
136	}
137
138	/// A high-level intermediate representation (HIR) for a regular expression.
139	///
140	/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
141	/// An `HirKind` indicates what kind of regular expression it is (a literal,
142	/// a repetition, a look-around assertion, etc.), where as a `Properties`
143	/// describes various facts about the regular expression. For example, whether
144	/// it matches UTF-8 or if it matches the empty string.
145	///
146	/// The HIR of a regular expression represents an intermediate step between
147	/// its abstract syntax (a structured description of the concrete syntax) and
148	/// an actual regex matcher. The purpose of HIR is to make regular expressions
149	/// easier to analyze. In particular, the AST is much more complex than the
150	/// HIR. For example, while an AST supports arbitrarily nested character
151	/// classes, the HIR will flatten all nested classes into a single set. The HIR
152	/// will also "compile away" every flag present in the concrete syntax. For
153	/// example, users of HIR expressions never need to worry about case folding;
154	/// it is handled automatically by the translator (e.g., by translating
155	/// `(?i:A)` to `[aA]`).
156	///
157	/// The specific type of an HIR expression can be accessed via its `kind`
158	/// or `into_kind` methods. This extra level of indirection exists for two
159	/// reasons:
160	///
161	/// 1. Construction of an HIR expression must* use the constructor methods on*
162	/// this `Hir` type instead of building the `HirKind` values directly. This
163	/// permits construction to enforce invariants like "concatenations always
164	/// consist of two or more sub-expressions."
165	/// 2. Every HIR expression contains attributes that are defined inductively,
166	/// and can be computed cheaply during the construction process. For example,
167	/// one such attribute is whether the expression must match at the beginning of
168	/// the haystack.
169	///
170	/// In particular, if you have an `HirKind` value, then there is intentionally
171	/// no way to build an `Hir` value from it. You instead need to do case
172	/// analysis on the `HirKind` value and build the `Hir` value using its smart
173	/// constructors.
174	///
175	/// # UTF-8
176	///
177	/// If the HIR was produced by a translator with
178	/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
179	/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
180	/// matches.
181	///
182	/// For empty matches, those can occur at any position. It is the
183	/// repsonsibility of the regex engine to determine whether empty matches are
184	/// permitted between the code units of a single codepoint.
185	///
186	/// # Stack space
187	///
188	/// This type defines its own destructor that uses constant stack space and
189	/// heap space proportional to the size of the HIR.
190	///
191	/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
192	/// expression pattern string, and uses constant stack space and heap space
193	/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
194	/// be _semantically_ equivalent to the original concrete syntax, but it may
195	/// look very different. (And potentially not practically readable by a human.)
196	///
197	/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
198	/// stack space. The implementation will also suppress some details (such as
199	/// the `Properties` inlined into every `Hir` value to make it less noisy).
200	#[derive(Clone, Eq, PartialEq)]
201	pub struct Hir {
202	/// The underlying HIR kind.
203	kind: HirKind,
204	/// Analysis info about this HIR, computed during construction.
205	props: Properties,
206	}
207
208	/// Methods for accessing the underlying `HirKind` and `Properties`.
209	impl Hir {
210	/// Returns a reference to the underlying HIR kind.
211	pub fn kind(&self) -> &HirKind {
212	&self.kind
213	}
214
215	/// Consumes ownership of this HIR expression and returns its underlying
216	/// `HirKind`.
217	pub fn into_kind(mut self) -> HirKind {
218	core::mem::replace(&mut self.kind, HirKind::Empty)
219	}
220
221	/// Returns the properties computed for this `Hir`.
222	pub fn properties(&self) -> &Properties {
223	&self.props
224	}
225
226	/// Splits this HIR into its constituent parts.
227	///
228	/// This is useful because `let Hir { kind, props } = hir;` does not work
229	/// because of `Hir`'s custom `Drop` implementation.
230	fn into_parts(mut self) -> (HirKind, Properties) {
231	(
232	core::mem::replace(&mut self.kind, HirKind::Empty),
233	core::mem::replace(&mut self.props, Properties::empty()),
234	)
235	}
236	}
237
238	/// Smart constructors for HIR values.
239	///
240	/// These constructors are called "smart" because they do inductive work or
241	/// simplifications. For example, calling `Hir::repetition` with a repetition
242	/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
243	/// since it is equivalent to an empty regex. Another example is calling
244	/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
245	/// just get back the original `expr` since it's precisely equivalent.
246	///
247	/// Smart constructors enable maintaining invariants about the HIR data type
248	/// while also simulanteously keeping the representation as simple as possible.
249	impl Hir {
250	/// Returns an empty HIR expression.
251	///
252	/// An empty HIR expression always matches, including the empty string.
253	#[inline]
254	pub fn empty() -> Hir {
255	let props = Properties::empty();
256	Hir { kind: HirKind::Empty, props }
257	}
258
259	/// Returns an HIR expression that can never match anything. That is,
260	/// the size of the set of strings in the language described by the HIR
261	/// returned is `0`.
262	///
263	/// This is distinct from [`Hir::empty`] in that the empty string matches
264	/// the HIR returned by `Hir::empty`. That is, the set of strings in the
265	/// language describe described by `Hir::empty` is non-empty.
266	///
267	/// Note that currently, the HIR returned uses an empty character class to
268	/// indicate that nothing can match. An equivalent expression that cannot
269	/// match is an empty alternation, but all such "fail" expressions are
270	/// normalized (via smart constructors) to empty character classes. This is
271	/// because empty character classes can be spelled in the concrete syntax
272	/// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
273	/// empty alternations cannot.
274	#[inline]
275	pub fn fail() -> Hir {
276	let class = Class::Bytes(ClassBytes::empty());
277	let props = Properties::class(&class);
278	// We can't just call Hir::class here because it defers to Hir::fail
279	// in order to canonicalize the Hir value used to represent "cannot
280	// match."
281	Hir { kind: HirKind::Class(class), props }
282	}
283
284	/// Creates a literal HIR expression.
285	///
286	/// This accepts anything that can be converted into a `Box<[u8]>`.
287	///
288	/// Note that there is no mechanism for storing a `char` or a `Box<str>`
289	/// in an HIR. Everything is "just bytes." Whether a `Literal` (or
290	/// any HIR node) matches valid UTF-8 exclusively can be queried via
291	/// [`Properties::is_utf8`].
292	///
293	/// # Example
294	///
295	/// This example shows that concatenations of `Literal` HIR values will
296	/// automatically get flattened and combined together. So for example, even
297	/// if you concat multiple `Literal` values that are themselves not valid
298	/// UTF-8, they might add up to valid UTF-8. This also demonstrates just
299	/// how "smart" Hir's smart constructors are.
300	///
301	/// ```
302	/// use regex_syntax::hir::{Hir, HirKind, Literal};
303	///
304	/// let literals = vec![
305	/// Hir::literal([`0xE2`]),
306	/// Hir::literal([`0x98`]),
307	/// Hir::literal([`0x83`]),
308	/// ];
309	/// // Each literal, on its own, is invalid UTF-8.
310	/// assert!(literals.iter().all(\|hir\| !hir.properties().is_utf8()));
311	///
312	/// let concat = Hir::concat(literals);
313	/// // But the concatenation is valid UTF-8!
314	/// assert!(concat.properties().is_utf8());
315	///
316	/// // And also notice that the literals have been concatenated into a
317	/// // single `Literal`, to the point where there is no explicit `Concat`!
318	/// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
319	/// assert_eq!(&expected, concat.kind());
320	/// ```
321	#[inline]
322	pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
323	let bytes = lit.into();
324	if bytes.is_empty() {
325	return Hir::empty();
326	}
327
328	let lit = Literal(bytes);
329	let props = Properties::literal(&lit);
330	Hir { kind: HirKind::Literal(lit), props }
331	}
332
333	/// Creates a class HIR expression. The class may either be defined over
334	/// ranges of Unicode codepoints or ranges of raw byte values.
335	///
336	/// Note that an empty class is permitted. An empty class is equivalent to
337	/// `Hir::fail()`.
338	#[inline]
339	pub fn class(class: Class) -> Hir {
340	if class.is_empty() {
341	return Hir::fail();
342	} else if let Some(bytes) = class.literal() {
343	return Hir::literal(bytes);
344	}
345	let props = Properties::class(&class);
346	Hir { kind: HirKind::Class(class), props }
347	}
348
349	/// Creates a look-around assertion HIR expression.
350	#[inline]
351	pub fn look(look: Look) -> Hir {
352	let props = Properties::look(look);
353	Hir { kind: HirKind::Look(look), props }
354	}
355
356	/// Creates a repetition HIR expression.
357	#[inline]
358	pub fn repetition(rep: Repetition) -> Hir {
359	// The regex 'a{0}' is always equivalent to the empty regex. This is
360	// true even when 'a' is an expression that never matches anything
361	// (like '\P{any}').
362	//
363	// Additionally, the regex 'a{1}' is always equivalent to 'a'.
364	if rep.min == `0` && rep.max == Some(`0`) {
365	return Hir::empty();
366	} else if rep.min == `1` && rep.max == Some(`1`) {
367	return *rep.sub;
368	}
369	let props = Properties::repetition(&rep);
370	Hir { kind: HirKind::Repetition(rep), props }
371	}
372
373	/// Creates a capture HIR expression.
374	///
375	/// Note that there is no explicit HIR value for a non-capturing group.
376	/// Since a non-capturing group only exists to override precedence in the
377	/// concrete syntax and since an HIR already does its own grouping based on
378	/// what is parsed, there is no need to explicitly represent non-capturing
379	/// groups in the HIR.
380	#[inline]
381	pub fn capture(capture: Capture) -> Hir {
382	let props = Properties::capture(&capture);
383	Hir { kind: HirKind::Capture(capture), props }
384	}
385
386	/// Returns the concatenation of the given expressions.
387	///
388	/// This attempts to flatten and simplify the concatenation as appropriate.
389	///
390	/// # Example
391	///
392	/// This shows a simple example of basic flattening of both concatenations
393	/// and literals.
394	///
395	/// ```
396	/// use regex_syntax::hir::Hir;
397	///
398	/// let hir = Hir::concat(vec![
399	/// Hir::concat(vec![
400	/// Hir::literal([b'a']),
401	/// Hir::literal([b'b']),
402	/// Hir::literal([b'c']),
403	/// ]),
404	/// Hir::concat(vec![
405	/// Hir::literal([b'x']),
406	/// Hir::literal([b'y']),
407	/// Hir::literal([b'z']),
408	/// ]),
409	/// ]);
410	/// let expected = Hir::literal("abcxyz".as_bytes());
411	/// assert_eq!(expected, hir);
412	/// ```
413	pub fn concat(subs: Vec<Hir>) -> Hir {
414	// We rebuild the concatenation by simplifying it. Would be nice to do
415	// it in place, but that seems a little tricky?
416	let mut new = vec![];
417	// This gobbles up any adjacent literals in a concatenation and smushes
418	// them together. Basically, when we see a literal, we add its bytes
419	// to 'prior_lit', and whenever we see anything else, we first take
420	// any bytes in 'prior_lit' and add it to the 'new' concatenation.
421	let mut prior_lit: Option<Vec<u8>> = None;
422	for sub in subs {
423	let (kind, props) = sub.into_parts();
424	match kind {
425	HirKind::Literal(Literal(bytes)) => {
426	if let Some(ref mut prior_bytes) = prior_lit {
427	prior_bytes.extend_from_slice(&bytes);
428	} else {
429	prior_lit = Some(bytes.to_vec());
430	}
431	}
432	// We also flatten concats that are direct children of another
433	// concat. We only need to do this one level deep since
434	// Hir::concat is the only way to build concatenations, and so
435	// flattening happens inductively.
436	HirKind::Concat(subs2) => {
437	for sub2 in subs2 {
438	let (kind2, props2) = sub2.into_parts();
439	match kind2 {
440	HirKind::Literal(Literal(bytes)) => {
441	if let Some(ref mut prior_bytes) = prior_lit {
442	prior_bytes.extend_from_slice(&bytes);
443	} else {
444	prior_lit = Some(bytes.to_vec());
445	}
446	}
447	kind2 => {
448	if let Some(prior_bytes) = prior_lit.take() {
449	new.push(Hir::literal(prior_bytes));
450	}
451	new.push(Hir { kind: kind2, props: props2 });
452	}
453	}
454	}
455	}
456	// We can just skip empty HIRs.
457	HirKind::Empty => {}
458	kind => {
459	if let Some(prior_bytes) = prior_lit.take() {
460	new.push(Hir::literal(prior_bytes));
461	}
462	new.push(Hir { kind, props });
463	}
464	}
465	}
466	if let Some(prior_bytes) = prior_lit.take() {
467	new.push(Hir::literal(prior_bytes));
468	}
469	if new.is_empty() {
470	return Hir::empty();
471	} else if new.len() == `1` {
472	return new.pop().unwrap();
473	}
474	let props = Properties::concat(&new);
475	Hir { kind: HirKind::Concat(new), props }
476	}
477
478	/// Returns the alternation of the given expressions.
479	///
480	/// This flattens and simplifies the alternation as appropriate. This may
481	/// include factoring out common prefixes or even rewriting the alternation
482	/// as a character class.
483	///
484	/// Note that an empty alternation is equivalent to `Hir::fail()`. (It
485	/// is not possible for one to write an empty alternation, or even an
486	/// alternation with a single sub-expression, in the concrete syntax of a
487	/// regex.)
488	///
489	/// # Example
490	///
491	/// This is a simple example showing how an alternation might get
492	/// simplified.
493	///
494	/// ```
495	/// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
496	///
497	/// let hir = Hir::alternation(vec![
498	/// Hir::literal([b'a']),
499	/// Hir::literal([b'b']),
500	/// Hir::literal([b'c']),
501	/// Hir::literal([b'd']),
502	/// Hir::literal([b'e']),
503	/// Hir::literal([b'f']),
504	/// ]);
505	/// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
506	/// ClassUnicodeRange::new('a', 'f'),
507	/// ])));
508	/// assert_eq!(expected, hir);
509	/// ```
510	///
511	/// And another example showing how common prefixes might get factored
512	/// out.
513	///
514	/// ```
515	/// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
516	///
517	/// let hir = Hir::alternation(vec![
518	/// Hir::concat(vec![
519	/// Hir::literal("abc".as_bytes()),
520	/// Hir::class(Class::Unicode(ClassUnicode::new([
521	/// ClassUnicodeRange::new('A', 'Z'),
522	/// ]))),
523	/// ]),
524	/// Hir::concat(vec![
525	/// Hir::literal("abc".as_bytes()),
526	/// Hir::class(Class::Unicode(ClassUnicode::new([
527	/// ClassUnicodeRange::new('a', 'z'),
528	/// ]))),
529	/// ]),
530	/// ]);
531	/// let expected = Hir::concat(vec![
532	/// Hir::literal("abc".as_bytes()),
533	/// Hir::alternation(vec![
534	/// Hir::class(Class::Unicode(ClassUnicode::new([
535	/// ClassUnicodeRange::new('A', 'Z'),
536	/// ]))),
537	/// Hir::class(Class::Unicode(ClassUnicode::new([
538	/// ClassUnicodeRange::new('a', 'z'),
539	/// ]))),
540	/// ]),
541	/// ]);
542	/// assert_eq!(expected, hir);
543	/// ```
544	///
545	/// Note that these sorts of simplifications are not guaranteed.
546	pub fn alternation(subs: Vec<Hir>) -> Hir {
547	// We rebuild the alternation by simplifying it. We proceed similarly
548	// as the concatenation case. But in this case, there's no literal
549	// simplification happening. We're just flattening alternations.
550	let mut new = vec![];
551	for sub in subs {
552	let (kind, props) = sub.into_parts();
553	match kind {
554	HirKind::Alternation(subs2) => {
555	new.extend(subs2);
556	}
557	kind => {
558	new.push(Hir { kind, props });
559	}
560	}
561	}
562	if new.is_empty() {
563	return Hir::fail();
564	} else if new.len() == `1` {
565	return new.pop().unwrap();
566	}
567	// Now that it's completely flattened, look for the special case of
568	// 'char1\|char2\|...\|charN' and collapse that into a class. Note that
569	// we look for 'char' first and then bytes. The issue here is that if
570	// we find both non-ASCII codepoints and non-ASCII singleton bytes,
571	// then it isn't actually possible to smush them into a single class.
572	// (Because classes are either "all codepoints" or "all bytes." You
573	// can have a class that both matches non-ASCII but valid UTF-8 and
574	// invalid UTF-8.) So we look for all chars and then all bytes, and
575	// don't handle anything else.
576	if let Some(singletons) = singleton_chars(&new) {
577	let it = singletons
578	.into_iter()
579	.map(\|ch\| ClassUnicodeRange { start: ch, end: ch });
580	return Hir::class(Class::Unicode(ClassUnicode::new(it)));
581	}
582	if let Some(singletons) = singleton_bytes(&new) {
583	let it = singletons
584	.into_iter()
585	.map(\|b\| ClassBytesRange { start: b, end: b });
586	return Hir::class(Class::Bytes(ClassBytes::new(it)));
587	}
588	// Similar to singleton chars, we can also look for alternations of
589	// classes. Those can be smushed into a single class.
590	if let Some(cls) = class_chars(&new) {
591	return Hir::class(cls);
592	}
593	if let Some(cls) = class_bytes(&new) {
594	return Hir::class(cls);
595	}
596	// Factor out a common prefix if we can, which might potentially
597	// simplify the expression and unlock other optimizations downstream.
598	// It also might generally make NFA matching and DFA construction
599	// faster by reducing the scope of branching in the regex.
600	new = match lift_common_prefix(new) {
601	Ok(hir) => return hir,
602	Err(unchanged) => unchanged,
603	};
604	let props = Properties::alternation(&new);
605	Hir { kind: HirKind::Alternation(new), props }
606	}
607
608	/// Returns an HIR expression for `.`.
609	///
610	/// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
611	/// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
612	/// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
613	/// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
614	/// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
615	/// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
616	///
617	/// # Example
618	///
619	/// Note that this is a convenience routine for constructing the correct
620	/// character class based on the value of `Dot`. There is no explicit "dot"
621	/// HIR value. It is just an abbreviation for a common character class.
622	///
623	/// ```
624	/// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
625	///
626	/// let hir = Hir::dot(Dot::AnyByte);
627	/// let expected = Hir::class(Class::Bytes(ClassBytes::new([
628	/// ClassBytesRange::new(`0x00`, `0xFF`),
629	/// ])));
630	/// assert_eq!(expected, hir);
631	/// ```
632	#[inline]
633	pub fn dot(dot: Dot) -> Hir {
634	match dot {
635	Dot::AnyChar => {
636	let mut cls = ClassUnicode::empty();
637	cls.push(ClassUnicodeRange::new('`\0`', '`\u{10FFFF}`'));
638	Hir::class(Class::Unicode(cls))
639	}
640	Dot::AnyByte => {
641	let mut cls = ClassBytes::empty();
642	cls.push(ClassBytesRange::new(b'`\0`', b'`\xFF`'));
643	Hir::class(Class::Bytes(cls))
644	}
645	Dot::AnyCharExceptLF => {
646	let mut cls = ClassUnicode::empty();
647	cls.push(ClassUnicodeRange::new('`\0`', '`\x09`'));
648	cls.push(ClassUnicodeRange::new('`\x0B`', '`\u{10FFFF}`'));
649	Hir::class(Class::Unicode(cls))
650	}
651	Dot::AnyCharExceptCRLF => {
652	let mut cls = ClassUnicode::empty();
653	cls.push(ClassUnicodeRange::new('`\0`', '`\x09`'));
654	cls.push(ClassUnicodeRange::new('`\x0B`', '`\x0C`'));
655	cls.push(ClassUnicodeRange::new('`\x0E`', '`\u{10FFFF}`'));
656	Hir::class(Class::Unicode(cls))
657	}
658	Dot::AnyByteExceptLF => {
659	let mut cls = ClassBytes::empty();
660	cls.push(ClassBytesRange::new(b'`\0`', b'`\x09`'));
661	cls.push(ClassBytesRange::new(b'`\x0B`', b'`\xFF`'));
662	Hir::class(Class::Bytes(cls))
663	}
664	Dot::AnyByteExceptCRLF => {
665	let mut cls = ClassBytes::empty();
666	cls.push(ClassBytesRange::new(b'`\0`', b'`\x09`'));
667	cls.push(ClassBytesRange::new(b'`\x0B`', b'`\x0C`'));
668	cls.push(ClassBytesRange::new(b'`\x0E`', b'`\xFF`'));
669	Hir::class(Class::Bytes(cls))
670	}
671	}
672	}
673	}
674
675	/// The underlying kind of an arbitrary [`Hir`] expression.
676	///
677	/// An `HirKind` is principally useful for doing case analysis on the type
678	/// of a regular expression. If you're looking to build new `Hir` values,
679	/// then you _must_ use the smart constructors defined on `Hir`, like
680	/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
681	/// not expose any way of building an `Hir` directly from an `HirKind`.
682	#[derive(Clone, Debug, Eq, PartialEq)]
683	pub enum HirKind {
684	/// The empty regular expression, which matches everything, including the
685	/// empty string.
686	Empty,
687	/// A literalstring that matches exactly these bytes.
688	Literal(Literal),
689	/// A single character class that matches any of the characters in the
690	/// class. A class can either consist of Unicode scalar values as
691	/// characters, or it can use bytes.
692	///
693	/// A class may be empty. In which case, it matches nothing.
694	Class(Class),
695	/// A look-around assertion. A look-around match always has zero length.
696	Look(Look),
697	/// A repetition operation applied to a sub-expression.
698	Repetition(Repetition),
699	/// A capturing group, which contains a sub-expression.
700	Capture(Capture),
701	/// A concatenation of expressions.
702	///
703	/// A concatenation matches only if each of its sub-expressions match one
704	/// after the other.
705	///
706	/// Concatenations are guaranteed by `Hir`'s smart constructors to always
707	/// have at least two sub-expressions.
708	Concat(Vec<Hir>),
709	/// An alternation of expressions.
710	///
711	/// An alternation matches only if at least one of its sub-expressions
712	/// match. If multiple sub-expressions match, then the leftmost is
713	/// preferred.
714	///
715	/// Alternations are guaranteed by `Hir`'s smart constructors to always
716	/// have at least two sub-expressions.
717	Alternation(Vec<Hir>),
718	}
719
720	impl HirKind {
721	/// Returns a slice of this kind's sub-expressions, if any.
722	pub fn subs(&self) -> &[Hir] {
723	use core::slice::from_ref;
724
725	match *self {
726	HirKind::Empty
727	\| HirKind::Literal(_)
728	\| HirKind::Class(_)
729	\| HirKind::Look(_) => &[],
730	HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub),
731	HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub),
732	HirKind::Concat(ref subs: &Vec) => subs,
733	HirKind::Alternation(ref subs: &Vec) => subs,
734	}
735	}
736	}
737
738	impl core::fmt::Debug for Hir {
739	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
740	self.kind.fmt(f)
741	}
742	}
743
744	/// Print a display representation of this Hir.
745	///
746	/// The result of this is a valid regular expression pattern string.
747	///
748	/// This implementation uses constant stack space and heap space proportional
749	/// to the size of the `Hir`.
750	impl core::fmt::Display for Hir {
751	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
752	crate::hir::print::Printer::new().print(self, wtr:f)
753	}
754	}
755
756	/// The high-level intermediate representation of a literal.
757	///
758	/// A literal corresponds to `0` or more bytes that should be matched
759	/// literally. The smart constructors defined on `Hir` will automatically
760	/// concatenate adjacent literals into one literal, and will even automatically
761	/// replace empty literals with `Hir::empty()`.
762	///
763	/// Note that despite a literal being represented by a sequence of bytes, its
764	/// `Debug` implementation will attempt to print it as a normal string. (That
765	/// is, not a sequence of decimal numbers.)
766	#[derive(Clone, Eq, PartialEq)]
767	pub struct Literal(pub Box<[u8]>);
768
769	impl core::fmt::Debug for Literal {
770	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
771	crate::debug::Bytes(&self.0).fmt(f)
772	}
773	}
774
775	/// The high-level intermediate representation of a character class.
776	///
777	/// A character class corresponds to a set of characters. A character is either
778	/// defined by a Unicode scalar value or a byte. Unicode characters are used
779	/// by default, while bytes are used when Unicode mode (via the `u` flag) is
780	/// disabled.
781	///
782	/// A character class, regardless of its character type, is represented by a
783	/// sequence of non-overlapping non-adjacent ranges of characters.
784	///
785	/// Note that `Bytes` variant may be produced even when it exclusively matches
786	/// valid UTF-8. This is because a `Bytes` variant represents an intention by
787	/// the author of the regular expression to disable Unicode mode, which in turn
788	/// impacts the semantics of case insensitive matching. For example, `(?i)k`
789	/// and `(?i-u)k` will not match the same set of strings.
790	#[derive(Clone, Eq, PartialEq)]
791	pub enum Class {
792	/// A set of characters represented by Unicode scalar values.
793	Unicode(ClassUnicode),
794	/// A set of characters represented by arbitrary bytes (one byte per
795	/// character).
796	Bytes(ClassBytes),
797	}
798
799	impl Class {
800	/// Apply Unicode simple case folding to this character class, in place.
801	/// The character class will be expanded to include all simple case folded
802	/// character variants.
803	///
804	/// If this is a byte oriented character class, then this will be limited
805	/// to the ASCII ranges `A-Z` and `a-z`.
806	///
807	/// # Panics
808	///
809	/// This routine panics when the case mapping data necessary for this
810	/// routine to complete is unavailable. This occurs when the `unicode-case`
811	/// feature is not enabled and the underlying class is Unicode oriented.
812	///
813	/// Callers should prefer using `try_case_fold_simple` instead, which will
814	/// return an error instead of panicking.
815	pub fn case_fold_simple(&mut self) {
816	match *self {
817	Class::Unicode(ref mut x) => x.case_fold_simple(),
818	Class::Bytes(ref mut x) => x.case_fold_simple(),
819	}
820	}
821
822	/// Apply Unicode simple case folding to this character class, in place.
823	/// The character class will be expanded to include all simple case folded
824	/// character variants.
825	///
826	/// If this is a byte oriented character class, then this will be limited
827	/// to the ASCII ranges `A-Z` and `a-z`.
828	///
829	/// # Error
830	///
831	/// This routine returns an error when the case mapping data necessary
832	/// for this routine to complete is unavailable. This occurs when the
833	/// `unicode-case` feature is not enabled and the underlying class is
834	/// Unicode oriented.
835	pub fn try_case_fold_simple(
836	&mut self,
837	) -> core::result::Result<(), CaseFoldError> {
838	match *self {
839	Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
840	Class::Bytes(ref mut x) => x.case_fold_simple(),
841	}
842	Ok(())
843	}
844
845	/// Negate this character class in place.
846	///
847	/// After completion, this character class will contain precisely the
848	/// characters that weren't previously in the class.
849	pub fn negate(&mut self) {
850	match *self {
851	Class::Unicode(ref mut x) => x.negate(),
852	Class::Bytes(ref mut x) => x.negate(),
853	}
854	}
855
856	/// Returns true if and only if this character class will only ever match
857	/// valid UTF-8.
858	///
859	/// A character class can match invalid UTF-8 only when the following
860	/// conditions are met:
861	///
862	/// 1. The translator was configured to permit generating an expression
863	/// that can match invalid UTF-8. (By default, this is disabled.)
864	/// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
865	/// syntax or in the parser builder. By default, Unicode mode is
866	/// enabled.
867	pub fn is_utf8(&self) -> bool {
868	match *self {
869	Class::Unicode(_) => `true`,
870	Class::Bytes(ref x) => x.is_ascii(),
871	}
872	}
873
874	/// Returns the length, in bytes, of the smallest string matched by this
875	/// character class.
876	///
877	/// For non-empty byte oriented classes, this always returns `1`. For
878	/// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
879	/// `4`. For empty classes, `None` is returned. It is impossible for `0` to
880	/// be returned.
881	///
882	/// # Example
883	///
884	/// This example shows some examples of regexes and their corresponding
885	/// minimum length, if any.
886	///
887	/// ```
888	/// use regex_syntax::{hir::Properties, parse};
889	///
890	/// // The empty string has a min length of 0.
891	/// let hir = parse(r"")?;
892	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
893	/// // As do other types of regexes that only match the empty string.
894	/// let hir = parse(r"^$\b\B")?;
895	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
896	/// // A regex that can match the empty string but match more is still 0.
897	/// let hir = parse(r"a*")?;
898	/// assert_eq!(Some(`0`), hir.properties().minimum_len());
899	/// // A regex that matches nothing has no minimum defined.
900	/// let hir = parse(r"[a&&b]")?;
901	/// assert_eq!(None, hir.properties().minimum_len());
902	/// // Character classes usually have a minimum length of 1.
903	/// let hir = parse(r"\w")?;
904	/// assert_eq!(Some(`1`), hir.properties().minimum_len());
905	/// // But sometimes Unicode classes might be bigger!
906	/// let hir = parse(r"\p{Cyrillic}")?;
907	/// assert_eq!(Some(`2`), hir.properties().minimum_len());
908	///
909	/// # Ok::<(), Box<dyn std::error::Error>>(())
910	/// ```
911	pub fn minimum_len(&self) -> Option<usize> {
912	match *self {
913	Class::Unicode(ref x) => x.minimum_len(),
914	Class::Bytes(ref x) => x.minimum_len(),
915	}
916	}
917
918	/// Returns the length, in bytes, of the longest string matched by this
919	/// character class.
920	///
921	/// For non-empty byte oriented classes, this always returns `1`. For
922	/// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
923	/// `4`. For empty classes, `None` is returned. It is impossible for `0` to
924	/// be returned.
925	///
926	/// # Example
927	///
928	/// This example shows some examples of regexes and their corresponding
929	/// maximum length, if any.
930	///
931	/// ```
932	/// use regex_syntax::{hir::Properties, parse};
933	///
934	/// // The empty string has a max length of 0.
935	/// let hir = parse(r"")?;
936	/// assert_eq!(Some(`0`), hir.properties().maximum_len());
937	/// // As do other types of regexes that only match the empty string.
938	/// let hir = parse(r"^$\b\B")?;
939	/// assert_eq!(Some(`0`), hir.properties().maximum_len());
940	/// // A regex that matches nothing has no maximum defined.
941	/// let hir = parse(r"[a&&b]")?;
942	/// assert_eq!(None, hir.properties().maximum_len());
943	/// // Bounded repeats work as you expect.
944	/// let hir = parse(r"x{2,10}")?;
945	/// assert_eq!(Some(`10`), hir.properties().maximum_len());
946	/// // An unbounded repeat means there is no maximum.
947	/// let hir = parse(r"x{2,}")?;
948	/// assert_eq!(None, hir.properties().maximum_len());
949	/// // With Unicode enabled, \w can match up to 4 bytes!
950	/// let hir = parse(r"\w")?;
951	/// assert_eq!(Some(`4`), hir.properties().maximum_len());
952	/// // Without Unicode enabled, \w matches at most 1 byte.
953	/// let hir = parse(r"(?-u)\w")?;
954	/// assert_eq!(Some(`1`), hir.properties().maximum_len());
955	///
956	/// # Ok::<(), Box<dyn std::error::Error>>(())
957	/// ```
958	pub fn maximum_len(&self) -> Option<usize> {
959	match *self {
960	Class::Unicode(ref x) => x.maximum_len(),
961	Class::Bytes(ref x) => x.maximum_len(),
962	}
963	}
964
965	/// Returns true if and only if this character class is empty. That is,
966	/// it has no elements.
967	///
968	/// An empty character can never match anything, including an empty string.
969	pub fn is_empty(&self) -> bool {
970	match *self {
971	Class::Unicode(ref x) => x.ranges().is_empty(),
972	Class::Bytes(ref x) => x.ranges().is_empty(),
973	}
974	}
975
976	/// If this class consists of exactly one element (whether a codepoint or a
977	/// byte), then return it as a literal byte string.
978	///
979	/// If this class is empty or contains more than one element, then `None`
980	/// is returned.
981	pub fn literal(&self) -> Option<Vec<u8>> {
982	match *self {
983	Class::Unicode(ref x) => x.literal(),
984	Class::Bytes(ref x) => x.literal(),
985	}
986	}
987	}
988
989	impl core::fmt::Debug for Class {
990	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
991	use crate::debug::Byte;
992
993	let mut fmter: DebugSet<'_, '_> = f.debug_set();
994	match *self {
995	Class::Unicode(ref cls: &ClassUnicode) => {
996	for r: &ClassUnicodeRange in cls.ranges().iter() {
997	fmter.entry(&(r.start..=r.end));
998	}
999	}
1000	Class::Bytes(ref cls: &ClassBytes) => {
1001	for r: &ClassBytesRange in cls.ranges().iter() {
1002	fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1003	}
1004	}
1005	}
1006	fmter.finish()
1007	}
1008	}
1009
1010	/// A set of characters represented by Unicode scalar values.
1011	#[derive(Clone, Debug, Eq, PartialEq)]
1012	pub struct ClassUnicode {
1013	set: IntervalSet<ClassUnicodeRange>,
1014	}
1015
1016	impl ClassUnicode {
1017	/// Create a new class from a sequence of ranges.
1018	///
1019	/// The given ranges do not need to be in any specific order, and ranges
1020	/// may overlap. Ranges will automatically be sorted into a canonical
1021	/// non-overlapping order.
1022	pub fn new<I>(ranges: I) -> ClassUnicode
1023	where
1024	I: IntoIterator<Item = ClassUnicodeRange>,
1025	{
1026	ClassUnicode { set: IntervalSet::new(ranges) }
1027	}
1028
1029	/// Create a new class with no ranges.
1030	///
1031	/// An empty class matches nothing. That is, it is equivalent to
1032	/// [`Hir::fail`].
1033	pub fn empty() -> ClassUnicode {
1034	ClassUnicode::new(vec![])
1035	}
1036
1037	/// Add a new range to this set.
1038	pub fn push(&mut self, range: ClassUnicodeRange) {
1039	self.set.push(range);
1040	}
1041
1042	/// Return an iterator over all ranges in this class.
1043	///
1044	/// The iterator yields ranges in ascending order.
1045	pub fn iter(&self) -> ClassUnicodeIter<'_> {
1046	ClassUnicodeIter(self.set.iter())
1047	}
1048
1049	/// Return the underlying ranges as a slice.
1050	pub fn ranges(&self) -> &[ClassUnicodeRange] {
1051	self.set.intervals()
1052	}
1053
1054	/// Expand this character class such that it contains all case folded
1055	/// characters, according to Unicode's "simple" mapping. For example, if
1056	/// this class consists of the range `a-z`, then applying case folding will
1057	/// result in the class containing both the ranges `a-z` and `A-Z`.
1058	///
1059	/// # Panics
1060	///
1061	/// This routine panics when the case mapping data necessary for this
1062	/// routine to complete is unavailable. This occurs when the `unicode-case`
1063	/// feature is not enabled.
1064	///
1065	/// Callers should prefer using `try_case_fold_simple` instead, which will
1066	/// return an error instead of panicking.
1067	pub fn case_fold_simple(&mut self) {
1068	self.set
1069	.case_fold_simple()
1070	.expect("unicode-case feature must be enabled");
1071	}
1072
1073	/// Expand this character class such that it contains all case folded
1074	/// characters, according to Unicode's "simple" mapping. For example, if
1075	/// this class consists of the range `a-z`, then applying case folding will
1076	/// result in the class containing both the ranges `a-z` and `A-Z`.
1077	///
1078	/// # Error
1079	///
1080	/// This routine returns an error when the case mapping data necessary
1081	/// for this routine to complete is unavailable. This occurs when the
1082	/// `unicode-case` feature is not enabled.
1083	pub fn try_case_fold_simple(
1084	&mut self,
1085	) -> core::result::Result<(), CaseFoldError> {
1086	self.set.case_fold_simple()
1087	}
1088
1089	/// Negate this character class.
1090	///
1091	/// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1092	/// set, then it will not be in this set after negation.
1093	pub fn negate(&mut self) {
1094	self.set.negate();
1095	}
1096
1097	/// Union this character class with the given character class, in place.
1098	pub fn union(&mut self, other: &ClassUnicode) {
1099	self.set.union(&other.set);
1100	}
1101
1102	/// Intersect this character class with the given character class, in
1103	/// place.
1104	pub fn intersect(&mut self, other: &ClassUnicode) {
1105	self.set.intersect(&other.set);
1106	}
1107
1108	/// Subtract the given character class from this character class, in place.
1109	pub fn difference(&mut self, other: &ClassUnicode) {
1110	self.set.difference(&other.set);
1111	}
1112
1113	/// Compute the symmetric difference of the given character classes, in
1114	/// place.
1115	///
1116	/// This computes the symmetric difference of two character classes. This
1117	/// removes all elements in this class that are also in the given class,
1118	/// but all adds all elements from the given class that aren't in this
1119	/// class. That is, the class will contain all elements in either class,
1120	/// but will not contain any elements that are in both classes.
1121	pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1122	self.set.symmetric_difference(&other.set);
1123	}
1124
1125	/// Returns true if and only if this character class will either match
1126	/// nothing or only ASCII bytes. Stated differently, this returns false
1127	/// if and only if this class contains a non-ASCII codepoint.
1128	pub fn is_ascii(&self) -> bool {
1129	self.set.intervals().last().map_or(`true`, \|r\| r.end <= '`\x7F`')
1130	}
1131
1132	/// Returns the length, in bytes, of the smallest string matched by this
1133	/// character class.
1134	///
1135	/// Returns `None` when the class is empty.
1136	pub fn minimum_len(&self) -> Option<usize> {
1137	let first = self.ranges().get(`0`)?;
1138	// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1139	Some(first.start.len_utf8())
1140	}
1141
1142	/// Returns the length, in bytes, of the longest string matched by this
1143	/// character class.
1144	///
1145	/// Returns `None` when the class is empty.
1146	pub fn maximum_len(&self) -> Option<usize> {
1147	let last = self.ranges().last()?;
1148	// Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1149	Some(last.end.len_utf8())
1150	}
1151
1152	/// If this class consists of exactly one codepoint, then return it as
1153	/// a literal byte string.
1154	///
1155	/// If this class is empty or contains more than one codepoint, then `None`
1156	/// is returned.
1157	pub fn literal(&self) -> Option<Vec<u8>> {
1158	let rs = self.ranges();
1159	if rs.len() == `1` && rs[`0`].start == rs[`0`].end {
1160	Some(rs[`0`].start.encode_utf8(&mut [`0`; `4`]).to_string().into_bytes())
1161	} else {
1162	None
1163	}
1164	}
1165
1166	/// If this class consists of only ASCII ranges, then return its
1167	/// corresponding and equivalent byte class.
1168	pub fn to_byte_class(&self) -> Option<ClassBytes> {
1169	if !self.is_ascii() {
1170	return None;
1171	}
1172	Some(ClassBytes::new(self.ranges().iter().map(\|r\| {
1173	// Since we are guaranteed that our codepoint range is ASCII, the
1174	// 'u8::try_from' calls below are guaranteed to be correct.
1175	ClassBytesRange {
1176	start: u8::try_from(r.start).unwrap(),
1177	end: u8::try_from(r.end).unwrap(),
1178	}
1179	})))
1180	}
1181	}
1182
1183	/// An iterator over all ranges in a Unicode character class.
1184	///
1185	/// The lifetime `'a` refers to the lifetime of the underlying class.
1186	#[derive(Debug)]
1187	pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1188
1189	impl<'a> Iterator for ClassUnicodeIter<'a> {
1190	type Item = &'a ClassUnicodeRange;
1191
1192	fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1193	self.0.next()
1194	}
1195	}
1196
1197	/// A single range of characters represented by Unicode scalar values.
1198	///
1199	/// The range is closed. That is, the start and end of the range are included
1200	/// in the range.
1201	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1202	pub struct ClassUnicodeRange {
1203	start: char,
1204	end: char,
1205	}
1206
1207	impl core::fmt::Debug for ClassUnicodeRange {
1208	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1209	let start: String = if !self.start.is_whitespace() && !self.start.is_control()
1210	{
1211	self.start.to_string()
1212	} else {
1213	format!("0x{:X}", u32::from(self.start))
1214	};
1215	let end: String = if !self.end.is_whitespace() && !self.end.is_control() {
1216	self.end.to_string()
1217	} else {
1218	format!("0x{:X}", u32::from(self.end))
1219	};
1220	f&mut DebugStruct<'_, '_>.debug_struct("ClassUnicodeRange")
1221	.field("start", &start)
1222	.field(name:"end", &end)
1223	.finish()
1224	}
1225	}
1226
1227	impl Interval for ClassUnicodeRange {
1228	type Bound = char;
1229
1230	#[inline]
1231	fn lower(&self) -> char {
1232	self.start
1233	}
1234	#[inline]
1235	fn upper(&self) -> char {
1236	self.end
1237	}
1238	#[inline]
1239	fn set_lower(&mut self, bound: char) {
1240	self.start = bound;
1241	}
1242	#[inline]
1243	fn set_upper(&mut self, bound: char) {
1244	self.end = bound;
1245	}
1246
1247	/// Apply simple case folding to this Unicode scalar value range.
1248	///
1249	/// Additional ranges are appended to the given vector. Canonical ordering
1250	/// is not* maintained in the given vector.*
1251	fn case_fold_simple(
1252	&self,
1253	ranges: &mut Vec<ClassUnicodeRange>,
1254	) -> Result<(), unicode::CaseFoldError> {
1255	let mut folder = unicode::SimpleCaseFolder::new()?;
1256	if !folder.overlaps(self.start, self.end) {
1257	return Ok(());
1258	}
1259	let (start, end) = (u32::from(self.start), u32::from(self.end));
1260	for cp in (start..=end).filter_map(char::from_u32) {
1261	for &cp_folded in folder.mapping(cp) {
1262	ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1263	}
1264	}
1265	Ok(())
1266	}
1267	}
1268
1269	impl ClassUnicodeRange {
1270	/// Create a new Unicode scalar value range for a character class.
1271	///
1272	/// The returned range is always in a canonical form. That is, the range
1273	/// returned always satisfies the invariant that `start <= end`.
1274	pub fn new(start: char, end: char) -> ClassUnicodeRange {
1275	ClassUnicodeRange::create(start, end)
1276	}
1277
1278	/// Return the start of this range.
1279	///
1280	/// The start of a range is always less than or equal to the end of the
1281	/// range.
1282	pub fn start(&self) -> char {
1283	self.start
1284	}
1285
1286	/// Return the end of this range.
1287	///
1288	/// The end of a range is always greater than or equal to the start of the
1289	/// range.
1290	pub fn end(&self) -> char {
1291	self.end
1292	}
1293
1294	/// Returns the number of codepoints in this range.
1295	pub fn len(&self) -> usize {
1296	let diff = `1` + u32::from(self.end) - u32::from(self.start);
1297	// This is likely to panic in 16-bit targets since a usize can only fit
1298	// 2^16. It's not clear what to do here, other than to return an error
1299	// when building a Unicode class that contains a range whose length
1300	// overflows usize. (Which, to be honest, is probably quite common on
1301	// 16-bit targets. For example, this would imply that '.' and '\p{any}'
1302	// would be impossible to build.)
1303	usize::try_from(diff).expect("char class len fits in usize")
1304	}
1305	}
1306
1307	/// A set of characters represented by arbitrary bytes (where one byte
1308	/// corresponds to one character).
1309	#[derive(Clone, Debug, Eq, PartialEq)]
1310	pub struct ClassBytes {
1311	set: IntervalSet<ClassBytesRange>,
1312	}
1313
1314	impl ClassBytes {
1315	/// Create a new class from a sequence of ranges.
1316	///
1317	/// The given ranges do not need to be in any specific order, and ranges
1318	/// may overlap. Ranges will automatically be sorted into a canonical
1319	/// non-overlapping order.
1320	pub fn new<I>(ranges: I) -> ClassBytes
1321	where
1322	I: IntoIterator<Item = ClassBytesRange>,
1323	{
1324	ClassBytes { set: IntervalSet::new(ranges) }
1325	}
1326
1327	/// Create a new class with no ranges.
1328	///
1329	/// An empty class matches nothing. That is, it is equivalent to
1330	/// [`Hir::fail`].
1331	pub fn empty() -> ClassBytes {
1332	ClassBytes::new(vec![])
1333	}
1334
1335	/// Add a new range to this set.
1336	pub fn push(&mut self, range: ClassBytesRange) {
1337	self.set.push(range);
1338	}
1339
1340	/// Return an iterator over all ranges in this class.
1341	///
1342	/// The iterator yields ranges in ascending order.
1343	pub fn iter(&self) -> ClassBytesIter<'_> {
1344	ClassBytesIter(self.set.iter())
1345	}
1346
1347	/// Return the underlying ranges as a slice.
1348	pub fn ranges(&self) -> &[ClassBytesRange] {
1349	self.set.intervals()
1350	}
1351
1352	/// Expand this character class such that it contains all case folded
1353	/// characters. For example, if this class consists of the range `a-z`,
1354	/// then applying case folding will result in the class containing both the
1355	/// ranges `a-z` and `A-Z`.
1356	///
1357	/// Note that this only applies ASCII case folding, which is limited to the
1358	/// characters `a-z` and `A-Z`.
1359	pub fn case_fold_simple(&mut self) {
1360	self.set.case_fold_simple().expect("ASCII case folding never fails");
1361	}
1362
1363	/// Negate this byte class.
1364	///
1365	/// For all `b` where `b` is a any byte, if `b` was in this set, then it
1366	/// will not be in this set after negation.
1367	pub fn negate(&mut self) {
1368	self.set.negate();
1369	}
1370
1371	/// Union this byte class with the given byte class, in place.
1372	pub fn union(&mut self, other: &ClassBytes) {
1373	self.set.union(&other.set);
1374	}
1375
1376	/// Intersect this byte class with the given byte class, in place.
1377	pub fn intersect(&mut self, other: &ClassBytes) {
1378	self.set.intersect(&other.set);
1379	}
1380
1381	/// Subtract the given byte class from this byte class, in place.
1382	pub fn difference(&mut self, other: &ClassBytes) {
1383	self.set.difference(&other.set);
1384	}
1385
1386	/// Compute the symmetric difference of the given byte classes, in place.
1387	///
1388	/// This computes the symmetric difference of two byte classes. This
1389	/// removes all elements in this class that are also in the given class,
1390	/// but all adds all elements from the given class that aren't in this
1391	/// class. That is, the class will contain all elements in either class,
1392	/// but will not contain any elements that are in both classes.
1393	pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1394	self.set.symmetric_difference(&other.set);
1395	}
1396
1397	/// Returns true if and only if this character class will either match
1398	/// nothing or only ASCII bytes. Stated differently, this returns false
1399	/// if and only if this class contains a non-ASCII byte.
1400	pub fn is_ascii(&self) -> bool {
1401	self.set.intervals().last().map_or(`true`, \|r\| r.end <= `0x7F`)
1402	}
1403
1404	/// Returns the length, in bytes, of the smallest string matched by this
1405	/// character class.
1406	///
1407	/// Returns `None` when the class is empty.
1408	pub fn minimum_len(&self) -> Option<usize> {
1409	if self.ranges().is_empty() {
1410	None
1411	} else {
1412	Some(`1`)
1413	}
1414	}
1415
1416	/// Returns the length, in bytes, of the longest string matched by this
1417	/// character class.
1418	///
1419	/// Returns `None` when the class is empty.
1420	pub fn maximum_len(&self) -> Option<usize> {
1421	if self.ranges().is_empty() {
1422	None
1423	} else {
1424	Some(`1`)
1425	}
1426	}
1427
1428	/// If this class consists of exactly one byte, then return it as
1429	/// a literal byte string.
1430	///
1431	/// If this class is empty or contains more than one byte, then `None`
1432	/// is returned.
1433	pub fn literal(&self) -> Option<Vec<u8>> {
1434	let rs = self.ranges();
1435	if rs.len() == `1` && rs[`0`].start == rs[`0`].end {
1436	Some(vec![rs[`0`].start])
1437	} else {
1438	None
1439	}
1440	}
1441
1442	/// If this class consists of only ASCII ranges, then return its
1443	/// corresponding and equivalent Unicode class.
1444	pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1445	if !self.is_ascii() {
1446	return None;
1447	}
1448	Some(ClassUnicode::new(self.ranges().iter().map(\|r\| {
1449	// Since we are guaranteed that our byte range is ASCII, the
1450	// 'char::from' calls below are correct and will not erroneously
1451	// convert a raw byte value into its corresponding codepoint.
1452	ClassUnicodeRange {
1453	start: char::from(r.start),
1454	end: char::from(r.end),
1455	}
1456	})))
1457	}
1458	}
1459
1460	/// An iterator over all ranges in a byte character class.
1461	///
1462	/// The lifetime `'a` refers to the lifetime of the underlying class.
1463	#[derive(Debug)]
1464	pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1465
1466	impl<'a> Iterator for ClassBytesIter<'a> {
1467	type Item = &'a ClassBytesRange;
1468
1469	fn next(&mut self) -> Option<&'a ClassBytesRange> {
1470	self.0.next()
1471	}
1472	}
1473
1474	/// A single range of characters represented by arbitrary bytes.
1475	///
1476	/// The range is closed. That is, the start and end of the range are included
1477	/// in the range.
1478	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1479	pub struct ClassBytesRange {
1480	start: u8,
1481	end: u8,
1482	}
1483
1484	impl Interval for ClassBytesRange {
1485	type Bound = u8;
1486
1487	#[inline]
1488	fn lower(&self) -> u8 {
1489	self.start
1490	}
1491	#[inline]
1492	fn upper(&self) -> u8 {
1493	self.end
1494	}
1495	#[inline]
1496	fn set_lower(&mut self, bound: u8) {
1497	self.start = bound;
1498	}
1499	#[inline]
1500	fn set_upper(&mut self, bound: u8) {
1501	self.end = bound;
1502	}
1503
1504	/// Apply simple case folding to this byte range. Only ASCII case mappings
1505	/// (for a-z) are applied.
1506	///
1507	/// Additional ranges are appended to the given vector. Canonical ordering
1508	/// is not* maintained in the given vector.*
1509	fn case_fold_simple(
1510	&self,
1511	ranges: &mut Vec<ClassBytesRange>,
1512	) -> Result<(), unicode::CaseFoldError> {
1513	if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1514	let lower = cmp::max(self.start, b'a');
1515	let upper = cmp::min(self.end, b'z');
1516	ranges.push(ClassBytesRange::new(lower - `32`, upper - `32`));
1517	}
1518	if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1519	let lower = cmp::max(self.start, b'A');
1520	let upper = cmp::min(self.end, b'Z');
1521	ranges.push(ClassBytesRange::new(lower + `32`, upper + `32`));
1522	}
1523	Ok(())
1524	}
1525	}
1526
1527	impl ClassBytesRange {
1528	/// Create a new byte range for a character class.
1529	///
1530	/// The returned range is always in a canonical form. That is, the range
1531	/// returned always satisfies the invariant that `start <= end`.
1532	pub fn new(start: u8, end: u8) -> ClassBytesRange {
1533	ClassBytesRange::create(start, end)
1534	}
1535
1536	/// Return the start of this range.
1537	///
1538	/// The start of a range is always less than or equal to the end of the
1539	/// range.
1540	pub fn start(&self) -> u8 {
1541	self.start
1542	}
1543
1544	/// Return the end of this range.
1545	///
1546	/// The end of a range is always greater than or equal to the start of the
1547	/// range.
1548	pub fn end(&self) -> u8 {
1549	self.end
1550	}
1551
1552	/// Returns the number of bytes in this range.
1553	pub fn len(&self) -> usize {
1554	usize::from(self.end.checked_sub(self.start).unwrap())
1555	.checked_add(`1`)
1556	.unwrap()
1557	}
1558	}
1559
1560	impl core::fmt::Debug for ClassBytesRange {
1561	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1562	f&mut DebugStruct<'_, '_>.debug_struct("ClassBytesRange")
1563	.field("start", &crate::debug::Byte(self.start))
1564	.field(name:"end", &crate::debug::Byte(self.end))
1565	.finish()
1566	}
1567	}
1568
1569	/// The high-level intermediate representation for a look-around assertion.
1570	///
1571	/// An assertion match is always zero-length. Also called an "empty match."
1572	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1573	pub enum Look {
1574	/// Match the beginning of text. Specifically, this matches at the starting
1575	/// position of the input.
1576	Start = `1` << `0`,
1577	/// Match the end of text. Specifically, this matches at the ending
1578	/// position of the input.
1579	End = `1` << `1`,
1580	/// Match the beginning of a line or the beginning of text. Specifically,
1581	/// this matches at the starting position of the input, or at the position
1582	/// immediately following a `\n` character.
1583	StartLF = `1` << `2`,
1584	/// Match the end of a line or the end of text. Specifically, this matches
1585	/// at the end position of the input, or at the position immediately
1586	/// preceding a `\n` character.
1587	EndLF = `1` << `3`,
1588	/// Match the beginning of a line or the beginning of text. Specifically,
1589	/// this matches at the starting position of the input, or at the position
1590	/// immediately following either a `\r` or `\n` character, but never after
1591	/// a `\r` when a `\n` follows.
1592	StartCRLF = `1` << `4`,
1593	/// Match the end of a line or the end of text. Specifically, this matches
1594	/// at the end position of the input, or at the position immediately
1595	/// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1596	/// precedes it.
1597	EndCRLF = `1` << `5`,
1598	/// Match an ASCII-only word boundary. That is, this matches a position
1599	/// where the left adjacent character and right adjacent character
1600	/// correspond to a word and non-word or a non-word and word character.
1601	WordAscii = `1` << `6`,
1602	/// Match an ASCII-only negation of a word boundary.
1603	WordAsciiNegate = `1` << `7`,
1604	/// Match a Unicode-aware word boundary. That is, this matches a position
1605	/// where the left adjacent character and right adjacent character
1606	/// correspond to a word and non-word or a non-word and word character.
1607	WordUnicode = `1` << `8`,
1608	/// Match a Unicode-aware negation of a word boundary.
1609	WordUnicodeNegate = `1` << `9`,
1610	}
1611
1612	impl Look {
1613	/// Flip the look-around assertion to its equivalent for reverse searches.
1614	/// For example, `StartLF` gets translated to `EndLF`.
1615	///
1616	/// Some assertions, such as `WordUnicode`, remain the same since they
1617	/// match the same positions regardless of the direction of the search.
1618	#[inline]
1619	pub const fn reversed(self) -> Look {
1620	match self {
1621	Look::Start => Look::End,
1622	Look::End => Look::Start,
1623	Look::StartLF => Look::EndLF,
1624	Look::EndLF => Look::StartLF,
1625	Look::StartCRLF => Look::EndCRLF,
1626	Look::EndCRLF => Look::StartCRLF,
1627	Look::WordAscii => Look::WordAscii,
1628	Look::WordAsciiNegate => Look::WordAsciiNegate,
1629	Look::WordUnicode => Look::WordUnicode,
1630	Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1631	}
1632	}
1633
1634	/// Return the underlying representation of this look-around enumeration
1635	/// as an integer. Giving the return value to the [`Look::from_repr`]
1636	/// constructor is guaranteed to return the same look-around variant that
1637	/// one started with within a semver compatible release of this crate.
1638	#[inline]
1639	pub const fn as_repr(self) -> u16 {
1640	// AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1641	// actual int.
1642	self as u16
1643	}
1644
1645	/// Given the underlying representation of a `Look` value, return the
1646	/// corresponding `Look` value if the representation is valid. Otherwise
1647	/// `None` is returned.
1648	#[inline]
1649	pub const fn from_repr(repr: u16) -> Option<Look> {
1650	match repr {
1651	`0b00_0000_0001` => Some(Look::Start),
1652	`0b00_0000_0010` => Some(Look::End),
1653	`0b00_0000_0100` => Some(Look::StartLF),
1654	`0b00_0000_1000` => Some(Look::EndLF),
1655	`0b00_0001_0000` => Some(Look::StartCRLF),
1656	`0b00_0010_0000` => Some(Look::EndCRLF),
1657	`0b00_0100_0000` => Some(Look::WordAscii),
1658	`0b00_1000_0000` => Some(Look::WordAsciiNegate),
1659	`0b01_0000_0000` => Some(Look::WordUnicode),
1660	`0b10_0000_0000` => Some(Look::WordUnicodeNegate),
1661	_ => None,
1662	}
1663	}
1664
1665	/// Returns a convenient single codepoint representation of this
1666	/// look-around assertion. Each assertion is guaranteed to be represented
1667	/// by a distinct character.
1668	///
1669	/// This is useful for succinctly representing a look-around assertion in
1670	/// human friendly but succinct output intended for a programmer working on
1671	/// regex internals.
1672	#[inline]
1673	pub const fn as_char(self) -> char {
1674	match self {
1675	Look::Start => 'A',
1676	Look::End => 'z',
1677	Look::StartLF => '^',
1678	Look::EndLF => '$',
1679	Look::StartCRLF => 'r',
1680	Look::EndCRLF => 'R',
1681	Look::WordAscii => 'b',
1682	Look::WordAsciiNegate => 'B',
1683	Look::WordUnicode => '𝛃',
1684	Look::WordUnicodeNegate => '𝚩',
1685	}
1686	}
1687	}
1688
1689	/// The high-level intermediate representation for a capturing group.
1690	///
1691	/// A capturing group always has an index and a child expression. It may
1692	/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1693	/// necessary.
1694	///
1695	/// Note that there is no explicit representation of a non-capturing group
1696	/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1697	/// the recursive structure of the `Hir` itself.
1698	#[derive(Clone, Debug, Eq, PartialEq)]
1699	pub struct Capture {
1700	/// The capture index of the capture.
1701	pub index: u32,
1702	/// The name of the capture, if it exists.
1703	pub name: Option<Box<str>>,
1704	/// The expression inside the capturing group, which may be empty.
1705	pub sub: Box<Hir>,
1706	}
1707
1708	/// The high-level intermediate representation of a repetition operator.
1709	///
1710	/// A repetition operator permits the repetition of an arbitrary
1711	/// sub-expression.
1712	#[derive(Clone, Debug, Eq, PartialEq)]
1713	pub struct Repetition {
1714	/// The minimum range of the repetition.
1715	///
1716	/// Note that special cases like `?`, `+` and `` all get translated into*
1717	/// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1718	///
1719	/// When `min` is zero, this expression can match the empty string
1720	/// regardless of what its sub-expression is.
1721	pub min: u32,
1722	/// The maximum range of the repetition.
1723	///
1724	/// Note that when `max` is `None`, `min` acts as a lower bound but where
1725	/// there is no upper bound. For something like `x{5}` where the min and
1726	/// max are equivalent, `min` will be set to `5` and `max` will be set to
1727	/// `Some(5)`.
1728	pub max: Option<u32>,
1729	/// Whether this repetition operator is greedy or not. A greedy operator
1730	/// will match as much as it can. A non-greedy operator will match as
1731	/// little as it can.
1732	///
1733	/// Typically, operators are greedy by default and are only non-greedy when
1734	/// a `?` suffix is used, e.g., `(expr)` is greedy while `(expr)?` is
1735	/// not. However, this can be inverted via the `U` "ungreedy" flag.
1736	pub greedy: bool,
1737	/// The expression being repeated.
1738	pub sub: Box<Hir>,
1739	}
1740
1741	impl Repetition {
1742	/// Returns a new repetition with the same `min`, `max` and `greedy`
1743	/// values, but with its sub-expression replaced with the one given.
1744	pub fn with(&self, sub: Hir) -> Repetition {
1745	Repetition {
1746	min: self.min,
1747	max: self.max,
1748	greedy: self.greedy,
1749	sub: Box::new(sub),
1750	}
1751	}
1752	}
1753
1754	/// A type describing the different flavors of `.`.
1755	///
1756	/// This type is meant to be used with [`Hir::dot`], which is a convenience
1757	/// routine for building HIR values derived from the `.` regex.
1758	#[non_exhaustive]
1759	#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1760	pub enum Dot {
1761	/// Matches the UTF-8 encoding of any Unicode scalar value.
1762	///
1763	/// This is equivalent to `(?su:.)` and also `\p{any}`.
1764	AnyChar,
1765	/// Matches any byte value.
1766	///
1767	/// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1768	AnyByte,
1769	/// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1770	///
1771	/// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1772	AnyCharExceptLF,
1773	/// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1774	/// and `\n`.
1775	///
1776	/// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1777	AnyCharExceptCRLF,
1778	/// Matches any byte value except for `\n`.
1779	///
1780	/// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1781	AnyByteExceptLF,
1782	/// Matches any byte value except for `\r` and `\n`.
1783	///
1784	/// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1785	AnyByteExceptCRLF,
1786	}
1787
1788	/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1789	/// space but heap space proportional to the depth of the total `Hir`.
1790	impl Drop for Hir {
1791	fn drop(&mut self) {
1792	use core::mem;
1793
1794	match *self.kind() {
1795	HirKind::Empty
1796	\| HirKind::Literal(_)
1797	\| HirKind::Class(_)
1798	\| HirKind::Look(_) => return,
1799	HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1800	HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1801	return
1802	}
1803	HirKind::Concat(ref x) if x.is_empty() => return,
1804	HirKind::Alternation(ref x) if x.is_empty() => return,
1805	_ => {}
1806	}
1807
1808	let mut stack = vec![mem::replace(self, Hir::empty())];
1809	while let Some(mut expr) = stack.pop() {
1810	match expr.kind {
1811	HirKind::Empty
1812	\| HirKind::Literal(_)
1813	\| HirKind::Class(_)
1814	\| HirKind::Look(_) => {}
1815	HirKind::Capture(ref mut x) => {
1816	stack.push(mem::replace(&mut x.sub, Hir::empty()));
1817	}
1818	HirKind::Repetition(ref mut x) => {
1819	stack.push(mem::replace(&mut x.sub, Hir::empty()));
1820	}
1821	HirKind::Concat(ref mut x) => {
1822	stack.extend(x.drain(..));
1823	}
1824	HirKind::Alternation(ref mut x) => {
1825	stack.extend(x.drain(..));
1826	}
1827	}
1828	}
1829	}
1830	}
1831
1832	/// A type that collects various properties of an HIR value.
1833	///
1834	/// Properties are always scalar values and represent meta data that is
1835	/// computed inductively on an HIR value. Properties are defined for all
1836	/// HIR values.
1837	///
1838	/// All methods on a `Properties` value take constant time and are meant to
1839	/// be cheap to call.
1840	#[derive(Clone, Debug, Eq, PartialEq)]
1841	pub struct Properties(Box<PropertiesI>);
1842
1843	/// The property definition. It is split out so that we can box it, and
1844	/// there by make `Properties` use less stack size. This is kind-of important
1845	/// because every HIR value has a `Properties` attached to it.
1846	///
1847	/// This does have the unfortunate consequence that creating any HIR value
1848	/// always leads to at least one alloc for properties, but this is generally
1849	/// true anyway (for pretty much all HirKinds except for look-arounds).
1850	#[derive(Clone, Debug, Eq, PartialEq)]
1851	struct PropertiesI {
1852	minimum_len: Option<usize>,
1853	maximum_len: Option<usize>,
1854	look_set: LookSet,
1855	look_set_prefix: LookSet,
1856	look_set_suffix: LookSet,
1857	look_set_prefix_any: LookSet,
1858	look_set_suffix_any: LookSet,
1859	utf8: bool,
1860	explicit_captures_len: usize,
1861	static_explicit_captures_len: Option<usize>,
1862	literal: bool,
1863	alternation_literal: bool,
1864	}
1865
1866	impl Properties {
1867	/// Returns the length (in bytes) of the smallest string matched by this
1868	/// HIR.
1869	///
1870	/// A return value of `0` is possible and occurs when the HIR can match an
1871	/// empty string.
1872	///
1873	/// `None` is returned when there is no minimum length. This occurs in
1874	/// precisely the cases where the HIR matches nothing. i.e., The language
1875	/// the regex matches is empty. An example of such a regex is `\P{any}`.
1876	#[inline]
1877	pub fn minimum_len(&self) -> Option<usize> {
1878	self.0.minimum_len
1879	}
1880
1881	/// Returns the length (in bytes) of the longest string matched by this
1882	/// HIR.
1883	///
1884	/// A return value of `0` is possible and occurs when nothing longer than
1885	/// the empty string is in the language described by this HIR.
1886	///
1887	/// `None` is returned when there is no longest matching string. This
1888	/// occurs when the HIR matches nothing or when there is no upper bound on
1889	/// the length of matching strings. Example of such regexes are `\P{any}`
1890	/// (matches nothing) and `a+` (has no upper bound).
1891	#[inline]
1892	pub fn maximum_len(&self) -> Option<usize> {
1893	self.0.maximum_len
1894	}
1895
1896	/// Returns a set of all look-around assertions that appear at least once
1897	/// in this HIR value.
1898	#[inline]
1899	pub fn look_set(&self) -> LookSet {
1900	self.0.look_set
1901	}
1902
1903	/// Returns a set of all look-around assertions that appear as a prefix for
1904	/// this HIR value. That is, the set returned corresponds to the set of
1905	/// assertions that must be passed before matching any bytes in a haystack.
1906	///
1907	/// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
1908	/// if and only if the HIR is fully anchored at the start.
1909	#[inline]
1910	pub fn look_set_prefix(&self) -> LookSet {
1911	self.0.look_set_prefix
1912	}
1913
1914	/// Returns a set of all look-around assertions that appear as a _possible_
1915	/// prefix for this HIR value. That is, the set returned corresponds to the
1916	/// set of assertions that _may_ be passed before matching any bytes in a
1917	/// haystack.
1918	///
1919	/// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
1920	/// true if and only if it's possible for the regex to match through a
1921	/// anchored assertion before consuming any input.
1922	#[inline]
1923	pub fn look_set_prefix_any(&self) -> LookSet {
1924	self.0.look_set_prefix_any
1925	}
1926
1927	/// Returns a set of all look-around assertions that appear as a suffix for
1928	/// this HIR value. That is, the set returned corresponds to the set of
1929	/// assertions that must be passed in order to be considered a match after
1930	/// all other consuming HIR expressions.
1931	///
1932	/// For example, `hir.look_set_suffix().contains(Look::End)` returns true
1933	/// if and only if the HIR is fully anchored at the end.
1934	#[inline]
1935	pub fn look_set_suffix(&self) -> LookSet {
1936	self.0.look_set_suffix
1937	}
1938
1939	/// Returns a set of all look-around assertions that appear as a _possible_
1940	/// suffix for this HIR value. That is, the set returned corresponds to the
1941	/// set of assertions that _may_ be passed before matching any bytes in a
1942	/// haystack.
1943	///
1944	/// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
1945	/// true if and only if it's possible for the regex to match through a
1946	/// anchored assertion at the end of a match without consuming any input.
1947	#[inline]
1948	pub fn look_set_suffix_any(&self) -> LookSet {
1949	self.0.look_set_suffix_any
1950	}
1951
1952	/// Return true if and only if the corresponding HIR will always match
1953	/// valid UTF-8.
1954	///
1955	/// When this returns false, then it is possible for this HIR expression to
1956	/// match invalid UTF-8, including by matching between the code units of
1957	/// a single UTF-8 encoded codepoint.
1958	///
1959	/// Note that this returns true even when the corresponding HIR can match
1960	/// the empty string. Since an empty string can technically appear between
1961	/// UTF-8 code units, it is possible for a match to be reported that splits
1962	/// a codepoint which could in turn be considered matching invalid UTF-8.
1963	/// However, it is generally assumed that such empty matches are handled
1964	/// specially by the search routine if it is absolutely required that
1965	/// matches not split a codepoint.
1966	///
1967	/// # Example
1968	///
1969	/// This code example shows the UTF-8 property of a variety of patterns.
1970	///
1971	/// ```
1972	/// use regex_syntax::{ParserBuilder, parse};
1973	///
1974	/// // Examples of 'is_utf8() == true'.
1975	/// assert!(parse(r"a")?.properties().is_utf8());
1976	/// assert!(parse(r"[^a]")?.properties().is_utf8());
1977	/// assert!(parse(r".")?.properties().is_utf8());
1978	/// assert!(parse(r"\W")?.properties().is_utf8());
1979	/// assert!(parse(r"\b")?.properties().is_utf8());
1980	/// assert!(parse(r"\B")?.properties().is_utf8());
1981	/// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
1982	/// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
1983	/// // Unicode mode is enabled by default, and in
1984	/// // that mode, all \x hex escapes are treated as
1985	/// // codepoints. So this actually matches the UTF-8
1986	/// // encoding of U+00FF.
1987	/// assert!(parse(r"\xFF")?.properties().is_utf8());
1988	///
1989	/// // Now we show examples of 'is_utf8() == false'.
1990	/// // The only way to do this is to force the parser
1991	/// // to permit invalid UTF-8, otherwise all of these
1992	/// // would fail to parse!
1993	/// let parse = \|pattern\| {
1994	/// ParserBuilder::new().utf8(`false`).build().parse(pattern)
1995	/// };
1996	/// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
1997	/// assert!(!parse(r"(?-u).")?.properties().is_utf8());
1998	/// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
1999	/// // Conversely to the equivalent example above,
2000	/// // when Unicode mode is disabled, \x hex escapes
2001	/// // are treated as their raw byte values.
2002	/// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2003	/// // Note that just because we disabled UTF-8 in the
2004	/// // parser doesn't mean we still can't use Unicode.
2005	/// // It is enabled by default, so \xFF is still
2006	/// // equivalent to matching the UTF-8 encoding of
2007	/// // U+00FF by default.
2008	/// assert!(parse(r"\xFF")?.properties().is_utf8());
2009	/// // Even though we use raw bytes that individually
2010	/// // are not valid UTF-8, when combined together, the
2011	/// // overall expression does* match valid UTF-8!*
2012	/// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2013	///
2014	/// # Ok::<(), Box<dyn std::error::Error>>(())
2015	/// ```
2016	#[inline]
2017	pub fn is_utf8(&self) -> bool {
2018	self.0.utf8
2019	}
2020
2021	/// Returns the total number of explicit capturing groups in the
2022	/// corresponding HIR.
2023	///
2024	/// Note that this does not include the implicit capturing group
2025	/// corresponding to the entire match that is typically included by regex
2026	/// engines.
2027	///
2028	/// # Example
2029	///
2030	/// This method will return `0` for `a` and `1` for `(a)`:
2031	///
2032	/// ```
2033	/// use regex_syntax::parse;
2034	///
2035	/// assert_eq!(`0`, parse("a")?.properties().explicit_captures_len());
2036	/// assert_eq!(`1`, parse("(a)")?.properties().explicit_captures_len());
2037	///
2038	/// # Ok::<(), Box<dyn std::error::Error>>(())
2039	/// ```
2040	#[inline]
2041	pub fn explicit_captures_len(&self) -> usize {
2042	self.0.explicit_captures_len
2043	}
2044
2045	/// Returns the total number of explicit capturing groups that appear in
2046	/// every possible match.
2047	///
2048	/// If the number of capture groups can vary depending on the match, then
2049	/// this returns `None`. That is, a value is only returned when the number
2050	/// of matching groups is invariant or "static."
2051	///
2052	/// Note that this does not include the implicit capturing group
2053	/// corresponding to the entire match.
2054	///
2055	/// # Example
2056	///
2057	/// This shows a few cases where a static number of capture groups is
2058	/// available and a few cases where it is not.
2059	///
2060	/// ```
2061	/// use regex_syntax::parse;
2062	///
2063	/// let len = \|pattern\| {
2064	/// parse(pattern).map(\|h\| {
2065	/// h.properties().static_explicit_captures_len()
2066	/// })
2067	/// };
2068	///
2069	/// assert_eq!(Some(`0`), len("a")?);
2070	/// assert_eq!(Some(`1`), len("(a)")?);
2071	/// assert_eq!(Some(`1`), len("(a)\|(b)")?);
2072	/// assert_eq!(Some(`2`), len("(a)(b)\|(c)(d)")?);
2073	/// assert_eq!(None, len("(a)\|b")?);
2074	/// assert_eq!(None, len("a\|(b)")?);
2075	/// assert_eq!(None, len("(b)*")?);
2076	/// assert_eq!(Some(`1`), len("(b)+")?);
2077	///
2078	/// # Ok::<(), Box<dyn std::error::Error>>(())
2079	/// ```
2080	#[inline]
2081	pub fn static_explicit_captures_len(&self) -> Option<usize> {
2082	self.0.static_explicit_captures_len
2083	}
2084
2085	/// Return true if and only if this HIR is a simple literal. This is
2086	/// only true when this HIR expression is either itself a `Literal` or a
2087	/// concatenation of only `Literal`s.
2088	///
2089	/// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2090	/// the empty string are not (even though they contain sub-expressions that
2091	/// are literals).
2092	#[inline]
2093	pub fn is_literal(&self) -> bool {
2094	self.0.literal
2095	}
2096
2097	/// Return true if and only if this HIR is either a simple literal or an
2098	/// alternation of simple literals. This is only
2099	/// true when this HIR expression is either itself a `Literal` or a
2100	/// concatenation of only `Literal`s or an alternation of only `Literal`s.
2101	///
2102	/// For example, `f`, `foo`, `a\|b\|c`, and `foo\|bar\|baz` are alternation
2103	/// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2104	/// (even though that contain sub-expressions that are literals).
2105	#[inline]
2106	pub fn is_alternation_literal(&self) -> bool {
2107	self.0.alternation_literal
2108	}
2109
2110	/// Returns the total amount of heap memory usage, in bytes, used by this
2111	/// `Properties` value.
2112	#[inline]
2113	pub fn memory_usage(&self) -> usize {
2114	core::mem::size_of::<PropertiesI>()
2115	}
2116
2117	/// Returns a new set of properties that corresponds to the union of the
2118	/// iterator of properties given.
2119	///
2120	/// This is useful when one has multiple `Hir` expressions and wants
2121	/// to combine them into a single alternation without constructing the
2122	/// corresponding `Hir`. This routine provides a way of combining the
2123	/// properties of each `Hir` expression into one set of properties
2124	/// representing the union of those expressions.
2125	///
2126	/// # Example: union with HIRs that never match
2127	///
2128	/// This example shows that unioning properties together with one that
2129	/// represents a regex that never matches will "poison" certain attributes,
2130	/// like the minimum and maximum lengths.
2131	///
2132	/// ```
2133	/// use regex_syntax::{hir::Properties, parse};
2134	///
2135	/// let hir1 = parse("ab?c?")?;
2136	/// assert_eq!(Some(`1`), hir1.properties().minimum_len());
2137	/// assert_eq!(Some(`3`), hir1.properties().maximum_len());
2138	///
2139	/// let hir2 = parse(r"[a&&b]")?;
2140	/// assert_eq!(None, hir2.properties().minimum_len());
2141	/// assert_eq!(None, hir2.properties().maximum_len());
2142	///
2143	/// let hir3 = parse(r"wxy?z?")?;
2144	/// assert_eq!(Some(`2`), hir3.properties().minimum_len());
2145	/// assert_eq!(Some(`4`), hir3.properties().maximum_len());
2146	///
2147	/// let unioned = Properties::union([
2148	/// hir1.properties(),
2149	/// hir2.properties(),
2150	/// hir3.properties(),
2151	/// ]);
2152	/// assert_eq!(None, unioned.minimum_len());
2153	/// assert_eq!(None, unioned.maximum_len());
2154	///
2155	/// # Ok::<(), Box<dyn std::error::Error>>(())
2156	/// ```
2157	///
2158	/// The maximum length can also be "poisoned" by a pattern that has no
2159	/// upper bound on the length of a match. The minimum length remains
2160	/// unaffected:
2161	///
2162	/// ```
2163	/// use regex_syntax::{hir::Properties, parse};
2164	///
2165	/// let hir1 = parse("ab?c?")?;
2166	/// assert_eq!(Some(`1`), hir1.properties().minimum_len());
2167	/// assert_eq!(Some(`3`), hir1.properties().maximum_len());
2168	///
2169	/// let hir2 = parse(r"a+")?;
2170	/// assert_eq!(Some(`1`), hir2.properties().minimum_len());
2171	/// assert_eq!(None, hir2.properties().maximum_len());
2172	///
2173	/// let hir3 = parse(r"wxy?z?")?;
2174	/// assert_eq!(Some(`2`), hir3.properties().minimum_len());
2175	/// assert_eq!(Some(`4`), hir3.properties().maximum_len());
2176	///
2177	/// let unioned = Properties::union([
2178	/// hir1.properties(),
2179	/// hir2.properties(),
2180	/// hir3.properties(),
2181	/// ]);
2182	/// assert_eq!(Some(`1`), unioned.minimum_len());
2183	/// assert_eq!(None, unioned.maximum_len());
2184	///
2185	/// # Ok::<(), Box<dyn std::error::Error>>(())
2186	/// ```
2187	pub fn union<I, P>(props: I) -> Properties
2188	where
2189	I: IntoIterator<Item = P>,
2190	P: core::borrow::Borrow<Properties>,
2191	{
2192	let mut it = props.into_iter().peekable();
2193	// While empty alternations aren't possible, we still behave as if they
2194	// are. When we have an empty alternate, then clearly the look-around
2195	// prefix and suffix is empty. Otherwise, it is the intersection of all
2196	// prefixes and suffixes (respectively) of the branches.
2197	let fix = if it.peek().is_none() {
2198	LookSet::empty()
2199	} else {
2200	LookSet::full()
2201	};
2202	// And also, an empty alternate means we have 0 static capture groups,
2203	// but we otherwise start with the number corresponding to the first
2204	// alternate. If any subsequent alternate has a different number of
2205	// static capture groups, then we overall have a variation and not a
2206	// static number of groups.
2207	let static_explicit_captures_len =
2208	it.peek().and_then(\|p\| p.borrow().static_explicit_captures_len());
2209	// The base case is an empty alternation, which matches nothing.
2210	// Note though that empty alternations aren't possible, because the
2211	// Hir::alternation smart constructor rewrites those as empty character
2212	// classes.
2213	let mut props = PropertiesI {
2214	minimum_len: None,
2215	maximum_len: None,
2216	look_set: LookSet::empty(),
2217	look_set_prefix: fix,
2218	look_set_suffix: fix,
2219	look_set_prefix_any: LookSet::empty(),
2220	look_set_suffix_any: LookSet::empty(),
2221	utf8: `true`,
2222	explicit_captures_len: `0`,
2223	static_explicit_captures_len,
2224	literal: `false`,
2225	alternation_literal: `true`,
2226	};
2227	let (mut min_poisoned, mut max_poisoned) = (`false`, `false`);
2228	// Handle properties that need to visit every child hir.
2229	for prop in it {
2230	let p = prop.borrow();
2231	props.look_set.set_union(p.look_set());
2232	props.look_set_prefix.set_intersect(p.look_set_prefix());
2233	props.look_set_suffix.set_intersect(p.look_set_suffix());
2234	props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2235	props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2236	props.utf8 = props.utf8 && p.is_utf8();
2237	props.explicit_captures_len = props
2238	.explicit_captures_len
2239	.saturating_add(p.explicit_captures_len());
2240	if props.static_explicit_captures_len
2241	!= p.static_explicit_captures_len()
2242	{
2243	props.static_explicit_captures_len = None;
2244	}
2245	props.alternation_literal =
2246	props.alternation_literal && p.is_literal();
2247	if !min_poisoned {
2248	if let Some(xmin) = p.minimum_len() {
2249	if props.minimum_len.map_or(`true`, \|pmin\| xmin < pmin) {
2250	props.minimum_len = Some(xmin);
2251	}
2252	} else {
2253	props.minimum_len = None;
2254	min_poisoned = `true`;
2255	}
2256	}
2257	if !max_poisoned {
2258	if let Some(xmax) = p.maximum_len() {
2259	if props.maximum_len.map_or(`true`, \|pmax\| xmax > pmax) {
2260	props.maximum_len = Some(xmax);
2261	}
2262	} else {
2263	props.maximum_len = None;
2264	max_poisoned = `true`;
2265	}
2266	}
2267	}
2268	Properties(Box::new(props))
2269	}
2270	}
2271
2272	impl Properties {
2273	/// Create a new set of HIR properties for an empty regex.
2274	fn empty() -> Properties {
2275	let inner = PropertiesI {
2276	minimum_len: Some(`0`),
2277	maximum_len: Some(`0`),
2278	look_set: LookSet::empty(),
2279	look_set_prefix: LookSet::empty(),
2280	look_set_suffix: LookSet::empty(),
2281	look_set_prefix_any: LookSet::empty(),
2282	look_set_suffix_any: LookSet::empty(),
2283	// It is debatable whether an empty regex always matches at valid
2284	// UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2285	// it is clearly false. There are, for example, many empty strings
2286	// between the bytes encoding a '☃'.
2287	//
2288	// However, when Unicode mode is enabled, the fundamental atom
2289	// of matching is really a codepoint. And in that scenario, an
2290	// empty regex is defined to only match at valid UTF-8 boundaries
2291	// and to never split a codepoint. It just so happens that this
2292	// enforcement is somewhat tricky to do for regexes that match
2293	// the empty string inside regex engines themselves. It usually
2294	// requires some layer above the regex engine to filter out such
2295	// matches.
2296	//
2297	// In any case, 'true' is really the only coherent option. If it
2298	// were false, for example, then 'a' would also need to be false*
2299	// since it too can match the empty string.
2300	utf8: `true`,
2301	explicit_captures_len: `0`,
2302	static_explicit_captures_len: Some(`0`),
2303	literal: `false`,
2304	alternation_literal: `false`,
2305	};
2306	Properties(Box::new(inner))
2307	}
2308
2309	/// Create a new set of HIR properties for a literal regex.
2310	fn literal(lit: &Literal) -> Properties {
2311	let inner = PropertiesI {
2312	minimum_len: Some(lit.0.len()),
2313	maximum_len: Some(lit.0.len()),
2314	look_set: LookSet::empty(),
2315	look_set_prefix: LookSet::empty(),
2316	look_set_suffix: LookSet::empty(),
2317	look_set_prefix_any: LookSet::empty(),
2318	look_set_suffix_any: LookSet::empty(),
2319	utf8: core::str::from_utf8(&lit.0).is_ok(),
2320	explicit_captures_len: `0`,
2321	static_explicit_captures_len: Some(`0`),
2322	literal: `true`,
2323	alternation_literal: `true`,
2324	};
2325	Properties(Box::new(inner))
2326	}
2327
2328	/// Create a new set of HIR properties for a character class.
2329	fn class(class: &Class) -> Properties {
2330	let inner = PropertiesI {
2331	minimum_len: class.minimum_len(),
2332	maximum_len: class.maximum_len(),
2333	look_set: LookSet::empty(),
2334	look_set_prefix: LookSet::empty(),
2335	look_set_suffix: LookSet::empty(),
2336	look_set_prefix_any: LookSet::empty(),
2337	look_set_suffix_any: LookSet::empty(),
2338	utf8: class.is_utf8(),
2339	explicit_captures_len: `0`,
2340	static_explicit_captures_len: Some(`0`),
2341	literal: `false`,
2342	alternation_literal: `false`,
2343	};
2344	Properties(Box::new(inner))
2345	}
2346
2347	/// Create a new set of HIR properties for a look-around assertion.
2348	fn look(look: Look) -> Properties {
2349	let inner = PropertiesI {
2350	minimum_len: Some(`0`),
2351	maximum_len: Some(`0`),
2352	look_set: LookSet::singleton(look),
2353	look_set_prefix: LookSet::singleton(look),
2354	look_set_suffix: LookSet::singleton(look),
2355	look_set_prefix_any: LookSet::singleton(look),
2356	look_set_suffix_any: LookSet::singleton(look),
2357	// This requires a little explanation. Basically, we don't consider
2358	// matching an empty string to be equivalent to matching invalid
2359	// UTF-8, even though technically matching every empty string will
2360	// split the UTF-8 encoding of a single codepoint when treating a
2361	// UTF-8 encoded string as a sequence of bytes. Our defense here is
2362	// that in such a case, a codepoint should logically be treated as
2363	// the fundamental atom for matching, and thus the only valid match
2364	// points are between codepoints and not bytes.
2365	//
2366	// More practically, this is true here because it's also true
2367	// for 'Hir::empty()', otherwise something like 'a' would be*
2368	// considered to match invalid UTF-8. That in turn makes this
2369	// property borderline useless.
2370	utf8: `true`,
2371	explicit_captures_len: `0`,
2372	static_explicit_captures_len: Some(`0`),
2373	literal: `false`,
2374	alternation_literal: `false`,
2375	};
2376	Properties(Box::new(inner))
2377	}
2378
2379	/// Create a new set of HIR properties for a repetition.
2380	fn repetition(rep: &Repetition) -> Properties {
2381	let p = rep.sub.properties();
2382	let minimum_len = p.minimum_len().map(\|child_min\| {
2383	let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2384	child_min.saturating_mul(rep_min)
2385	});
2386	let maximum_len = rep.max.and_then(\|rep_max\| {
2387	let rep_max = usize::try_from(rep_max).ok()?;
2388	let child_max = p.maximum_len()?;
2389	child_max.checked_mul(rep_max)
2390	});
2391
2392	let mut inner = PropertiesI {
2393	minimum_len,
2394	maximum_len,
2395	look_set: p.look_set(),
2396	look_set_prefix: LookSet::empty(),
2397	look_set_suffix: LookSet::empty(),
2398	look_set_prefix_any: p.look_set_prefix_any(),
2399	look_set_suffix_any: p.look_set_suffix_any(),
2400	utf8: p.is_utf8(),
2401	explicit_captures_len: p.explicit_captures_len(),
2402	static_explicit_captures_len: p.static_explicit_captures_len(),
2403	literal: `false`,
2404	alternation_literal: `false`,
2405	};
2406	// If the repetition operator can match the empty string, then its
2407	// lookset prefix and suffixes themselves remain empty since they are
2408	// no longer required to match.
2409	if rep.min > `0` {
2410	inner.look_set_prefix = p.look_set_prefix();
2411	inner.look_set_suffix = p.look_set_suffix();
2412	}
2413	// If the static captures len of the sub-expression is not known or is
2414	// zero, then it automatically propagates to the repetition, regardless
2415	// of the repetition. Otherwise, it might change, but only when the
2416	// repetition can match 0 times.
2417	if rep.min == `0`
2418	&& inner.static_explicit_captures_len.map_or(`false`, \|len\| len > `0`)
2419	{
2420	// If we require a match 0 times, then our captures len is
2421	// guaranteed to be zero. Otherwise, if we can* match the empty*
2422	// string, then it's impossible to know how many captures will be
2423	// in the resulting match.
2424	if rep.max == Some(`0`) {
2425	inner.static_explicit_captures_len = Some(`0`);
2426	} else {
2427	inner.static_explicit_captures_len = None;
2428	}
2429	}
2430	Properties(Box::new(inner))
2431	}
2432
2433	/// Create a new set of HIR properties for a capture.
2434	fn capture(capture: &Capture) -> Properties {
2435	let p = capture.sub.properties();
2436	Properties(Box::new(PropertiesI {
2437	explicit_captures_len: p.explicit_captures_len().saturating_add(`1`),
2438	static_explicit_captures_len: p
2439	.static_explicit_captures_len()
2440	.map(\|len\| len.saturating_add(`1`)),
2441	literal: `false`,
2442	alternation_literal: `false`,
2443	..*p.0.clone()
2444	}))
2445	}
2446
2447	/// Create a new set of HIR properties for a concatenation.
2448	fn concat(concat: &[Hir]) -> Properties {
2449	// The base case is an empty concatenation, which matches the empty
2450	// string. Note though that empty concatenations aren't possible,
2451	// because the Hir::concat smart constructor rewrites those as
2452	// Hir::empty.
2453	let mut props = PropertiesI {
2454	minimum_len: Some(`0`),
2455	maximum_len: Some(`0`),
2456	look_set: LookSet::empty(),
2457	look_set_prefix: LookSet::empty(),
2458	look_set_suffix: LookSet::empty(),
2459	look_set_prefix_any: LookSet::empty(),
2460	look_set_suffix_any: LookSet::empty(),
2461	utf8: `true`,
2462	explicit_captures_len: `0`,
2463	static_explicit_captures_len: Some(`0`),
2464	literal: `true`,
2465	alternation_literal: `true`,
2466	};
2467	// Handle properties that need to visit every child hir.
2468	for x in concat.iter() {
2469	let p = x.properties();
2470	props.look_set.set_union(p.look_set());
2471	props.utf8 = props.utf8 && p.is_utf8();
2472	props.explicit_captures_len = props
2473	.explicit_captures_len
2474	.saturating_add(p.explicit_captures_len());
2475	props.static_explicit_captures_len = p
2476	.static_explicit_captures_len()
2477	.and_then(\|len1\| {
2478	Some((len1, props.static_explicit_captures_len?))
2479	})
2480	.and_then(\|(len1, len2)\| Some(len1.saturating_add(len2)));
2481	props.literal = props.literal && p.is_literal();
2482	props.alternation_literal =
2483	props.alternation_literal && p.is_alternation_literal();
2484	if let Some(minimum_len) = props.minimum_len {
2485	match p.minimum_len() {
2486	None => props.minimum_len = None,
2487	Some(len) => {
2488	// We use saturating arithmetic here because the
2489	// minimum is just a lower bound. We can't go any
2490	// higher than what our number types permit.
2491	props.minimum_len =
2492	Some(minimum_len.saturating_add(len));
2493	}
2494	}
2495	}
2496	if let Some(maximum_len) = props.maximum_len {
2497	match p.maximum_len() {
2498	None => props.maximum_len = None,
2499	Some(len) => {
2500	props.maximum_len = maximum_len.checked_add(len)
2501	}
2502	}
2503	}
2504	}
2505	// Handle the prefix properties, which only requires visiting
2506	// child exprs until one matches more than the empty string.
2507	let mut it = concat.iter();
2508	while let Some(x) = it.next() {
2509	props.look_set_prefix.set_union(x.properties().look_set_prefix());
2510	props
2511	.look_set_prefix_any
2512	.set_union(x.properties().look_set_prefix_any());
2513	if x.properties().maximum_len().map_or(`true`, \|x\| x > `0`) {
2514	break;
2515	}
2516	}
2517	// Same thing for the suffix properties, but in reverse.
2518	let mut it = concat.iter().rev();
2519	while let Some(x) = it.next() {
2520	props.look_set_suffix.set_union(x.properties().look_set_suffix());
2521	props
2522	.look_set_suffix_any
2523	.set_union(x.properties().look_set_suffix_any());
2524	if x.properties().maximum_len().map_or(`true`, \|x\| x > `0`) {
2525	break;
2526	}
2527	}
2528	Properties(Box::new(props))
2529	}
2530
2531	/// Create a new set of HIR properties for a concatenation.
2532	fn alternation(alts: &[Hir]) -> Properties {
2533	Properties::union(alts.iter().map(\|hir\| hir.properties()))
2534	}
2535	}
2536
2537	/// A set of look-around assertions.
2538	///
2539	/// This is useful for efficiently tracking look-around assertions. For
2540	/// example, an [`Hir`] provides properties that return `LookSet`s.
2541	#[derive(Clone, Copy, Default, Eq, PartialEq)]
2542	pub struct LookSet {
2543	/// The underlying representation this set is exposed to make it possible
2544	/// to store it somewhere efficiently. The representation is that
2545	/// of a bitset, where each assertion occupies bit `i` where `i =
2546	/// Look::as_repr()`.
2547	///
2548	/// Note that users of this internal representation must permit the full
2549	/// range of `u16` values to be represented. For example, even if the
2550	/// current implementation only makes use of the 10 least significant bits,
2551	/// it may use more bits in a future semver compatible release.
2552	pub bits: u16,
2553	}
2554
2555	impl LookSet {
2556	/// Create an empty set of look-around assertions.
2557	#[inline]
2558	pub fn empty() -> LookSet {
2559	LookSet { bits: `0` }
2560	}
2561
2562	/// Create a full set of look-around assertions.
2563	///
2564	/// This set contains all possible look-around assertions.
2565	#[inline]
2566	pub fn full() -> LookSet {
2567	LookSet { bits: !`0` }
2568	}
2569
2570	/// Create a look-around set containing the look-around assertion given.
2571	///
2572	/// This is a convenience routine for creating an empty set and inserting
2573	/// one look-around assertions.
2574	#[inline]
2575	pub fn singleton(look: Look) -> LookSet {
2576	LookSet::empty().insert(look)
2577	}
2578
2579	/// Returns the total number of look-around assertions in this set.
2580	#[inline]
2581	pub fn len(self) -> usize {
2582	// OK because max value always fits in a u8, which in turn always
2583	// fits in a usize, regardless of target.
2584	usize::try_from(self.bits.count_ones()).unwrap()
2585	}
2586
2587	/// Returns true if and only if this set is empty.
2588	#[inline]
2589	pub fn is_empty(self) -> bool {
2590	self.len() == `0`
2591	}
2592
2593	/// Returns true if and only if the given look-around assertion is in this
2594	/// set.
2595	#[inline]
2596	pub fn contains(self, look: Look) -> bool {
2597	self.bits & look.as_repr() != `0`
2598	}
2599
2600	/// Returns true if and only if this set contains any anchor assertions.
2601	/// This includes both "start/end of haystack" and "start/end of line."
2602	#[inline]
2603	pub fn contains_anchor(&self) -> bool {
2604	self.contains_anchor_haystack() \|\| self.contains_anchor_line()
2605	}
2606
2607	/// Returns true if and only if this set contains any "start/end of
2608	/// haystack" anchors. This doesn't include "start/end of line" anchors.
2609	#[inline]
2610	pub fn contains_anchor_haystack(&self) -> bool {
2611	self.contains(Look::Start) \|\| self.contains(Look::End)
2612	}
2613
2614	/// Returns true if and only if this set contains any "start/end of line"
2615	/// anchors. This doesn't include "start/end of haystack" anchors. This
2616	/// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2617	#[inline]
2618	pub fn contains_anchor_line(&self) -> bool {
2619	self.contains(Look::StartLF)
2620	\|\| self.contains(Look::EndLF)
2621	\|\| self.contains(Look::StartCRLF)
2622	\|\| self.contains(Look::EndCRLF)
2623	}
2624
2625	/// Returns true if and only if this set contains any "start/end of line"
2626	/// anchors that only treat `\n` as line terminators. This does not include
2627	/// haystack anchors or CRLF aware line anchors.
2628	#[inline]
2629	pub fn contains_anchor_lf(&self) -> bool {
2630	self.contains(Look::StartLF) \|\| self.contains(Look::EndLF)
2631	}
2632
2633	/// Returns true if and only if this set contains any "start/end of line"
2634	/// anchors that are CRLF-aware. This doesn't include "start/end of
2635	/// haystack" or "start/end of line-feed" anchors.
2636	#[inline]
2637	pub fn contains_anchor_crlf(&self) -> bool {
2638	self.contains(Look::StartCRLF) \|\| self.contains(Look::EndCRLF)
2639	}
2640
2641	/// Returns true if and only if this set contains any word boundary or
2642	/// negated word boundary assertions. This include both Unicode and ASCII
2643	/// word boundaries.
2644	#[inline]
2645	pub fn contains_word(self) -> bool {
2646	self.contains_word_unicode() \|\| self.contains_word_ascii()
2647	}
2648
2649	/// Returns true if and only if this set contains any Unicode word boundary
2650	/// or negated Unicode word boundary assertions.
2651	#[inline]
2652	pub fn contains_word_unicode(self) -> bool {
2653	self.contains(Look::WordUnicode)
2654	\|\| self.contains(Look::WordUnicodeNegate)
2655	}
2656
2657	/// Returns true if and only if this set contains any ASCII word boundary
2658	/// or negated ASCII word boundary assertions.
2659	#[inline]
2660	pub fn contains_word_ascii(self) -> bool {
2661	self.contains(Look::WordAscii) \|\| self.contains(Look::WordAsciiNegate)
2662	}
2663
2664	/// Returns an iterator over all of the look-around assertions in this set.
2665	#[inline]
2666	pub fn iter(self) -> LookSetIter {
2667	LookSetIter { set: self }
2668	}
2669
2670	/// Return a new set that is equivalent to the original, but with the given
2671	/// assertion added to it. If the assertion is already in the set, then the
2672	/// returned set is equivalent to the original.
2673	#[inline]
2674	pub fn insert(self, look: Look) -> LookSet {
2675	LookSet { bits: self.bits \| look.as_repr() }
2676	}
2677
2678	/// Updates this set in place with the result of inserting the given
2679	/// assertion into this set.
2680	#[inline]
2681	pub fn set_insert(&mut self, look: Look) {
2682	*self = self.insert(look);
2683	}
2684
2685	/// Return a new set that is equivalent to the original, but with the given
2686	/// assertion removed from it. If the assertion is not in the set, then the
2687	/// returned set is equivalent to the original.
2688	#[inline]
2689	pub fn remove(self, look: Look) -> LookSet {
2690	LookSet { bits: self.bits & !look.as_repr() }
2691	}
2692
2693	/// Updates this set in place with the result of removing the given
2694	/// assertion from this set.
2695	#[inline]
2696	pub fn set_remove(&mut self, look: Look) {
2697	*self = self.remove(look);
2698	}
2699
2700	/// Returns a new set that is the result of subtracting the given set from
2701	/// this set.
2702	#[inline]
2703	pub fn subtract(self, other: LookSet) -> LookSet {
2704	LookSet { bits: self.bits & !other.bits }
2705	}
2706
2707	/// Updates this set in place with the result of subtracting the given set
2708	/// from this set.
2709	#[inline]
2710	pub fn set_subtract(&mut self, other: LookSet) {
2711	*self = self.subtract(other);
2712	}
2713
2714	/// Returns a new set that is the union of this and the one given.
2715	#[inline]
2716	pub fn union(self, other: LookSet) -> LookSet {
2717	LookSet { bits: self.bits \| other.bits }
2718	}
2719
2720	/// Updates this set in place with the result of unioning it with the one
2721	/// given.
2722	#[inline]
2723	pub fn set_union(&mut self, other: LookSet) {
2724	*self = self.union(other);
2725	}
2726
2727	/// Returns a new set that is the intersection of this and the one given.
2728	#[inline]
2729	pub fn intersect(self, other: LookSet) -> LookSet {
2730	LookSet { bits: self.bits & other.bits }
2731	}
2732
2733	/// Updates this set in place with the result of intersecting it with the
2734	/// one given.
2735	#[inline]
2736	pub fn set_intersect(&mut self, other: LookSet) {
2737	*self = self.intersect(other);
2738	}
2739
2740	/// Return a `LookSet` from the slice given as a native endian 16-bit
2741	/// integer.
2742	///
2743	/// # Panics
2744	///
2745	/// This panics if `slice.len() < 2`.
2746	#[inline]
2747	pub fn read_repr(slice: &[u8]) -> LookSet {
2748	let bits = u16::from_ne_bytes(slice[..`2`].try_into().unwrap());
2749	LookSet { bits }
2750	}
2751
2752	/// Write a `LookSet` as a native endian 16-bit integer to the beginning
2753	/// of the slice given.
2754	///
2755	/// # Panics
2756	///
2757	/// This panics if `slice.len() < 2`.
2758	#[inline]
2759	pub fn write_repr(self, slice: &mut [u8]) {
2760	let raw = self.bits.to_ne_bytes();
2761	slice[`0`] = raw[`0`];
2762	slice[`1`] = raw[`1`];
2763	}
2764	}
2765
2766	impl core::fmt::Debug for LookSet {
2767	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2768	if self.is_empty() {
2769	return write!(f, "∅");
2770	}
2771	for look: Look in self.iter() {
2772	write!(f, "{}", look.as_char())?;
2773	}
2774	Ok(())
2775	}
2776	}
2777
2778	/// An iterator over all look-around assertions in a [`LookSet`].
2779	///
2780	/// This iterator is created by [`LookSet::iter`].
2781	#[derive(Clone, Debug)]
2782	pub struct LookSetIter {
2783	set: LookSet,
2784	}
2785
2786	impl Iterator for LookSetIter {
2787	type Item = Look;
2788
2789	#[inline]
2790	fn next(&mut self) -> Option<Look> {
2791	if self.set.is_empty() {
2792	return None;
2793	}
2794	// We'll never have more than u8::MAX distinct look-around assertions,
2795	// so 'repr' will always fit into a u16.
2796	let repr: u16 = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2797	let look: Look = Look::from_repr(`1` << repr)?;
2798	self.set = self.set.remove(look);
2799	Some(look)
2800	}
2801	}
2802
2803	/// Given a sequence of HIR values where each value corresponds to a Unicode
2804	/// class (or an all-ASCII byte class), return a single Unicode class
2805	/// corresponding to the union of the classes found.
2806	fn class_chars(hirs: &[Hir]) -> Option<Class> {
2807	let mut cls: ClassUnicode = ClassUnicode::new(ranges:vec![]);
2808	for hir: &Hir in hirs.iter() {
2809	match *hir.kind() {
2810	HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2811	cls.union(cls2);
2812	}
2813	HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2814	cls.union(&cls2.to_unicode_class()?);
2815	}
2816	_ => return None,
2817	};
2818	}
2819	Some(Class::Unicode(cls))
2820	}
2821
2822	/// Given a sequence of HIR values where each value corresponds to a byte class
2823	/// (or an all-ASCII Unicode class), return a single byte class corresponding
2824	/// to the union of the classes found.
2825	fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2826	let mut cls: ClassBytes = ClassBytes::new(ranges:vec![]);
2827	for hir: &Hir in hirs.iter() {
2828	match *hir.kind() {
2829	HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2830	cls.union(&cls2.to_byte_class()?);
2831	}
2832	HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2833	cls.union(cls2);
2834	}
2835	_ => return None,
2836	};
2837	}
2838	Some(Class::Bytes(cls))
2839	}
2840
2841	/// Given a sequence of HIR values where each value corresponds to a literal
2842	/// that is a single `char`, return that sequence of `char`s. Otherwise return
2843	/// None. No deduplication is done.
2844	fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2845	let mut singletons: Vec = vec![];
2846	for hir: &Hir in hirs.iter() {
2847	let literal: &Box<[u8]> = match *hir.kind() {
2848	HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2849	_ => return None,
2850	};
2851	let ch: char = match crate::debug::utf8_decode(bytes:literal) {
2852	None => return None,
2853	Some(Err(_)) => return None,
2854	Some(Ok(ch: char)) => ch,
2855	};
2856	if literal.len() != ch.len_utf8() {
2857	return None;
2858	}
2859	singletons.push(ch);
2860	}
2861	Some(singletons)
2862	}
2863
2864	/// Given a sequence of HIR values where each value corresponds to a literal
2865	/// that is a single byte, return that sequence of bytes. Otherwise return
2866	/// None. No deduplication is done.
2867	fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
2868	let mut singletons: Vec = vec![];
2869	for hir: &Hir in hirs.iter() {
2870	let literal: &Box<[u8]> = match *hir.kind() {
2871	HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2872	_ => return None,
2873	};
2874	if literal.len() != `1` {
2875	return None;
2876	}
2877	singletons.push(literal[`0`]);
2878	}
2879	Some(singletons)
2880	}
2881
2882	/// Looks for a common prefix in the list of alternation branches given. If one
2883	/// is found, then an equivalent but (hopefully) simplified Hir is returned.
2884	/// Otherwise, the original given list of branches is returned unmodified.
2885	///
2886	/// This is not quite as good as it could be. Right now, it requires that
2887	/// all branches are 'Concat' expressions. It also doesn't do well with
2888	/// literals. For example, given 'foofoo\|foobar', it will not refactor it to
2889	/// 'foo(?:foo\|bar)' because literals are flattened into their own special
2890	/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
2891	/// instead of a string of bytes because of this. Otherwise, handling the
2892	/// current representation in this routine will be pretty gnarly. Sigh.)
2893	fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
2894	if hirs.len() <= `1` {
2895	return Err(hirs);
2896	}
2897	let mut prefix = match hirs[`0`].kind() {
2898	HirKind::Concat(ref xs) => &**xs,
2899	_ => return Err(hirs),
2900	};
2901	if prefix.is_empty() {
2902	return Err(hirs);
2903	}
2904	for h in hirs.iter().skip(`1`) {
2905	let concat = match h.kind() {
2906	HirKind::Concat(ref xs) => xs,
2907	_ => return Err(hirs),
2908	};
2909	let common_len = prefix
2910	.iter()
2911	.zip(concat.iter())
2912	.take_while(\|(x, y)\| x == y)
2913	.count();
2914	prefix = &prefix[..common_len];
2915	if prefix.is_empty() {
2916	return Err(hirs);
2917	}
2918	}
2919	let len = prefix.len();
2920	assert_ne!(`0`, len);
2921	let mut prefix_concat = vec![];
2922	let mut suffix_alts = vec![];
2923	for h in hirs {
2924	let mut concat = match h.into_kind() {
2925	HirKind::Concat(xs) => xs,
2926	// We required all sub-expressions to be
2927	// concats above, so we're only here if we
2928	// have a concat.
2929	_ => unreachable!(),
2930	};
2931	suffix_alts.push(Hir::concat(concat.split_off(len)));
2932	if prefix_concat.is_empty() {
2933	prefix_concat = concat;
2934	}
2935	}
2936	let mut concat = prefix_concat;
2937	concat.push(Hir::alternation(suffix_alts));
2938	Ok(Hir::concat(concat))
2939	}
2940
2941	#[cfg(test)]
2942	mod tests {
2943	use super::*;
2944
2945	fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
2946	let ranges: Vec<ClassUnicodeRange> = ranges
2947	.iter()
2948	.map(\|&(s, e)\| ClassUnicodeRange::new(s, e))
2949	.collect();
2950	ClassUnicode::new(ranges)
2951	}
2952
2953	fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
2954	let ranges: Vec<ClassBytesRange> =
2955	ranges.iter().map(\|&(s, e)\| ClassBytesRange::new(s, e)).collect();
2956	ClassBytes::new(ranges)
2957	}
2958
2959	fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
2960	cls.iter().map(\|x\| (x.start(), x.end())).collect()
2961	}
2962
2963	#[cfg(feature = "unicode-case")]
2964	fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
2965	let mut cls_ = cls.clone();
2966	cls_.case_fold_simple();
2967	cls_
2968	}
2969
2970	fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
2971	let mut cls_ = cls1.clone();
2972	cls_.union(cls2);
2973	cls_
2974	}
2975
2976	fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
2977	let mut cls_ = cls1.clone();
2978	cls_.intersect(cls2);
2979	cls_
2980	}
2981
2982	fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
2983	let mut cls_ = cls1.clone();
2984	cls_.difference(cls2);
2985	cls_
2986	}
2987
2988	fn usymdifference(
2989	cls1: &ClassUnicode,
2990	cls2: &ClassUnicode,
2991	) -> ClassUnicode {
2992	let mut cls_ = cls1.clone();
2993	cls_.symmetric_difference(cls2);
2994	cls_
2995	}
2996
2997	fn unegate(cls: &ClassUnicode) -> ClassUnicode {
2998	let mut cls_ = cls.clone();
2999	cls_.negate();
3000	cls_
3001	}
3002
3003	fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3004	cls.iter().map(\|x\| (x.start(), x.end())).collect()
3005	}
3006
3007	fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3008	let mut cls_ = cls.clone();
3009	cls_.case_fold_simple();
3010	cls_
3011	}
3012
3013	fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3014	let mut cls_ = cls1.clone();
3015	cls_.union(cls2);
3016	cls_
3017	}
3018
3019	fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3020	let mut cls_ = cls1.clone();
3021	cls_.intersect(cls2);
3022	cls_
3023	}
3024
3025	fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3026	let mut cls_ = cls1.clone();
3027	cls_.difference(cls2);
3028	cls_
3029	}
3030
3031	fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3032	let mut cls_ = cls1.clone();
3033	cls_.symmetric_difference(cls2);
3034	cls_
3035	}
3036
3037	fn bnegate(cls: &ClassBytes) -> ClassBytes {
3038	let mut cls_ = cls.clone();
3039	cls_.negate();
3040	cls_
3041	}
3042
3043	#[test]
3044	fn class_range_canonical_unicode() {
3045	let range = ClassUnicodeRange::new('`\u{00FF}`', '`\0`');
3046	assert_eq!('`\0`', range.start());
3047	assert_eq!('`\u{00FF}`', range.end());
3048	}
3049
3050	#[test]
3051	fn class_range_canonical_bytes() {
3052	let range = ClassBytesRange::new(b'`\xFF`', b'`\0`');
3053	assert_eq!(b'`\0`', range.start());
3054	assert_eq!(b'`\xFF`', range.end());
3055	}
3056
3057	#[test]
3058	fn class_canonicalize_unicode() {
3059	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3060	let expected = vec![('a', 'c'), ('x', 'z')];
3061	assert_eq!(expected, uranges(&cls));
3062
3063	let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3064	let expected = vec![('a', 'c'), ('x', 'z')];
3065	assert_eq!(expected, uranges(&cls));
3066
3067	let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3068	let expected = vec![('w', 'z')];
3069	assert_eq!(expected, uranges(&cls));
3070
3071	let cls = uclass(&[
3072	('c', 'f'),
3073	('a', 'g'),
3074	('d', 'j'),
3075	('a', 'c'),
3076	('m', 'p'),
3077	('l', 's'),
3078	]);
3079	let expected = vec![('a', 'j'), ('l', 's')];
3080	assert_eq!(expected, uranges(&cls));
3081
3082	let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3083	let expected = vec![('u', 'z')];
3084	assert_eq!(expected, uranges(&cls));
3085
3086	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`'), ('`\x00`', '`\u{10FFFF}`')]);
3087	let expected = vec![('`\x00`', '`\u{10FFFF}`')];
3088	assert_eq!(expected, uranges(&cls));
3089
3090	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3091	let expected = vec![('a', 'b')];
3092	assert_eq!(expected, uranges(&cls));
3093	}
3094
3095	#[test]
3096	fn class_canonicalize_bytes() {
3097	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3098	let expected = vec![(b'a', b'c'), (b'x', b'z')];
3099	assert_eq!(expected, branges(&cls));
3100
3101	let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3102	let expected = vec![(b'a', b'c'), (b'x', b'z')];
3103	assert_eq!(expected, branges(&cls));
3104
3105	let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3106	let expected = vec![(b'w', b'z')];
3107	assert_eq!(expected, branges(&cls));
3108
3109	let cls = bclass(&[
3110	(b'c', b'f'),
3111	(b'a', b'g'),
3112	(b'd', b'j'),
3113	(b'a', b'c'),
3114	(b'm', b'p'),
3115	(b'l', b's'),
3116	]);
3117	let expected = vec![(b'a', b'j'), (b'l', b's')];
3118	assert_eq!(expected, branges(&cls));
3119
3120	let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3121	let expected = vec![(b'u', b'z')];
3122	assert_eq!(expected, branges(&cls));
3123
3124	let cls = bclass(&[(b'`\x00`', b'`\xFF`'), (b'`\x00`', b'`\xFF`')]);
3125	let expected = vec![(b'`\x00`', b'`\xFF`')];
3126	assert_eq!(expected, branges(&cls));
3127
3128	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3129	let expected = vec![(b'a', b'b')];
3130	assert_eq!(expected, branges(&cls));
3131	}
3132
3133	#[test]
3134	#[cfg(feature = "unicode-case")]
3135	fn class_case_fold_unicode() {
3136	let cls = uclass(&[
3137	('C', 'F'),
3138	('A', 'G'),
3139	('D', 'J'),
3140	('A', 'C'),
3141	('M', 'P'),
3142	('L', 'S'),
3143	('c', 'f'),
3144	]);
3145	let expected = uclass(&[
3146	('A', 'J'),
3147	('L', 'S'),
3148	('a', 'j'),
3149	('l', 's'),
3150	('`\u{17F}`', '`\u{17F}`'),
3151	]);
3152	assert_eq!(expected, ucasefold(&cls));
3153
3154	let cls = uclass(&[('A', 'Z')]);
3155	let expected = uclass(&[
3156	('A', 'Z'),
3157	('a', 'z'),
3158	('`\u{17F}`', '`\u{17F}`'),
3159	('`\u{212A}`', '`\u{212A}`'),
3160	]);
3161	assert_eq!(expected, ucasefold(&cls));
3162
3163	let cls = uclass(&[('a', 'z')]);
3164	let expected = uclass(&[
3165	('A', 'Z'),
3166	('a', 'z'),
3167	('`\u{17F}`', '`\u{17F}`'),
3168	('`\u{212A}`', '`\u{212A}`'),
3169	]);
3170	assert_eq!(expected, ucasefold(&cls));
3171
3172	let cls = uclass(&[('A', 'A'), ('_', '_')]);
3173	let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3174	assert_eq!(expected, ucasefold(&cls));
3175
3176	let cls = uclass(&[('A', 'A'), ('=', '=')]);
3177	let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3178	assert_eq!(expected, ucasefold(&cls));
3179
3180	let cls = uclass(&[('`\x00`', '`\x10`')]);
3181	assert_eq!(cls, ucasefold(&cls));
3182
3183	let cls = uclass(&[('k', 'k')]);
3184	let expected =
3185	uclass(&[('K', 'K'), ('k', 'k'), ('`\u{212A}`', '`\u{212A}`')]);
3186	assert_eq!(expected, ucasefold(&cls));
3187
3188	let cls = uclass(&[('@', '@')]);
3189	assert_eq!(cls, ucasefold(&cls));
3190	}
3191
3192	#[test]
3193	#[cfg(not(feature = "unicode-case"))]
3194	fn class_case_fold_unicode_disabled() {
3195	let mut cls = uclass(&[
3196	('C', 'F'),
3197	('A', 'G'),
3198	('D', 'J'),
3199	('A', 'C'),
3200	('M', 'P'),
3201	('L', 'S'),
3202	('c', 'f'),
3203	]);
3204	assert!(cls.try_case_fold_simple().is_err());
3205	}
3206
3207	#[test]
3208	#[should_panic]
3209	#[cfg(not(feature = "unicode-case"))]
3210	fn class_case_fold_unicode_disabled_panics() {
3211	let mut cls = uclass(&[
3212	('C', 'F'),
3213	('A', 'G'),
3214	('D', 'J'),
3215	('A', 'C'),
3216	('M', 'P'),
3217	('L', 'S'),
3218	('c', 'f'),
3219	]);
3220	cls.case_fold_simple();
3221	}
3222
3223	#[test]
3224	fn class_case_fold_bytes() {
3225	let cls = bclass(&[
3226	(b'C', b'F'),
3227	(b'A', b'G'),
3228	(b'D', b'J'),
3229	(b'A', b'C'),
3230	(b'M', b'P'),
3231	(b'L', b'S'),
3232	(b'c', b'f'),
3233	]);
3234	let expected =
3235	bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3236	assert_eq!(expected, bcasefold(&cls));
3237
3238	let cls = bclass(&[(b'A', b'Z')]);
3239	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3240	assert_eq!(expected, bcasefold(&cls));
3241
3242	let cls = bclass(&[(b'a', b'z')]);
3243	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3244	assert_eq!(expected, bcasefold(&cls));
3245
3246	let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3247	let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3248	assert_eq!(expected, bcasefold(&cls));
3249
3250	let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3251	let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3252	assert_eq!(expected, bcasefold(&cls));
3253
3254	let cls = bclass(&[(b'`\x00`', b'`\x10`')]);
3255	assert_eq!(cls, bcasefold(&cls));
3256
3257	let cls = bclass(&[(b'k', b'k')]);
3258	let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3259	assert_eq!(expected, bcasefold(&cls));
3260
3261	let cls = bclass(&[(b'@', b'@')]);
3262	assert_eq!(cls, bcasefold(&cls));
3263	}
3264
3265	#[test]
3266	fn class_negate_unicode() {
3267	let cls = uclass(&[('a', 'a')]);
3268	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x62`', '`\u{10FFFF}`')]);
3269	assert_eq!(expected, unegate(&cls));
3270
3271	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3272	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x63`', '`\u{10FFFF}`')]);
3273	assert_eq!(expected, unegate(&cls));
3274
3275	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3276	let expected = uclass(&[
3277	('`\x00`', '`\x60`'),
3278	('`\x64`', '`\x77`'),
3279	('`\x7B`', '`\u{10FFFF}`'),
3280	]);
3281	assert_eq!(expected, unegate(&cls));
3282
3283	let cls = uclass(&[('`\x00`', 'a')]);
3284	let expected = uclass(&[('`\x62`', '`\u{10FFFF}`')]);
3285	assert_eq!(expected, unegate(&cls));
3286
3287	let cls = uclass(&[('a', '`\u{10FFFF}`')]);
3288	let expected = uclass(&[('`\x00`', '`\x60`')]);
3289	assert_eq!(expected, unegate(&cls));
3290
3291	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
3292	let expected = uclass(&[]);
3293	assert_eq!(expected, unegate(&cls));
3294
3295	let cls = uclass(&[]);
3296	let expected = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
3297	assert_eq!(expected, unegate(&cls));
3298
3299	let cls =
3300	uclass(&[('`\x00`', '`\u{10FFFD}`'), ('`\u{10FFFF}`', '`\u{10FFFF}`')]);
3301	let expected = uclass(&[('`\u{10FFFE}`', '`\u{10FFFE}`')]);
3302	assert_eq!(expected, unegate(&cls));
3303
3304	let cls = uclass(&[('`\x00`', '`\u{D7FF}`')]);
3305	let expected = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
3306	assert_eq!(expected, unegate(&cls));
3307
3308	let cls = uclass(&[('`\x00`', '`\u{D7FE}`')]);
3309	let expected = uclass(&[('`\u{D7FF}`', '`\u{10FFFF}`')]);
3310	assert_eq!(expected, unegate(&cls));
3311
3312	let cls = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
3313	let expected = uclass(&[('`\x00`', '`\u{D7FF}`')]);
3314	assert_eq!(expected, unegate(&cls));
3315
3316	let cls = uclass(&[('`\u{E001}`', '`\u{10FFFF}`')]);
3317	let expected = uclass(&[('`\x00`', '`\u{E000}`')]);
3318	assert_eq!(expected, unegate(&cls));
3319	}
3320
3321	#[test]
3322	fn class_negate_bytes() {
3323	let cls = bclass(&[(b'a', b'a')]);
3324	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x62`', b'`\xFF`')]);
3325	assert_eq!(expected, bnegate(&cls));
3326
3327	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3328	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x63`', b'`\xFF`')]);
3329	assert_eq!(expected, bnegate(&cls));
3330
3331	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3332	let expected = bclass(&[
3333	(b'`\x00`', b'`\x60`'),
3334	(b'`\x64`', b'`\x77`'),
3335	(b'`\x7B`', b'`\xFF`'),
3336	]);
3337	assert_eq!(expected, bnegate(&cls));
3338
3339	let cls = bclass(&[(b'`\x00`', b'a')]);
3340	let expected = bclass(&[(b'`\x62`', b'`\xFF`')]);
3341	assert_eq!(expected, bnegate(&cls));
3342
3343	let cls = bclass(&[(b'a', b'`\xFF`')]);
3344	let expected = bclass(&[(b'`\x00`', b'`\x60`')]);
3345	assert_eq!(expected, bnegate(&cls));
3346
3347	let cls = bclass(&[(b'`\x00`', b'`\xFF`')]);
3348	let expected = bclass(&[]);
3349	assert_eq!(expected, bnegate(&cls));
3350
3351	let cls = bclass(&[]);
3352	let expected = bclass(&[(b'`\x00`', b'`\xFF`')]);
3353	assert_eq!(expected, bnegate(&cls));
3354
3355	let cls = bclass(&[(b'`\x00`', b'`\xFD`'), (b'`\xFF`', b'`\xFF`')]);
3356	let expected = bclass(&[(b'`\xFE`', b'`\xFE`')]);
3357	assert_eq!(expected, bnegate(&cls));
3358	}
3359
3360	#[test]
3361	fn class_union_unicode() {
3362	let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3363	let cls2 = uclass(&[('a', 'z')]);
3364	let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3365	assert_eq!(expected, uunion(&cls1, &cls2));
3366	}
3367
3368	#[test]
3369	fn class_union_bytes() {
3370	let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3371	let cls2 = bclass(&[(b'a', b'z')]);
3372	let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3373	assert_eq!(expected, bunion(&cls1, &cls2));
3374	}
3375
3376	#[test]
3377	fn class_intersect_unicode() {
3378	let cls1 = uclass(&[]);
3379	let cls2 = uclass(&[('a', 'a')]);
3380	let expected = uclass(&[]);
3381	assert_eq!(expected, uintersect(&cls1, &cls2));
3382
3383	let cls1 = uclass(&[('a', 'a')]);
3384	let cls2 = uclass(&[('a', 'a')]);
3385	let expected = uclass(&[('a', 'a')]);
3386	assert_eq!(expected, uintersect(&cls1, &cls2));
3387
3388	let cls1 = uclass(&[('a', 'a')]);
3389	let cls2 = uclass(&[('b', 'b')]);
3390	let expected = uclass(&[]);
3391	assert_eq!(expected, uintersect(&cls1, &cls2));
3392
3393	let cls1 = uclass(&[('a', 'a')]);
3394	let cls2 = uclass(&[('a', 'c')]);
3395	let expected = uclass(&[('a', 'a')]);
3396	assert_eq!(expected, uintersect(&cls1, &cls2));
3397
3398	let cls1 = uclass(&[('a', 'b')]);
3399	let cls2 = uclass(&[('a', 'c')]);
3400	let expected = uclass(&[('a', 'b')]);
3401	assert_eq!(expected, uintersect(&cls1, &cls2));
3402
3403	let cls1 = uclass(&[('a', 'b')]);
3404	let cls2 = uclass(&[('b', 'c')]);
3405	let expected = uclass(&[('b', 'b')]);
3406	assert_eq!(expected, uintersect(&cls1, &cls2));
3407
3408	let cls1 = uclass(&[('a', 'b')]);
3409	let cls2 = uclass(&[('c', 'd')]);
3410	let expected = uclass(&[]);
3411	assert_eq!(expected, uintersect(&cls1, &cls2));
3412
3413	let cls1 = uclass(&[('b', 'c')]);
3414	let cls2 = uclass(&[('a', 'd')]);
3415	let expected = uclass(&[('b', 'c')]);
3416	assert_eq!(expected, uintersect(&cls1, &cls2));
3417
3418	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3419	let cls2 = uclass(&[('a', 'h')]);
3420	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3421	assert_eq!(expected, uintersect(&cls1, &cls2));
3422
3423	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3424	let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3425	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3426	assert_eq!(expected, uintersect(&cls1, &cls2));
3427
3428	let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3429	let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3430	let expected = uclass(&[]);
3431	assert_eq!(expected, uintersect(&cls1, &cls2));
3432
3433	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3434	let cls2 = uclass(&[('h', 'h')]);
3435	let expected = uclass(&[('h', 'h')]);
3436	assert_eq!(expected, uintersect(&cls1, &cls2));
3437
3438	let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3439	let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3440	let expected = uclass(&[]);
3441	assert_eq!(expected, uintersect(&cls1, &cls2));
3442
3443	let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3444	let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3445	let expected = uclass(&[('b', 'f')]);
3446	assert_eq!(expected, uintersect(&cls1, &cls2));
3447	}
3448
3449	#[test]
3450	fn class_intersect_bytes() {
3451	let cls1 = bclass(&[]);
3452	let cls2 = bclass(&[(b'a', b'a')]);
3453	let expected = bclass(&[]);
3454	assert_eq!(expected, bintersect(&cls1, &cls2));
3455
3456	let cls1 = bclass(&[(b'a', b'a')]);
3457	let cls2 = bclass(&[(b'a', b'a')]);
3458	let expected = bclass(&[(b'a', b'a')]);
3459	assert_eq!(expected, bintersect(&cls1, &cls2));
3460
3461	let cls1 = bclass(&[(b'a', b'a')]);
3462	let cls2 = bclass(&[(b'b', b'b')]);
3463	let expected = bclass(&[]);
3464	assert_eq!(expected, bintersect(&cls1, &cls2));
3465
3466	let cls1 = bclass(&[(b'a', b'a')]);
3467	let cls2 = bclass(&[(b'a', b'c')]);
3468	let expected = bclass(&[(b'a', b'a')]);
3469	assert_eq!(expected, bintersect(&cls1, &cls2));
3470
3471	let cls1 = bclass(&[(b'a', b'b')]);
3472	let cls2 = bclass(&[(b'a', b'c')]);
3473	let expected = bclass(&[(b'a', b'b')]);
3474	assert_eq!(expected, bintersect(&cls1, &cls2));
3475
3476	let cls1 = bclass(&[(b'a', b'b')]);
3477	let cls2 = bclass(&[(b'b', b'c')]);
3478	let expected = bclass(&[(b'b', b'b')]);
3479	assert_eq!(expected, bintersect(&cls1, &cls2));
3480
3481	let cls1 = bclass(&[(b'a', b'b')]);
3482	let cls2 = bclass(&[(b'c', b'd')]);
3483	let expected = bclass(&[]);
3484	assert_eq!(expected, bintersect(&cls1, &cls2));
3485
3486	let cls1 = bclass(&[(b'b', b'c')]);
3487	let cls2 = bclass(&[(b'a', b'd')]);
3488	let expected = bclass(&[(b'b', b'c')]);
3489	assert_eq!(expected, bintersect(&cls1, &cls2));
3490
3491	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3492	let cls2 = bclass(&[(b'a', b'h')]);
3493	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3494	assert_eq!(expected, bintersect(&cls1, &cls2));
3495
3496	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3497	let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3498	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3499	assert_eq!(expected, bintersect(&cls1, &cls2));
3500
3501	let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3502	let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3503	let expected = bclass(&[]);
3504	assert_eq!(expected, bintersect(&cls1, &cls2));
3505
3506	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3507	let cls2 = bclass(&[(b'h', b'h')]);
3508	let expected = bclass(&[(b'h', b'h')]);
3509	assert_eq!(expected, bintersect(&cls1, &cls2));
3510
3511	let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3512	let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3513	let expected = bclass(&[]);
3514	assert_eq!(expected, bintersect(&cls1, &cls2));
3515
3516	let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3517	let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3518	let expected = bclass(&[(b'b', b'f')]);
3519	assert_eq!(expected, bintersect(&cls1, &cls2));
3520	}
3521
3522	#[test]
3523	fn class_difference_unicode() {
3524	let cls1 = uclass(&[('a', 'a')]);
3525	let cls2 = uclass(&[('a', 'a')]);
3526	let expected = uclass(&[]);
3527	assert_eq!(expected, udifference(&cls1, &cls2));
3528
3529	let cls1 = uclass(&[('a', 'a')]);
3530	let cls2 = uclass(&[]);
3531	let expected = uclass(&[('a', 'a')]);
3532	assert_eq!(expected, udifference(&cls1, &cls2));
3533
3534	let cls1 = uclass(&[]);
3535	let cls2 = uclass(&[('a', 'a')]);
3536	let expected = uclass(&[]);
3537	assert_eq!(expected, udifference(&cls1, &cls2));
3538
3539	let cls1 = uclass(&[('a', 'z')]);
3540	let cls2 = uclass(&[('a', 'a')]);
3541	let expected = uclass(&[('b', 'z')]);
3542	assert_eq!(expected, udifference(&cls1, &cls2));
3543
3544	let cls1 = uclass(&[('a', 'z')]);
3545	let cls2 = uclass(&[('z', 'z')]);
3546	let expected = uclass(&[('a', 'y')]);
3547	assert_eq!(expected, udifference(&cls1, &cls2));
3548
3549	let cls1 = uclass(&[('a', 'z')]);
3550	let cls2 = uclass(&[('m', 'm')]);
3551	let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3552	assert_eq!(expected, udifference(&cls1, &cls2));
3553
3554	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3555	let cls2 = uclass(&[('a', 'z')]);
3556	let expected = uclass(&[]);
3557	assert_eq!(expected, udifference(&cls1, &cls2));
3558
3559	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3560	let cls2 = uclass(&[('d', 'v')]);
3561	let expected = uclass(&[('a', 'c')]);
3562	assert_eq!(expected, udifference(&cls1, &cls2));
3563
3564	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3565	let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3566	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3567	assert_eq!(expected, udifference(&cls1, &cls2));
3568
3569	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3570	let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3571	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3572	assert_eq!(expected, udifference(&cls1, &cls2));
3573
3574	let cls1 = uclass(&[('x', 'z')]);
3575	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3576	let expected = uclass(&[('x', 'z')]);
3577	assert_eq!(expected, udifference(&cls1, &cls2));
3578
3579	let cls1 = uclass(&[('a', 'z')]);
3580	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3581	let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3582	assert_eq!(expected, udifference(&cls1, &cls2));
3583	}
3584
3585	#[test]
3586	fn class_difference_bytes() {
3587	let cls1 = bclass(&[(b'a', b'a')]);
3588	let cls2 = bclass(&[(b'a', b'a')]);
3589	let expected = bclass(&[]);
3590	assert_eq!(expected, bdifference(&cls1, &cls2));
3591
3592	let cls1 = bclass(&[(b'a', b'a')]);
3593	let cls2 = bclass(&[]);
3594	let expected = bclass(&[(b'a', b'a')]);
3595	assert_eq!(expected, bdifference(&cls1, &cls2));
3596
3597	let cls1 = bclass(&[]);
3598	let cls2 = bclass(&[(b'a', b'a')]);
3599	let expected = bclass(&[]);
3600	assert_eq!(expected, bdifference(&cls1, &cls2));
3601
3602	let cls1 = bclass(&[(b'a', b'z')]);
3603	let cls2 = bclass(&[(b'a', b'a')]);
3604	let expected = bclass(&[(b'b', b'z')]);
3605	assert_eq!(expected, bdifference(&cls1, &cls2));
3606
3607	let cls1 = bclass(&[(b'a', b'z')]);
3608	let cls2 = bclass(&[(b'z', b'z')]);
3609	let expected = bclass(&[(b'a', b'y')]);
3610	assert_eq!(expected, bdifference(&cls1, &cls2));
3611
3612	let cls1 = bclass(&[(b'a', b'z')]);
3613	let cls2 = bclass(&[(b'm', b'm')]);
3614	let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3615	assert_eq!(expected, bdifference(&cls1, &cls2));
3616
3617	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3618	let cls2 = bclass(&[(b'a', b'z')]);
3619	let expected = bclass(&[]);
3620	assert_eq!(expected, bdifference(&cls1, &cls2));
3621
3622	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3623	let cls2 = bclass(&[(b'd', b'v')]);
3624	let expected = bclass(&[(b'a', b'c')]);
3625	assert_eq!(expected, bdifference(&cls1, &cls2));
3626
3627	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3628	let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3629	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3630	assert_eq!(expected, bdifference(&cls1, &cls2));
3631
3632	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3633	let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3634	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3635	assert_eq!(expected, bdifference(&cls1, &cls2));
3636
3637	let cls1 = bclass(&[(b'x', b'z')]);
3638	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3639	let expected = bclass(&[(b'x', b'z')]);
3640	assert_eq!(expected, bdifference(&cls1, &cls2));
3641
3642	let cls1 = bclass(&[(b'a', b'z')]);
3643	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3644	let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3645	assert_eq!(expected, bdifference(&cls1, &cls2));
3646	}
3647
3648	#[test]
3649	fn class_symmetric_difference_unicode() {
3650	let cls1 = uclass(&[('a', 'm')]);
3651	let cls2 = uclass(&[('g', 't')]);
3652	let expected = uclass(&[('a', 'f'), ('n', 't')]);
3653	assert_eq!(expected, usymdifference(&cls1, &cls2));
3654	}
3655
3656	#[test]
3657	fn class_symmetric_difference_bytes() {
3658	let cls1 = bclass(&[(b'a', b'm')]);
3659	let cls2 = bclass(&[(b'g', b't')]);
3660	let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3661	assert_eq!(expected, bsymdifference(&cls1, &cls2));
3662	}
3663
3664	// We use a thread with an explicit stack size to test that our destructor
3665	// for Hir can handle arbitrarily sized expressions in constant stack
3666	// space. In case we run on a platform without threads (WASM?), we limit
3667	// this test to Windows/Unix.
3668	#[test]
3669	#[cfg(any(unix, windows))]
3670	fn no_stack_overflow_on_drop() {
3671	use std::thread;
3672
3673	let run = \|\| {
3674	let mut expr = Hir::empty();
3675	for _ in `0`..`100` {
3676	expr = Hir::capture(Capture {
3677	index: `1`,
3678	name: None,
3679	sub: Box::new(expr),
3680	});
3681	expr = Hir::repetition(Repetition {
3682	min: `0`,
3683	max: Some(`1`),
3684	greedy: `true`,
3685	sub: Box::new(expr),
3686	});
3687
3688	expr = Hir {
3689	kind: HirKind::Concat(vec![expr]),
3690	props: Properties::empty(),
3691	};
3692	expr = Hir {
3693	kind: HirKind::Alternation(vec![expr]),
3694	props: Properties::empty(),
3695	};
3696	}
3697	assert!(!matches!(*expr.kind(), HirKind::Empty));
3698	};
3699
3700	// We run our test on a thread with a small stack size so we can
3701	// force the issue more easily.
3702	//
3703	// NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3704	// for context on the specific stack size chosen here.
3705	thread::Builder::new()
3706	.stack_size(`16` << `10`)
3707	.spawn(run)
3708	.unwrap()
3709	.join()
3710	.unwrap();
3711	}
3712
3713	#[test]
3714	fn look_set_iter() {
3715	let set = LookSet::empty();
3716	assert_eq!(`0`, set.iter().count());
3717
3718	let set = LookSet::full();
3719	assert_eq!(`10`, set.iter().count());
3720
3721	let set =
3722	LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3723	assert_eq!(`2`, set.iter().count());
3724
3725	let set = LookSet::empty().insert(Look::StartLF);
3726	assert_eq!(`1`, set.iter().count());
3727
3728	let set = LookSet::empty().insert(Look::WordAsciiNegate);
3729	assert_eq!(`1`, set.iter().count());
3730	}
3731
3732	#[test]
3733	fn look_set_debug() {
3734	let res = format!("{:?}", LookSet::empty());
3735	assert_eq!("∅", res);
3736	let res = format!("{:?}", LookSet::full());
3737	assert_eq!("Az^$rRbB𝛃𝚩", res);
3738	}
3739	}
3740