mod.rs - Codebrowser

1	/!*
2	Defines a high-level intermediate representation for regular expressions.
3	*/
4	use std::char;
5	use std::cmp;
6	use std::error;
7	use std::fmt;
8	use std::result;
9	use std::u8;
10
11	use crate::ast::Span;
12	use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
13	use crate::unicode;
14
15	pub use crate::hir::visitor::{visit, Visitor};
16	pub use crate::unicode::CaseFoldError;
17
18	mod interval;
19	pub mod literal;
20	pub mod print;
21	pub mod translate;
22	mod visitor;
23
24	/// An error that can occur while translating an `Ast` to a `Hir`.
25	#[derive(Clone, Debug, Eq, PartialEq)]
26	pub struct Error {
27	/// The kind of error.
28	kind: ErrorKind,
29	/// The original pattern that the translator's Ast was parsed from. Every
30	/// span in an error is a valid range into this string.
31	pattern: String,
32	/// The span of this error, derived from the Ast given to the translator.
33	span: Span,
34	}
35
36	impl Error {
37	/// Return the type of this error.
38	pub fn kind(&self) -> &ErrorKind {
39	&self.kind
40	}
41
42	/// The original pattern string in which this error occurred.
43	///
44	/// Every span reported by this error is reported in terms of this string.
45	pub fn pattern(&self) -> &str {
46	&self.pattern
47	}
48
49	/// Return the span at which this error occurred.
50	pub fn span(&self) -> &Span {
51	&self.span
52	}
53	}
54
55	/// The type of an error that occurred while building an `Hir`.
56	#[derive(Clone, Debug, Eq, PartialEq)]
57	pub enum ErrorKind {
58	/// This error occurs when a Unicode feature is used when Unicode
59	/// support is disabled. For example `(?-u:\pL)` would trigger this error.
60	UnicodeNotAllowed,
61	/// This error occurs when translating a pattern that could match a byte
62	/// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
63	InvalidUtf8,
64	/// This occurs when an unrecognized Unicode property name could not
65	/// be found.
66	UnicodePropertyNotFound,
67	/// This occurs when an unrecognized Unicode property value could not
68	/// be found.
69	UnicodePropertyValueNotFound,
70	/// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
71	/// `\d`) could not be found. This can occur when the `unicode-perl`
72	/// crate feature is not enabled.
73	UnicodePerlClassNotFound,
74	/// This occurs when the Unicode simple case mapping tables are not
75	/// available, and the regular expression required Unicode aware case
76	/// insensitivity.
77	UnicodeCaseUnavailable,
78	/// This occurs when the translator attempts to construct a character class
79	/// that is empty.
80	///
81	/// Note that this restriction in the translator may be removed in the
82	/// future.
83	EmptyClassNotAllowed,
84	/// Hints that destructuring should not be exhaustive.
85	///
86	/// This enum may grow additional variants, so this makes sure clients
87	/// don't count on exhaustive matching. (Otherwise, adding a new variant
88	/// could break existing code.)
89	#[doc(hidden)]
90	__Nonexhaustive,
91	}
92
93	impl ErrorKind {
94	// TODO: Remove this method entirely on the next breaking semver release.
95	#[allow(deprecated)]
96	fn description(&self) -> &str {
97	use self::ErrorKind::*;
98	match *self {
99	UnicodeNotAllowed => "Unicode not allowed here",
100	InvalidUtf8 => "pattern can match invalid UTF-8",
101	UnicodePropertyNotFound => "Unicode property not found",
102	UnicodePropertyValueNotFound => "Unicode property value not found",
103	UnicodePerlClassNotFound => {
104	"Unicode-aware Perl class not found \
105	(make sure the unicode-perl feature is enabled)"
106	}
107	UnicodeCaseUnavailable => {
108	"Unicode-aware case insensitivity matching is not available \
109	(make sure the unicode-case feature is enabled)"
110	}
111	EmptyClassNotAllowed => "empty character classes are not allowed",
112	__Nonexhaustive => unreachable!(),
113	}
114	}
115	}
116
117	impl error::Error for Error {
118	// TODO: Remove this method entirely on the next breaking semver release.
119	#[allow(deprecated)]
120	fn description(&self) -> &str {
121	self.kind.description()
122	}
123	}
124
125	impl fmt::Display for Error {
126	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127	crate::error::Formatter::from(self).fmt(f)
128	}
129	}
130
131	impl fmt::Display for ErrorKind {
132	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133	// TODO: Remove this on the next breaking semver release.
134	#[allow(deprecated)]
135	f.write_str(self.description())
136	}
137	}
138
139	/// A high-level intermediate representation (HIR) for a regular expression.
140	///
141	/// The HIR of a regular expression represents an intermediate step between its
142	/// abstract syntax (a structured description of the concrete syntax) and
143	/// compiled byte codes. The purpose of HIR is to make regular expressions
144	/// easier to analyze. In particular, the AST is much more complex than the
145	/// HIR. For example, while an AST supports arbitrarily nested character
146	/// classes, the HIR will flatten all nested classes into a single set. The HIR
147	/// will also "compile away" every flag present in the concrete syntax. For
148	/// example, users of HIR expressions never need to worry about case folding;
149	/// it is handled automatically by the translator (e.g., by translating `(?i)A`
150	/// to `[aA]`).
151	///
152	/// If the HIR was produced by a translator that disallows invalid UTF-8, then
153	/// the HIR is guaranteed to match UTF-8 exclusively.
154	///
155	/// This type defines its own destructor that uses constant stack space and
156	/// heap space proportional to the size of the HIR.
157	///
158	/// The specific type of an HIR expression can be accessed via its `kind`
159	/// or `into_kind` methods. This extra level of indirection exists for two
160	/// reasons:
161	///
162	/// 1. Construction of an HIR expression must* use the constructor methods*
163	/// on this `Hir` type instead of building the `HirKind` values directly.
164	/// This permits construction to enforce invariants like "concatenations
165	/// always consist of two or more sub-expressions."
166	/// 2. Every HIR expression contains attributes that are defined inductively,
167	/// and can be computed cheaply during the construction process. For
168	/// example, one such attribute is whether the expression must match at the
169	/// beginning of the text.
170	///
171	/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
172	/// expression pattern string, and uses constant stack space and heap space
173	/// proportional to the size of the `Hir`.
174	#[derive(Clone, Debug, Eq, PartialEq)]
175	pub struct Hir {
176	/// The underlying HIR kind.
177	kind: HirKind,
178	/// Analysis info about this HIR, computed during construction.
179	info: HirInfo,
180	}
181
182	/// The kind of an arbitrary `Hir` expression.
183	#[derive(Clone, Debug, Eq, PartialEq)]
184	pub enum HirKind {
185	/// The empty regular expression, which matches everything, including the
186	/// empty string.
187	Empty,
188	/// A single literal character that matches exactly this character.
189	Literal(Literal),
190	/// A single character class that matches any of the characters in the
191	/// class. A class can either consist of Unicode scalar values as
192	/// characters, or it can use bytes.
193	Class(Class),
194	/// An anchor assertion. An anchor assertion match always has zero length.
195	Anchor(Anchor),
196	/// A word boundary assertion, which may or may not be Unicode aware. A
197	/// word boundary assertion match always has zero length.
198	WordBoundary(WordBoundary),
199	/// A repetition operation applied to a child expression.
200	Repetition(Repetition),
201	/// A possibly capturing group, which contains a child expression.
202	Group(Group),
203	/// A concatenation of expressions. A concatenation always has at least two
204	/// child expressions.
205	///
206	/// A concatenation matches only if each of its child expression matches
207	/// one after the other.
208	Concat(Vec<Hir>),
209	/// An alternation of expressions. An alternation always has at least two
210	/// child expressions.
211	///
212	/// An alternation matches only if at least one of its child expression
213	/// matches. If multiple expressions match, then the leftmost is preferred.
214	Alternation(Vec<Hir>),
215	}
216
217	impl Hir {
218	/// Returns a reference to the underlying HIR kind.
219	pub fn kind(&self) -> &HirKind {
220	&self.kind
221	}
222
223	/// Consumes ownership of this HIR expression and returns its underlying
224	/// `HirKind`.
225	pub fn into_kind(mut self) -> HirKind {
226	use std::mem;
227	mem::replace(&mut self.kind, HirKind::Empty)
228	}
229
230	/// Returns an empty HIR expression.
231	///
232	/// An empty HIR expression always matches, including the empty string.
233	pub fn empty() -> Hir {
234	let mut info = HirInfo::new();
235	info.set_always_utf8(`true`);
236	info.set_all_assertions(`true`);
237	info.set_anchored_start(`false`);
238	info.set_anchored_end(`false`);
239	info.set_line_anchored_start(`false`);
240	info.set_line_anchored_end(`false`);
241	info.set_any_anchored_start(`false`);
242	info.set_any_anchored_end(`false`);
243	info.set_match_empty(`true`);
244	info.set_literal(`false`);
245	info.set_alternation_literal(`false`);
246	Hir { kind: HirKind::Empty, info }
247	}
248
249	/// Creates a literal HIR expression.
250	///
251	/// If the given literal has a `Byte` variant with an ASCII byte, then this
252	/// method panics. This enforces the invariant that `Byte` variants are
253	/// only used to express matching of invalid UTF-8.
254	pub fn literal(lit: Literal) -> Hir {
255	if let Literal::Byte(b) = lit {
256	assert!(b > `0x7F`);
257	}
258
259	let mut info = HirInfo::new();
260	info.set_always_utf8(lit.is_unicode());
261	info.set_all_assertions(`false`);
262	info.set_anchored_start(`false`);
263	info.set_anchored_end(`false`);
264	info.set_line_anchored_start(`false`);
265	info.set_line_anchored_end(`false`);
266	info.set_any_anchored_start(`false`);
267	info.set_any_anchored_end(`false`);
268	info.set_match_empty(`false`);
269	info.set_literal(`true`);
270	info.set_alternation_literal(`true`);
271	Hir { kind: HirKind::Literal(lit), info }
272	}
273
274	/// Creates a class HIR expression.
275	pub fn class(class: Class) -> Hir {
276	let mut info = HirInfo::new();
277	info.set_always_utf8(class.is_always_utf8());
278	info.set_all_assertions(`false`);
279	info.set_anchored_start(`false`);
280	info.set_anchored_end(`false`);
281	info.set_line_anchored_start(`false`);
282	info.set_line_anchored_end(`false`);
283	info.set_any_anchored_start(`false`);
284	info.set_any_anchored_end(`false`);
285	info.set_match_empty(`false`);
286	info.set_literal(`false`);
287	info.set_alternation_literal(`false`);
288	Hir { kind: HirKind::Class(class), info }
289	}
290
291	/// Creates an anchor assertion HIR expression.
292	pub fn anchor(anchor: Anchor) -> Hir {
293	let mut info = HirInfo::new();
294	info.set_always_utf8(`true`);
295	info.set_all_assertions(`true`);
296	info.set_anchored_start(`false`);
297	info.set_anchored_end(`false`);
298	info.set_line_anchored_start(`false`);
299	info.set_line_anchored_end(`false`);
300	info.set_any_anchored_start(`false`);
301	info.set_any_anchored_end(`false`);
302	info.set_match_empty(`true`);
303	info.set_literal(`false`);
304	info.set_alternation_literal(`false`);
305	if let Anchor::StartText = anchor {
306	info.set_anchored_start(`true`);
307	info.set_line_anchored_start(`true`);
308	info.set_any_anchored_start(`true`);
309	}
310	if let Anchor::EndText = anchor {
311	info.set_anchored_end(`true`);
312	info.set_line_anchored_end(`true`);
313	info.set_any_anchored_end(`true`);
314	}
315	if let Anchor::StartLine = anchor {
316	info.set_line_anchored_start(`true`);
317	}
318	if let Anchor::EndLine = anchor {
319	info.set_line_anchored_end(`true`);
320	}
321	Hir { kind: HirKind::Anchor(anchor), info }
322	}
323
324	/// Creates a word boundary assertion HIR expression.
325	pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
326	let mut info = HirInfo::new();
327	info.set_always_utf8(`true`);
328	info.set_all_assertions(`true`);
329	info.set_anchored_start(`false`);
330	info.set_anchored_end(`false`);
331	info.set_line_anchored_start(`false`);
332	info.set_line_anchored_end(`false`);
333	info.set_any_anchored_start(`false`);
334	info.set_any_anchored_end(`false`);
335	info.set_literal(`false`);
336	info.set_alternation_literal(`false`);
337	// A negated word boundary matches '', so that's fine. But \b does not
338	// match \b, so why do we say it can match the empty string? Well,
339	// because, if you search for \b against 'a', it will report [0, 0) and
340	// [1, 1) as matches, and both of those matches correspond to the empty
341	// string. Thus, only certain* empty strings match \b, which similarly*
342	// applies to \B.
343	info.set_match_empty(`true`);
344	// Negated ASCII word boundaries can match invalid UTF-8.
345	if let WordBoundary::AsciiNegate = word_boundary {
346	info.set_always_utf8(`false`);
347	}
348	Hir { kind: HirKind::WordBoundary(word_boundary), info }
349	}
350
351	/// Creates a repetition HIR expression.
352	pub fn repetition(rep: Repetition) -> Hir {
353	let mut info = HirInfo::new();
354	info.set_always_utf8(rep.hir.is_always_utf8());
355	info.set_all_assertions(rep.hir.is_all_assertions());
356	// If this operator can match the empty string, then it can never
357	// be anchored.
358	info.set_anchored_start(
359	!rep.is_match_empty() && rep.hir.is_anchored_start(),
360	);
361	info.set_anchored_end(
362	!rep.is_match_empty() && rep.hir.is_anchored_end(),
363	);
364	info.set_line_anchored_start(
365	!rep.is_match_empty() && rep.hir.is_anchored_start(),
366	);
367	info.set_line_anchored_end(
368	!rep.is_match_empty() && rep.hir.is_anchored_end(),
369	);
370	info.set_any_anchored_start(rep.hir.is_any_anchored_start());
371	info.set_any_anchored_end(rep.hir.is_any_anchored_end());
372	info.set_match_empty(rep.is_match_empty() \|\| rep.hir.is_match_empty());
373	info.set_literal(`false`);
374	info.set_alternation_literal(`false`);
375	Hir { kind: HirKind::Repetition(rep), info }
376	}
377
378	/// Creates a group HIR expression.
379	pub fn group(group: Group) -> Hir {
380	let mut info = HirInfo::new();
381	info.set_always_utf8(group.hir.is_always_utf8());
382	info.set_all_assertions(group.hir.is_all_assertions());
383	info.set_anchored_start(group.hir.is_anchored_start());
384	info.set_anchored_end(group.hir.is_anchored_end());
385	info.set_line_anchored_start(group.hir.is_line_anchored_start());
386	info.set_line_anchored_end(group.hir.is_line_anchored_end());
387	info.set_any_anchored_start(group.hir.is_any_anchored_start());
388	info.set_any_anchored_end(group.hir.is_any_anchored_end());
389	info.set_match_empty(group.hir.is_match_empty());
390	info.set_literal(`false`);
391	info.set_alternation_literal(`false`);
392	Hir { kind: HirKind::Group(group), info }
393	}
394
395	/// Returns the concatenation of the given expressions.
396	///
397	/// This flattens the concatenation as appropriate.
398	pub fn concat(mut exprs: Vec<Hir>) -> Hir {
399	match exprs.len() {
400	`0` => Hir::empty(),
401	`1` => exprs.pop().unwrap(),
402	_ => {
403	let mut info = HirInfo::new();
404	info.set_always_utf8(`true`);
405	info.set_all_assertions(`true`);
406	info.set_any_anchored_start(`false`);
407	info.set_any_anchored_end(`false`);
408	info.set_match_empty(`true`);
409	info.set_literal(`true`);
410	info.set_alternation_literal(`true`);
411
412	// Some attributes require analyzing all sub-expressions.
413	for e in &exprs {
414	let x = info.is_always_utf8() && e.is_always_utf8();
415	info.set_always_utf8(x);
416
417	let x = info.is_all_assertions() && e.is_all_assertions();
418	info.set_all_assertions(x);
419
420	let x = info.is_any_anchored_start()
421	\|\| e.is_any_anchored_start();
422	info.set_any_anchored_start(x);
423
424	let x =
425	info.is_any_anchored_end() \|\| e.is_any_anchored_end();
426	info.set_any_anchored_end(x);
427
428	let x = info.is_match_empty() && e.is_match_empty();
429	info.set_match_empty(x);
430
431	let x = info.is_literal() && e.is_literal();
432	info.set_literal(x);
433
434	let x = info.is_alternation_literal()
435	&& e.is_alternation_literal();
436	info.set_alternation_literal(x);
437	}
438	// Anchored attributes require something slightly more
439	// sophisticated. Normally, WLOG, to determine whether an
440	// expression is anchored to the start, we'd only need to check
441	// the first expression of a concatenation. However,
442	// expressions like `$\b^` are still anchored to the start,
443	// but the first expression in the concatenation isn't
444	// anchored to the start. So the "first" expression to look at
445	// is actually one that is either not an assertion or is
446	// specifically the StartText assertion.
447	info.set_anchored_start(
448	exprs
449	.iter()
450	.take_while(\|e\| {
451	e.is_anchored_start() \|\| e.is_all_assertions()
452	})
453	.any(\|e\| e.is_anchored_start()),
454	);
455	// Similarly for the end anchor, but in reverse.
456	info.set_anchored_end(
457	exprs
458	.iter()
459	.rev()
460	.take_while(\|e\| {
461	e.is_anchored_end() \|\| e.is_all_assertions()
462	})
463	.any(\|e\| e.is_anchored_end()),
464	);
465	// Repeat the process for line anchors.
466	info.set_line_anchored_start(
467	exprs
468	.iter()
469	.take_while(\|e\| {
470	e.is_line_anchored_start() \|\| e.is_all_assertions()
471	})
472	.any(\|e\| e.is_line_anchored_start()),
473	);
474	info.set_line_anchored_end(
475	exprs
476	.iter()
477	.rev()
478	.take_while(\|e\| {
479	e.is_line_anchored_end() \|\| e.is_all_assertions()
480	})
481	.any(\|e\| e.is_line_anchored_end()),
482	);
483	Hir { kind: HirKind::Concat(exprs), info }
484	}
485	}
486	}
487
488	/// Returns the alternation of the given expressions.
489	///
490	/// This flattens the alternation as appropriate.
491	pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
492	match exprs.len() {
493	`0` => Hir::empty(),
494	`1` => exprs.pop().unwrap(),
495	_ => {
496	let mut info = HirInfo::new();
497	info.set_always_utf8(`true`);
498	info.set_all_assertions(`true`);
499	info.set_anchored_start(`true`);
500	info.set_anchored_end(`true`);
501	info.set_line_anchored_start(`true`);
502	info.set_line_anchored_end(`true`);
503	info.set_any_anchored_start(`false`);
504	info.set_any_anchored_end(`false`);
505	info.set_match_empty(`false`);
506	info.set_literal(`false`);
507	info.set_alternation_literal(`true`);
508
509	// Some attributes require analyzing all sub-expressions.
510	for e in &exprs {
511	let x = info.is_always_utf8() && e.is_always_utf8();
512	info.set_always_utf8(x);
513
514	let x = info.is_all_assertions() && e.is_all_assertions();
515	info.set_all_assertions(x);
516
517	let x = info.is_anchored_start() && e.is_anchored_start();
518	info.set_anchored_start(x);
519
520	let x = info.is_anchored_end() && e.is_anchored_end();
521	info.set_anchored_end(x);
522
523	let x = info.is_line_anchored_start()
524	&& e.is_line_anchored_start();
525	info.set_line_anchored_start(x);
526
527	let x = info.is_line_anchored_end()
528	&& e.is_line_anchored_end();
529	info.set_line_anchored_end(x);
530
531	let x = info.is_any_anchored_start()
532	\|\| e.is_any_anchored_start();
533	info.set_any_anchored_start(x);
534
535	let x =
536	info.is_any_anchored_end() \|\| e.is_any_anchored_end();
537	info.set_any_anchored_end(x);
538
539	let x = info.is_match_empty() \|\| e.is_match_empty();
540	info.set_match_empty(x);
541
542	let x = info.is_alternation_literal() && e.is_literal();
543	info.set_alternation_literal(x);
544	}
545	Hir { kind: HirKind::Alternation(exprs), info }
546	}
547	}
548	}
549
550	/// Build an HIR expression for `.`.
551	///
552	/// A `.` expression matches any character except for `\n`. To build an
553	/// expression that matches any character, including `\n`, use the `any`
554	/// method.
555	///
556	/// If `bytes` is `true`, then this assumes characters are limited to a
557	/// single byte.
558	pub fn dot(bytes: bool) -> Hir {
559	if bytes {
560	let mut cls = ClassBytes::empty();
561	cls.push(ClassBytesRange::new(b'`\0`', b'`\x09`'));
562	cls.push(ClassBytesRange::new(b'`\x0B`', b'`\xFF`'));
563	Hir::class(Class::Bytes(cls))
564	} else {
565	let mut cls = ClassUnicode::empty();
566	cls.push(ClassUnicodeRange::new('`\0`', '`\x09`'));
567	cls.push(ClassUnicodeRange::new('`\x0B`', '`\u{10FFFF}`'));
568	Hir::class(Class::Unicode(cls))
569	}
570	}
571
572	/// Build an HIR expression for `(?s).`.
573	///
574	/// A `(?s).` expression matches any character, including `\n`. To build an
575	/// expression that matches any character except for `\n`, then use the
576	/// `dot` method.
577	///
578	/// If `bytes` is `true`, then this assumes characters are limited to a
579	/// single byte.
580	pub fn any(bytes: bool) -> Hir {
581	if bytes {
582	let mut cls = ClassBytes::empty();
583	cls.push(ClassBytesRange::new(b'`\0`', b'`\xFF`'));
584	Hir::class(Class::Bytes(cls))
585	} else {
586	let mut cls = ClassUnicode::empty();
587	cls.push(ClassUnicodeRange::new('`\0`', '`\u{10FFFF}`'));
588	Hir::class(Class::Unicode(cls))
589	}
590	}
591
592	/// Return true if and only if this HIR will always match valid UTF-8.
593	///
594	/// When this returns false, then it is possible for this HIR expression
595	/// to match invalid UTF-8.
596	pub fn is_always_utf8(&self) -> bool {
597	self.info.is_always_utf8()
598	}
599
600	/// Returns true if and only if this entire HIR expression is made up of
601	/// zero-width assertions.
602	///
603	/// This includes expressions like `^$\b\A\z` and even `((\b)+())^`, but*
604	/// not `^a`.
605	pub fn is_all_assertions(&self) -> bool {
606	self.info.is_all_assertions()
607	}
608
609	/// Return true if and only if this HIR is required to match from the
610	/// beginning of text. This includes expressions like `^foo`, `^(foo\|bar)`,
611	/// `^foo\|^bar` but not `^foo\|bar`.
612	pub fn is_anchored_start(&self) -> bool {
613	self.info.is_anchored_start()
614	}
615
616	/// Return true if and only if this HIR is required to match at the end
617	/// of text. This includes expressions like `foo$`, `(foo\|bar)$`,
618	/// `foo$\|bar$` but not `foo$\|bar`.
619	pub fn is_anchored_end(&self) -> bool {
620	self.info.is_anchored_end()
621	}
622
623	/// Return true if and only if this HIR is required to match from the
624	/// beginning of text or the beginning of a line. This includes expressions
625	/// like `^foo`, `(?m)^foo`, `^(foo\|bar)`, `^(foo\|bar)`, `(?m)^foo\|^bar`
626	/// but not `^foo\|bar` or `(?m)^foo\|bar`.
627	///
628	/// Note that if `is_anchored_start` is `true`, then
629	/// `is_line_anchored_start` will also be `true`. The reverse implication
630	/// is not true. For example, `(?m)^foo` is line anchored, but not
631	/// `is_anchored_start`.
632	pub fn is_line_anchored_start(&self) -> bool {
633	self.info.is_line_anchored_start()
634	}
635
636	/// Return true if and only if this HIR is required to match at the
637	/// end of text or the end of a line. This includes expressions like
638	/// `foo$`, `(?m)foo$`, `(foo\|bar)$`, `(?m)(foo\|bar)$`, `foo$\|bar$`,
639	/// `(?m)(foo\|bar)$`, but not `foo$\|bar` or `(?m)foo$\|bar`.
640	///
641	/// Note that if `is_anchored_end` is `true`, then
642	/// `is_line_anchored_end` will also be `true`. The reverse implication
643	/// is not true. For example, `(?m)foo$` is line anchored, but not
644	/// `is_anchored_end`.
645	pub fn is_line_anchored_end(&self) -> bool {
646	self.info.is_line_anchored_end()
647	}
648
649	/// Return true if and only if this HIR contains any sub-expression that
650	/// is required to match at the beginning of text. Specifically, this
651	/// returns true if the `^` symbol (when multiline mode is disabled) or the
652	/// `\A` escape appear anywhere in the regex.
653	pub fn is_any_anchored_start(&self) -> bool {
654	self.info.is_any_anchored_start()
655	}
656
657	/// Return true if and only if this HIR contains any sub-expression that is
658	/// required to match at the end of text. Specifically, this returns true
659	/// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
660	/// appear anywhere in the regex.
661	pub fn is_any_anchored_end(&self) -> bool {
662	self.info.is_any_anchored_end()
663	}
664
665	/// Return true if and only if the empty string is part of the language
666	/// matched by this regular expression.
667	///
668	/// This includes `a`, `a?b`, `a{0}`, `()`, `()+`, `^$`, `a\|b?`, `\b`
669	/// and `\B`, but not `a` or `a+`.
670	pub fn is_match_empty(&self) -> bool {
671	self.info.is_match_empty()
672	}
673
674	/// Return true if and only if this HIR is a simple literal. This is only
675	/// true when this HIR expression is either itself a `Literal` or a
676	/// concatenation of only `Literal`s.
677	///
678	/// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
679	/// `` are not (even though that contain sub-expressions that are literals).
680	pub fn is_literal(&self) -> bool {
681	self.info.is_literal()
682	}
683
684	/// Return true if and only if this HIR is either a simple literal or an
685	/// alternation of simple literals. This is only
686	/// true when this HIR expression is either itself a `Literal` or a
687	/// concatenation of only `Literal`s or an alternation of only `Literal`s.
688	///
689	/// For example, `f`, `foo`, `a\|b\|c`, and `foo\|bar\|baz` are alternation
690	/// literals, but `f+`, `(foo)`, `foo()`, ``
691	/// are not (even though that contain sub-expressions that are literals).
692	pub fn is_alternation_literal(&self) -> bool {
693	self.info.is_alternation_literal()
694	}
695	}
696
697	impl HirKind {
698	/// Return true if and only if this HIR is the empty regular expression.
699	///
700	/// Note that this is not defined inductively. That is, it only tests if
701	/// this kind is the `Empty` variant. To get the inductive definition,
702	/// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
703	pub fn is_empty(&self) -> bool {
704	match *self {
705	HirKind::Empty => `true`,
706	_ => `false`,
707	}
708	}
709
710	/// Returns true if and only if this kind has any (including possibly
711	/// empty) subexpressions.
712	pub fn has_subexprs(&self) -> bool {
713	match *self {
714	HirKind::Empty
715	\| HirKind::Literal(_)
716	\| HirKind::Class(_)
717	\| HirKind::Anchor(_)
718	\| HirKind::WordBoundary(_) => `false`,
719	HirKind::Group(_)
720	\| HirKind::Repetition(_)
721	\| HirKind::Concat(_)
722	\| HirKind::Alternation(_) => `true`,
723	}
724	}
725	}
726
727	/// Print a display representation of this Hir.
728	///
729	/// The result of this is a valid regular expression pattern string.
730	///
731	/// This implementation uses constant stack space and heap space proportional
732	/// to the size of the `Hir`.
733	impl fmt::Display for Hir {
734	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
735	use crate::hir::print::Printer;
736	Printer::new().print(self, f)
737	}
738	}
739
740	/// The high-level intermediate representation of a literal.
741	///
742	/// A literal corresponds to a single character, where a character is either
743	/// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
744	/// are preferred whenever possible. In particular, a `Byte` variant is only
745	/// ever produced when it could match invalid UTF-8.
746	#[derive(Clone, Debug, Eq, PartialEq)]
747	pub enum Literal {
748	/// A single character represented by a Unicode scalar value.
749	Unicode(char),
750	/// A single character represented by an arbitrary byte.
751	Byte(u8),
752	}
753
754	impl Literal {
755	/// Returns true if and only if this literal corresponds to a Unicode
756	/// scalar value.
757	pub fn is_unicode(&self) -> bool {
758	match *self {
759	Literal::Unicode(_) => `true`,
760	Literal::Byte(b) if b <= `0x7F` => `true`,
761	Literal::Byte(_) => `false`,
762	}
763	}
764	}
765
766	/// The high-level intermediate representation of a character class.
767	///
768	/// A character class corresponds to a set of characters. A character is either
769	/// defined by a Unicode scalar value or a byte. Unicode characters are used
770	/// by default, while bytes are used when Unicode mode (via the `u` flag) is
771	/// disabled.
772	///
773	/// A character class, regardless of its character type, is represented by a
774	/// sequence of non-overlapping non-adjacent ranges of characters.
775	///
776	/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
777	/// be produced even when it exclusively matches valid UTF-8. This is because
778	/// a `Bytes` variant represents an intention by the author of the regular
779	/// expression to disable Unicode mode, which in turn impacts the semantics of
780	/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
781	/// match the same set of strings.
782	#[derive(Clone, Debug, Eq, PartialEq)]
783	pub enum Class {
784	/// A set of characters represented by Unicode scalar values.
785	Unicode(ClassUnicode),
786	/// A set of characters represented by arbitrary bytes (one byte per
787	/// character).
788	Bytes(ClassBytes),
789	}
790
791	impl Class {
792	/// Apply Unicode simple case folding to this character class, in place.
793	/// The character class will be expanded to include all simple case folded
794	/// character variants.
795	///
796	/// If this is a byte oriented character class, then this will be limited
797	/// to the ASCII ranges `A-Z` and `a-z`.
798	pub fn case_fold_simple(&mut self) {
799	match *self {
800	Class::Unicode(ref mut x) => x.case_fold_simple(),
801	Class::Bytes(ref mut x) => x.case_fold_simple(),
802	}
803	}
804
805	/// Negate this character class in place.
806	///
807	/// After completion, this character class will contain precisely the
808	/// characters that weren't previously in the class.
809	pub fn negate(&mut self) {
810	match *self {
811	Class::Unicode(ref mut x) => x.negate(),
812	Class::Bytes(ref mut x) => x.negate(),
813	}
814	}
815
816	/// Returns true if and only if this character class will only ever match
817	/// valid UTF-8.
818	///
819	/// A character class can match invalid UTF-8 only when the following
820	/// conditions are met:
821	///
822	/// 1. The translator was configured to permit generating an expression
823	/// that can match invalid UTF-8. (By default, this is disabled.)
824	/// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
825	/// syntax or in the parser builder. By default, Unicode mode is
826	/// enabled.
827	pub fn is_always_utf8(&self) -> bool {
828	match *self {
829	Class::Unicode(_) => `true`,
830	Class::Bytes(ref x) => x.is_all_ascii(),
831	}
832	}
833	}
834
835	/// A set of characters represented by Unicode scalar values.
836	#[derive(Clone, Debug, Eq, PartialEq)]
837	pub struct ClassUnicode {
838	set: IntervalSet<ClassUnicodeRange>,
839	}
840
841	impl ClassUnicode {
842	/// Create a new class from a sequence of ranges.
843	///
844	/// The given ranges do not need to be in any specific order, and ranges
845	/// may overlap.
846	pub fn new<I>(ranges: I) -> ClassUnicode
847	where
848	I: IntoIterator<Item = ClassUnicodeRange>,
849	{
850	ClassUnicode { set: IntervalSet::new(ranges) }
851	}
852
853	/// Create a new class with no ranges.
854	pub fn empty() -> ClassUnicode {
855	ClassUnicode::new(vec![])
856	}
857
858	/// Add a new range to this set.
859	pub fn push(&mut self, range: ClassUnicodeRange) {
860	self.set.push(range);
861	}
862
863	/// Return an iterator over all ranges in this class.
864	///
865	/// The iterator yields ranges in ascending order.
866	pub fn iter(&self) -> ClassUnicodeIter<'_> {
867	ClassUnicodeIter(self.set.iter())
868	}
869
870	/// Return the underlying ranges as a slice.
871	pub fn ranges(&self) -> &[ClassUnicodeRange] {
872	self.set.intervals()
873	}
874
875	/// Expand this character class such that it contains all case folded
876	/// characters, according to Unicode's "simple" mapping. For example, if
877	/// this class consists of the range `a-z`, then applying case folding will
878	/// result in the class containing both the ranges `a-z` and `A-Z`.
879	///
880	/// # Panics
881	///
882	/// This routine panics when the case mapping data necessary for this
883	/// routine to complete is unavailable. This occurs when the `unicode-case`
884	/// feature is not enabled.
885	///
886	/// Callers should prefer using `try_case_fold_simple` instead, which will
887	/// return an error instead of panicking.
888	pub fn case_fold_simple(&mut self) {
889	self.set
890	.case_fold_simple()
891	.expect("unicode-case feature must be enabled");
892	}
893
894	/// Expand this character class such that it contains all case folded
895	/// characters, according to Unicode's "simple" mapping. For example, if
896	/// this class consists of the range `a-z`, then applying case folding will
897	/// result in the class containing both the ranges `a-z` and `A-Z`.
898	///
899	/// # Error
900	///
901	/// This routine returns an error when the case mapping data necessary
902	/// for this routine to complete is unavailable. This occurs when the
903	/// `unicode-case` feature is not enabled.
904	pub fn try_case_fold_simple(
905	&mut self,
906	) -> result::Result<(), CaseFoldError> {
907	self.set.case_fold_simple()
908	}
909
910	/// Negate this character class.
911	///
912	/// For all `c` where `c` is a Unicode scalar value, if `c` was in this
913	/// set, then it will not be in this set after negation.
914	pub fn negate(&mut self) {
915	self.set.negate();
916	}
917
918	/// Union this character class with the given character class, in place.
919	pub fn union(&mut self, other: &ClassUnicode) {
920	self.set.union(&other.set);
921	}
922
923	/// Intersect this character class with the given character class, in
924	/// place.
925	pub fn intersect(&mut self, other: &ClassUnicode) {
926	self.set.intersect(&other.set);
927	}
928
929	/// Subtract the given character class from this character class, in place.
930	pub fn difference(&mut self, other: &ClassUnicode) {
931	self.set.difference(&other.set);
932	}
933
934	/// Compute the symmetric difference of the given character classes, in
935	/// place.
936	///
937	/// This computes the symmetric difference of two character classes. This
938	/// removes all elements in this class that are also in the given class,
939	/// but all adds all elements from the given class that aren't in this
940	/// class. That is, the class will contain all elements in either class,
941	/// but will not contain any elements that are in both classes.
942	pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
943	self.set.symmetric_difference(&other.set);
944	}
945
946	/// Returns true if and only if this character class will either match
947	/// nothing or only ASCII bytes. Stated differently, this returns false
948	/// if and only if this class contains a non-ASCII codepoint.
949	pub fn is_all_ascii(&self) -> bool {
950	self.set.intervals().last().map_or(`true`, \|r\| r.end <= '`\x7F`')
951	}
952	}
953
954	/// An iterator over all ranges in a Unicode character class.
955	///
956	/// The lifetime `'a` refers to the lifetime of the underlying class.
957	#[derive(Debug)]
958	pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
959
960	impl<'a> Iterator for ClassUnicodeIter<'a> {
961	type Item = &'a ClassUnicodeRange;
962
963	fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
964	self.0.next()
965	}
966	}
967
968	/// A single range of characters represented by Unicode scalar values.
969	///
970	/// The range is closed. That is, the start and end of the range are included
971	/// in the range.
972	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
973	pub struct ClassUnicodeRange {
974	start: char,
975	end: char,
976	}
977
978	impl fmt::Debug for ClassUnicodeRange {
979	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
980	let start = if !self.start.is_whitespace() && !self.start.is_control()
981	{
982	self.start.to_string()
983	} else {
984	format!("0x{:X}", self.start as u32)
985	};
986	let end = if !self.end.is_whitespace() && !self.end.is_control() {
987	self.end.to_string()
988	} else {
989	format!("0x{:X}", self.end as u32)
990	};
991	f.debug_struct("ClassUnicodeRange")
992	.field("start", &start)
993	.field("end", &end)
994	.finish()
995	}
996	}
997
998	impl Interval for ClassUnicodeRange {
999	type Bound = char;
1000
1001	#[inline]
1002	fn lower(&self) -> char {
1003	self.start
1004	}
1005	#[inline]
1006	fn upper(&self) -> char {
1007	self.end
1008	}
1009	#[inline]
1010	fn set_lower(&mut self, bound: char) {
1011	self.start = bound;
1012	}
1013	#[inline]
1014	fn set_upper(&mut self, bound: char) {
1015	self.end = bound;
1016	}
1017
1018	/// Apply simple case folding to this Unicode scalar value range.
1019	///
1020	/// Additional ranges are appended to the given vector. Canonical ordering
1021	/// is not* maintained in the given vector.*
1022	fn case_fold_simple(
1023	&self,
1024	ranges: &mut Vec<ClassUnicodeRange>,
1025	) -> Result<(), unicode::CaseFoldError> {
1026	if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1027	return Ok(());
1028	}
1029	let start = self.start as u32;
1030	let end = (self.end as u32).saturating_add(`1`);
1031	let mut next_simple_cp = None;
1032	for cp in (start..end).filter_map(char::from_u32) {
1033	if next_simple_cp.map_or(`false`, \|next\| cp < next) {
1034	continue;
1035	}
1036	let it = match unicode::simple_fold(cp)? {
1037	Ok(it) => it,
1038	Err(next) => {
1039	next_simple_cp = next;
1040	continue;
1041	}
1042	};
1043	for cp_folded in it {
1044	ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1045	}
1046	}
1047	Ok(())
1048	}
1049	}
1050
1051	impl ClassUnicodeRange {
1052	/// Create a new Unicode scalar value range for a character class.
1053	///
1054	/// The returned range is always in a canonical form. That is, the range
1055	/// returned always satisfies the invariant that `start <= end`.
1056	pub fn new(start: char, end: char) -> ClassUnicodeRange {
1057	ClassUnicodeRange::create(start, end)
1058	}
1059
1060	/// Return the start of this range.
1061	///
1062	/// The start of a range is always less than or equal to the end of the
1063	/// range.
1064	pub fn start(&self) -> char {
1065	self.start
1066	}
1067
1068	/// Return the end of this range.
1069	///
1070	/// The end of a range is always greater than or equal to the start of the
1071	/// range.
1072	pub fn end(&self) -> char {
1073	self.end
1074	}
1075	}
1076
1077	/// A set of characters represented by arbitrary bytes (where one byte
1078	/// corresponds to one character).
1079	#[derive(Clone, Debug, Eq, PartialEq)]
1080	pub struct ClassBytes {
1081	set: IntervalSet<ClassBytesRange>,
1082	}
1083
1084	impl ClassBytes {
1085	/// Create a new class from a sequence of ranges.
1086	///
1087	/// The given ranges do not need to be in any specific order, and ranges
1088	/// may overlap.
1089	pub fn new<I>(ranges: I) -> ClassBytes
1090	where
1091	I: IntoIterator<Item = ClassBytesRange>,
1092	{
1093	ClassBytes { set: IntervalSet::new(ranges) }
1094	}
1095
1096	/// Create a new class with no ranges.
1097	pub fn empty() -> ClassBytes {
1098	ClassBytes::new(vec![])
1099	}
1100
1101	/// Add a new range to this set.
1102	pub fn push(&mut self, range: ClassBytesRange) {
1103	self.set.push(range);
1104	}
1105
1106	/// Return an iterator over all ranges in this class.
1107	///
1108	/// The iterator yields ranges in ascending order.
1109	pub fn iter(&self) -> ClassBytesIter<'_> {
1110	ClassBytesIter(self.set.iter())
1111	}
1112
1113	/// Return the underlying ranges as a slice.
1114	pub fn ranges(&self) -> &[ClassBytesRange] {
1115	self.set.intervals()
1116	}
1117
1118	/// Expand this character class such that it contains all case folded
1119	/// characters. For example, if this class consists of the range `a-z`,
1120	/// then applying case folding will result in the class containing both the
1121	/// ranges `a-z` and `A-Z`.
1122	///
1123	/// Note that this only applies ASCII case folding, which is limited to the
1124	/// characters `a-z` and `A-Z`.
1125	pub fn case_fold_simple(&mut self) {
1126	self.set.case_fold_simple().expect("ASCII case folding never fails");
1127	}
1128
1129	/// Negate this byte class.
1130	///
1131	/// For all `b` where `b` is a any byte, if `b` was in this set, then it
1132	/// will not be in this set after negation.
1133	pub fn negate(&mut self) {
1134	self.set.negate();
1135	}
1136
1137	/// Union this byte class with the given byte class, in place.
1138	pub fn union(&mut self, other: &ClassBytes) {
1139	self.set.union(&other.set);
1140	}
1141
1142	/// Intersect this byte class with the given byte class, in place.
1143	pub fn intersect(&mut self, other: &ClassBytes) {
1144	self.set.intersect(&other.set);
1145	}
1146
1147	/// Subtract the given byte class from this byte class, in place.
1148	pub fn difference(&mut self, other: &ClassBytes) {
1149	self.set.difference(&other.set);
1150	}
1151
1152	/// Compute the symmetric difference of the given byte classes, in place.
1153	///
1154	/// This computes the symmetric difference of two byte classes. This
1155	/// removes all elements in this class that are also in the given class,
1156	/// but all adds all elements from the given class that aren't in this
1157	/// class. That is, the class will contain all elements in either class,
1158	/// but will not contain any elements that are in both classes.
1159	pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1160	self.set.symmetric_difference(&other.set);
1161	}
1162
1163	/// Returns true if and only if this character class will either match
1164	/// nothing or only ASCII bytes. Stated differently, this returns false
1165	/// if and only if this class contains a non-ASCII byte.
1166	pub fn is_all_ascii(&self) -> bool {
1167	self.set.intervals().last().map_or(`true`, \|r\| r.end <= `0x7F`)
1168	}
1169	}
1170
1171	/// An iterator over all ranges in a byte character class.
1172	///
1173	/// The lifetime `'a` refers to the lifetime of the underlying class.
1174	#[derive(Debug)]
1175	pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1176
1177	impl<'a> Iterator for ClassBytesIter<'a> {
1178	type Item = &'a ClassBytesRange;
1179
1180	fn next(&mut self) -> Option<&'a ClassBytesRange> {
1181	self.0.next()
1182	}
1183	}
1184
1185	/// A single range of characters represented by arbitrary bytes.
1186	///
1187	/// The range is closed. That is, the start and end of the range are included
1188	/// in the range.
1189	#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1190	pub struct ClassBytesRange {
1191	start: u8,
1192	end: u8,
1193	}
1194
1195	impl Interval for ClassBytesRange {
1196	type Bound = u8;
1197
1198	#[inline]
1199	fn lower(&self) -> u8 {
1200	self.start
1201	}
1202	#[inline]
1203	fn upper(&self) -> u8 {
1204	self.end
1205	}
1206	#[inline]
1207	fn set_lower(&mut self, bound: u8) {
1208	self.start = bound;
1209	}
1210	#[inline]
1211	fn set_upper(&mut self, bound: u8) {
1212	self.end = bound;
1213	}
1214
1215	/// Apply simple case folding to this byte range. Only ASCII case mappings
1216	/// (for a-z) are applied.
1217	///
1218	/// Additional ranges are appended to the given vector. Canonical ordering
1219	/// is not* maintained in the given vector.*
1220	fn case_fold_simple(
1221	&self,
1222	ranges: &mut Vec<ClassBytesRange>,
1223	) -> Result<(), unicode::CaseFoldError> {
1224	if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1225	let lower = cmp::max(self.start, b'a');
1226	let upper = cmp::min(self.end, b'z');
1227	ranges.push(ClassBytesRange::new(lower - `32`, upper - `32`));
1228	}
1229	if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1230	let lower = cmp::max(self.start, b'A');
1231	let upper = cmp::min(self.end, b'Z');
1232	ranges.push(ClassBytesRange::new(lower + `32`, upper + `32`));
1233	}
1234	Ok(())
1235	}
1236	}
1237
1238	impl ClassBytesRange {
1239	/// Create a new byte range for a character class.
1240	///
1241	/// The returned range is always in a canonical form. That is, the range
1242	/// returned always satisfies the invariant that `start <= end`.
1243	pub fn new(start: u8, end: u8) -> ClassBytesRange {
1244	ClassBytesRange::create(start, end)
1245	}
1246
1247	/// Return the start of this range.
1248	///
1249	/// The start of a range is always less than or equal to the end of the
1250	/// range.
1251	pub fn start(&self) -> u8 {
1252	self.start
1253	}
1254
1255	/// Return the end of this range.
1256	///
1257	/// The end of a range is always greater than or equal to the start of the
1258	/// range.
1259	pub fn end(&self) -> u8 {
1260	self.end
1261	}
1262	}
1263
1264	impl fmt::Debug for ClassBytesRange {
1265	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1266	let mut debug = f.debug_struct("ClassBytesRange");
1267	if self.start <= `0x7F` {
1268	debug.field("start", &(self.start as char));
1269	} else {
1270	debug.field("start", &self.start);
1271	}
1272	if self.end <= `0x7F` {
1273	debug.field("end", &(self.end as char));
1274	} else {
1275	debug.field("end", &self.end);
1276	}
1277	debug.finish()
1278	}
1279	}
1280
1281	/// The high-level intermediate representation for an anchor assertion.
1282	///
1283	/// A matching anchor assertion is always zero-length.
1284	#[derive(Clone, Debug, Eq, PartialEq)]
1285	pub enum Anchor {
1286	/// Match the beginning of a line or the beginning of text. Specifically,
1287	/// this matches at the starting position of the input, or at the position
1288	/// immediately following a `\n` character.
1289	StartLine,
1290	/// Match the end of a line or the end of text. Specifically,
1291	/// this matches at the end position of the input, or at the position
1292	/// immediately preceding a `\n` character.
1293	EndLine,
1294	/// Match the beginning of text. Specifically, this matches at the starting
1295	/// position of the input.
1296	StartText,
1297	/// Match the end of text. Specifically, this matches at the ending
1298	/// position of the input.
1299	EndText,
1300	}
1301
1302	/// The high-level intermediate representation for a word-boundary assertion.
1303	///
1304	/// A matching word boundary assertion is always zero-length.
1305	#[derive(Clone, Debug, Eq, PartialEq)]
1306	pub enum WordBoundary {
1307	/// Match a Unicode-aware word boundary. That is, this matches a position
1308	/// where the left adjacent character and right adjacent character
1309	/// correspond to a word and non-word or a non-word and word character.
1310	Unicode,
1311	/// Match a Unicode-aware negation of a word boundary.
1312	UnicodeNegate,
1313	/// Match an ASCII-only word boundary. That is, this matches a position
1314	/// where the left adjacent character and right adjacent character
1315	/// correspond to a word and non-word or a non-word and word character.
1316	Ascii,
1317	/// Match an ASCII-only negation of a word boundary.
1318	AsciiNegate,
1319	}
1320
1321	impl WordBoundary {
1322	/// Returns true if and only if this word boundary assertion is negated.
1323	pub fn is_negated(&self) -> bool {
1324	match *self {
1325	WordBoundary::Unicode \| WordBoundary::Ascii => `false`,
1326	WordBoundary::UnicodeNegate \| WordBoundary::AsciiNegate => `true`,
1327	}
1328	}
1329	}
1330
1331	/// The high-level intermediate representation for a group.
1332	///
1333	/// This represents one of three possible group types:
1334	///
1335	/// 1. A non-capturing group (e.g., `(?:expr)`).
1336	/// 2. A capturing group (e.g., `(expr)`).
1337	/// 3. A named capturing group (e.g., `(?P<name>expr)`).
1338	#[derive(Clone, Debug, Eq, PartialEq)]
1339	pub struct Group {
1340	/// The kind of this group. If it is a capturing group, then the kind
1341	/// contains the capture group index (and the name, if it is a named
1342	/// group).
1343	pub kind: GroupKind,
1344	/// The expression inside the capturing group, which may be empty.
1345	pub hir: Box<Hir>,
1346	}
1347
1348	/// The kind of group.
1349	#[derive(Clone, Debug, Eq, PartialEq)]
1350	pub enum GroupKind {
1351	/// A normal unnamed capturing group.
1352	///
1353	/// The value is the capture index of the group.
1354	CaptureIndex(u32),
1355	/// A named capturing group.
1356	CaptureName {
1357	/// The name of the group.
1358	name: String,
1359	/// The capture index of the group.
1360	index: u32,
1361	},
1362	/// A non-capturing group.
1363	NonCapturing,
1364	}
1365
1366	/// The high-level intermediate representation of a repetition operator.
1367	///
1368	/// A repetition operator permits the repetition of an arbitrary
1369	/// sub-expression.
1370	#[derive(Clone, Debug, Eq, PartialEq)]
1371	pub struct Repetition {
1372	/// The kind of this repetition operator.
1373	pub kind: RepetitionKind,
1374	/// Whether this repetition operator is greedy or not. A greedy operator
1375	/// will match as much as it can. A non-greedy operator will match as
1376	/// little as it can.
1377	///
1378	/// Typically, operators are greedy by default and are only non-greedy when
1379	/// a `?` suffix is used, e.g., `(expr)` is greedy while `(expr)?` is
1380	/// not. However, this can be inverted via the `U` "ungreedy" flag.
1381	pub greedy: bool,
1382	/// The expression being repeated.
1383	pub hir: Box<Hir>,
1384	}
1385
1386	impl Repetition {
1387	/// Returns true if and only if this repetition operator makes it possible
1388	/// to match the empty string.
1389	///
1390	/// Note that this is not defined inductively. For example, while `a`*
1391	/// will report `true`, `()+` will not, even though `()` matches the empty
1392	/// string and one or more occurrences of something that matches the empty
1393	/// string will always match the empty string. In order to get the
1394	/// inductive definition, see the corresponding method on
1395	/// [`Hir`](struct.Hir.html).
1396	pub fn is_match_empty(&self) -> bool {
1397	match self.kind {
1398	RepetitionKind::ZeroOrOne => `true`,
1399	RepetitionKind::ZeroOrMore => `true`,
1400	RepetitionKind::OneOrMore => `false`,
1401	RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == `0`,
1402	RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == `0`,
1403	RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == `0`,
1404	}
1405	}
1406	}
1407
1408	/// The kind of a repetition operator.
1409	#[derive(Clone, Debug, Eq, PartialEq)]
1410	pub enum RepetitionKind {
1411	/// Matches a sub-expression zero or one times.
1412	ZeroOrOne,
1413	/// Matches a sub-expression zero or more times.
1414	ZeroOrMore,
1415	/// Matches a sub-expression one or more times.
1416	OneOrMore,
1417	/// Matches a sub-expression within a bounded range of times.
1418	Range(RepetitionRange),
1419	}
1420
1421	/// The kind of a counted repetition operator.
1422	#[derive(Clone, Debug, Eq, PartialEq)]
1423	pub enum RepetitionRange {
1424	/// Matches a sub-expression exactly this many times.
1425	Exactly(u32),
1426	/// Matches a sub-expression at least this many times.
1427	AtLeast(u32),
1428	/// Matches a sub-expression at least `m` times and at most `n` times.
1429	Bounded(u32, u32),
1430	}
1431
1432	/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1433	/// space but heap space proportional to the depth of the total `Hir`.
1434	impl Drop for Hir {
1435	fn drop(&mut self) {
1436	use std::mem;
1437
1438	match *self.kind() {
1439	HirKind::Empty
1440	\| HirKind::Literal(_)
1441	\| HirKind::Class(_)
1442	\| HirKind::Anchor(_)
1443	\| HirKind::WordBoundary(_) => return,
1444	HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1445	HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1446	HirKind::Concat(ref x) if x.is_empty() => return,
1447	HirKind::Alternation(ref x) if x.is_empty() => return,
1448	_ => {}
1449	}
1450
1451	let mut stack = vec![mem::replace(self, Hir::empty())];
1452	while let Some(mut expr) = stack.pop() {
1453	match expr.kind {
1454	HirKind::Empty
1455	\| HirKind::Literal(_)
1456	\| HirKind::Class(_)
1457	\| HirKind::Anchor(_)
1458	\| HirKind::WordBoundary(_) => {}
1459	HirKind::Group(ref mut x) => {
1460	stack.push(mem::replace(&mut x.hir, Hir::empty()));
1461	}
1462	HirKind::Repetition(ref mut x) => {
1463	stack.push(mem::replace(&mut x.hir, Hir::empty()));
1464	}
1465	HirKind::Concat(ref mut x) => {
1466	stack.extend(x.drain(..));
1467	}
1468	HirKind::Alternation(ref mut x) => {
1469	stack.extend(x.drain(..));
1470	}
1471	}
1472	}
1473	}
1474	}
1475
1476	/// A type that documents various attributes of an HIR expression.
1477	///
1478	/// These attributes are typically defined inductively on the HIR.
1479	#[derive(Clone, Debug, Eq, PartialEq)]
1480	struct HirInfo {
1481	/// Represent yes/no questions by a bitfield to conserve space, since
1482	/// this is included in every HIR expression.
1483	///
1484	/// If more attributes need to be added, it is OK to increase the size of
1485	/// this as appropriate.
1486	bools: u16,
1487	}
1488
1489	// A simple macro for defining bitfield accessors/mutators.
1490	macro_rules! define_bool {
1491	($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1492	fn $is_fn_name(&self) -> bool {
1493	self.bools & (`0b1` << $bit) > `0`
1494	}
1495
1496	fn $set_fn_name(&mut self, yes: bool) {
1497	if yes {
1498	self.bools \|= `1` << $bit;
1499	} else {
1500	self.bools &= !(`1` << $bit);
1501	}
1502	}
1503	};
1504	}
1505
1506	impl HirInfo {
1507	fn new() -> HirInfo {
1508	HirInfo { bools: `0` }
1509	}
1510
1511	define_bool!(`0`, is_always_utf8, set_always_utf8);
1512	define_bool!(`1`, is_all_assertions, set_all_assertions);
1513	define_bool!(`2`, is_anchored_start, set_anchored_start);
1514	define_bool!(`3`, is_anchored_end, set_anchored_end);
1515	define_bool!(`4`, is_line_anchored_start, set_line_anchored_start);
1516	define_bool!(`5`, is_line_anchored_end, set_line_anchored_end);
1517	define_bool!(`6`, is_any_anchored_start, set_any_anchored_start);
1518	define_bool!(`7`, is_any_anchored_end, set_any_anchored_end);
1519	define_bool!(`8`, is_match_empty, set_match_empty);
1520	define_bool!(`9`, is_literal, set_literal);
1521	define_bool!(`10`, is_alternation_literal, set_alternation_literal);
1522	}
1523
1524	#[cfg(test)]
1525	mod tests {
1526	use super::*;
1527
1528	fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1529	let ranges: Vec<ClassUnicodeRange> = ranges
1530	.iter()
1531	.map(\|&(s, e)\| ClassUnicodeRange::new(s, e))
1532	.collect();
1533	ClassUnicode::new(ranges)
1534	}
1535
1536	fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1537	let ranges: Vec<ClassBytesRange> =
1538	ranges.iter().map(\|&(s, e)\| ClassBytesRange::new(s, e)).collect();
1539	ClassBytes::new(ranges)
1540	}
1541
1542	fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1543	cls.iter().map(\|x\| (x.start(), x.end())).collect()
1544	}
1545
1546	#[cfg(feature = "unicode-case")]
1547	fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1548	let mut cls_ = cls.clone();
1549	cls_.case_fold_simple();
1550	cls_
1551	}
1552
1553	fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1554	let mut cls_ = cls1.clone();
1555	cls_.union(cls2);
1556	cls_
1557	}
1558
1559	fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1560	let mut cls_ = cls1.clone();
1561	cls_.intersect(cls2);
1562	cls_
1563	}
1564
1565	fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1566	let mut cls_ = cls1.clone();
1567	cls_.difference(cls2);
1568	cls_
1569	}
1570
1571	fn usymdifference(
1572	cls1: &ClassUnicode,
1573	cls2: &ClassUnicode,
1574	) -> ClassUnicode {
1575	let mut cls_ = cls1.clone();
1576	cls_.symmetric_difference(cls2);
1577	cls_
1578	}
1579
1580	fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1581	let mut cls_ = cls.clone();
1582	cls_.negate();
1583	cls_
1584	}
1585
1586	fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1587	cls.iter().map(\|x\| (x.start(), x.end())).collect()
1588	}
1589
1590	fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1591	let mut cls_ = cls.clone();
1592	cls_.case_fold_simple();
1593	cls_
1594	}
1595
1596	fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1597	let mut cls_ = cls1.clone();
1598	cls_.union(cls2);
1599	cls_
1600	}
1601
1602	fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1603	let mut cls_ = cls1.clone();
1604	cls_.intersect(cls2);
1605	cls_
1606	}
1607
1608	fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1609	let mut cls_ = cls1.clone();
1610	cls_.difference(cls2);
1611	cls_
1612	}
1613
1614	fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1615	let mut cls_ = cls1.clone();
1616	cls_.symmetric_difference(cls2);
1617	cls_
1618	}
1619
1620	fn bnegate(cls: &ClassBytes) -> ClassBytes {
1621	let mut cls_ = cls.clone();
1622	cls_.negate();
1623	cls_
1624	}
1625
1626	#[test]
1627	fn class_range_canonical_unicode() {
1628	let range = ClassUnicodeRange::new('`\u{00FF}`', '`\0`');
1629	assert_eq!('`\0`', range.start());
1630	assert_eq!('`\u{00FF}`', range.end());
1631	}
1632
1633	#[test]
1634	fn class_range_canonical_bytes() {
1635	let range = ClassBytesRange::new(b'`\xFF`', b'`\0`');
1636	assert_eq!(b'`\0`', range.start());
1637	assert_eq!(b'`\xFF`', range.end());
1638	}
1639
1640	#[test]
1641	fn class_canonicalize_unicode() {
1642	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1643	let expected = vec![('a', 'c'), ('x', 'z')];
1644	assert_eq!(expected, uranges(&cls));
1645
1646	let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1647	let expected = vec![('a', 'c'), ('x', 'z')];
1648	assert_eq!(expected, uranges(&cls));
1649
1650	let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1651	let expected = vec![('w', 'z')];
1652	assert_eq!(expected, uranges(&cls));
1653
1654	let cls = uclass(&[
1655	('c', 'f'),
1656	('a', 'g'),
1657	('d', 'j'),
1658	('a', 'c'),
1659	('m', 'p'),
1660	('l', 's'),
1661	]);
1662	let expected = vec![('a', 'j'), ('l', 's')];
1663	assert_eq!(expected, uranges(&cls));
1664
1665	let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1666	let expected = vec![('u', 'z')];
1667	assert_eq!(expected, uranges(&cls));
1668
1669	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`'), ('`\x00`', '`\u{10FFFF}`')]);
1670	let expected = vec![('`\x00`', '`\u{10FFFF}`')];
1671	assert_eq!(expected, uranges(&cls));
1672
1673	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1674	let expected = vec![('a', 'b')];
1675	assert_eq!(expected, uranges(&cls));
1676	}
1677
1678	#[test]
1679	fn class_canonicalize_bytes() {
1680	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1681	let expected = vec![(b'a', b'c'), (b'x', b'z')];
1682	assert_eq!(expected, branges(&cls));
1683
1684	let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1685	let expected = vec![(b'a', b'c'), (b'x', b'z')];
1686	assert_eq!(expected, branges(&cls));
1687
1688	let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1689	let expected = vec![(b'w', b'z')];
1690	assert_eq!(expected, branges(&cls));
1691
1692	let cls = bclass(&[
1693	(b'c', b'f'),
1694	(b'a', b'g'),
1695	(b'd', b'j'),
1696	(b'a', b'c'),
1697	(b'm', b'p'),
1698	(b'l', b's'),
1699	]);
1700	let expected = vec![(b'a', b'j'), (b'l', b's')];
1701	assert_eq!(expected, branges(&cls));
1702
1703	let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1704	let expected = vec![(b'u', b'z')];
1705	assert_eq!(expected, branges(&cls));
1706
1707	let cls = bclass(&[(b'`\x00`', b'`\xFF`'), (b'`\x00`', b'`\xFF`')]);
1708	let expected = vec![(b'`\x00`', b'`\xFF`')];
1709	assert_eq!(expected, branges(&cls));
1710
1711	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1712	let expected = vec![(b'a', b'b')];
1713	assert_eq!(expected, branges(&cls));
1714	}
1715
1716	#[test]
1717	#[cfg(feature = "unicode-case")]
1718	fn class_case_fold_unicode() {
1719	let cls = uclass(&[
1720	('C', 'F'),
1721	('A', 'G'),
1722	('D', 'J'),
1723	('A', 'C'),
1724	('M', 'P'),
1725	('L', 'S'),
1726	('c', 'f'),
1727	]);
1728	let expected = uclass(&[
1729	('A', 'J'),
1730	('L', 'S'),
1731	('a', 'j'),
1732	('l', 's'),
1733	('`\u{17F}`', '`\u{17F}`'),
1734	]);
1735	assert_eq!(expected, ucasefold(&cls));
1736
1737	let cls = uclass(&[('A', 'Z')]);
1738	let expected = uclass(&[
1739	('A', 'Z'),
1740	('a', 'z'),
1741	('`\u{17F}`', '`\u{17F}`'),
1742	('`\u{212A}`', '`\u{212A}`'),
1743	]);
1744	assert_eq!(expected, ucasefold(&cls));
1745
1746	let cls = uclass(&[('a', 'z')]);
1747	let expected = uclass(&[
1748	('A', 'Z'),
1749	('a', 'z'),
1750	('`\u{17F}`', '`\u{17F}`'),
1751	('`\u{212A}`', '`\u{212A}`'),
1752	]);
1753	assert_eq!(expected, ucasefold(&cls));
1754
1755	let cls = uclass(&[('A', 'A'), ('_', '_')]);
1756	let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1757	assert_eq!(expected, ucasefold(&cls));
1758
1759	let cls = uclass(&[('A', 'A'), ('=', '=')]);
1760	let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1761	assert_eq!(expected, ucasefold(&cls));
1762
1763	let cls = uclass(&[('`\x00`', '`\x10`')]);
1764	assert_eq!(cls, ucasefold(&cls));
1765
1766	let cls = uclass(&[('k', 'k')]);
1767	let expected =
1768	uclass(&[('K', 'K'), ('k', 'k'), ('`\u{212A}`', '`\u{212A}`')]);
1769	assert_eq!(expected, ucasefold(&cls));
1770
1771	let cls = uclass(&[('@', '@')]);
1772	assert_eq!(cls, ucasefold(&cls));
1773	}
1774
1775	#[test]
1776	#[cfg(not(feature = "unicode-case"))]
1777	fn class_case_fold_unicode_disabled() {
1778	let mut cls = uclass(&[
1779	('C', 'F'),
1780	('A', 'G'),
1781	('D', 'J'),
1782	('A', 'C'),
1783	('M', 'P'),
1784	('L', 'S'),
1785	('c', 'f'),
1786	]);
1787	assert!(cls.try_case_fold_simple().is_err());
1788	}
1789
1790	#[test]
1791	#[should_panic]
1792	#[cfg(not(feature = "unicode-case"))]
1793	fn class_case_fold_unicode_disabled_panics() {
1794	let mut cls = uclass(&[
1795	('C', 'F'),
1796	('A', 'G'),
1797	('D', 'J'),
1798	('A', 'C'),
1799	('M', 'P'),
1800	('L', 'S'),
1801	('c', 'f'),
1802	]);
1803	cls.case_fold_simple();
1804	}
1805
1806	#[test]
1807	fn class_case_fold_bytes() {
1808	let cls = bclass(&[
1809	(b'C', b'F'),
1810	(b'A', b'G'),
1811	(b'D', b'J'),
1812	(b'A', b'C'),
1813	(b'M', b'P'),
1814	(b'L', b'S'),
1815	(b'c', b'f'),
1816	]);
1817	let expected =
1818	bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1819	assert_eq!(expected, bcasefold(&cls));
1820
1821	let cls = bclass(&[(b'A', b'Z')]);
1822	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1823	assert_eq!(expected, bcasefold(&cls));
1824
1825	let cls = bclass(&[(b'a', b'z')]);
1826	let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1827	assert_eq!(expected, bcasefold(&cls));
1828
1829	let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1830	let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1831	assert_eq!(expected, bcasefold(&cls));
1832
1833	let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1834	let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1835	assert_eq!(expected, bcasefold(&cls));
1836
1837	let cls = bclass(&[(b'`\x00`', b'`\x10`')]);
1838	assert_eq!(cls, bcasefold(&cls));
1839
1840	let cls = bclass(&[(b'k', b'k')]);
1841	let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1842	assert_eq!(expected, bcasefold(&cls));
1843
1844	let cls = bclass(&[(b'@', b'@')]);
1845	assert_eq!(cls, bcasefold(&cls));
1846	}
1847
1848	#[test]
1849	fn class_negate_unicode() {
1850	let cls = uclass(&[('a', 'a')]);
1851	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x62`', '`\u{10FFFF}`')]);
1852	assert_eq!(expected, unegate(&cls));
1853
1854	let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1855	let expected = uclass(&[('`\x00`', '`\x60`'), ('`\x63`', '`\u{10FFFF}`')]);
1856	assert_eq!(expected, unegate(&cls));
1857
1858	let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1859	let expected = uclass(&[
1860	('`\x00`', '`\x60`'),
1861	('`\x64`', '`\x77`'),
1862	('`\x7B`', '`\u{10FFFF}`'),
1863	]);
1864	assert_eq!(expected, unegate(&cls));
1865
1866	let cls = uclass(&[('`\x00`', 'a')]);
1867	let expected = uclass(&[('`\x62`', '`\u{10FFFF}`')]);
1868	assert_eq!(expected, unegate(&cls));
1869
1870	let cls = uclass(&[('a', '`\u{10FFFF}`')]);
1871	let expected = uclass(&[('`\x00`', '`\x60`')]);
1872	assert_eq!(expected, unegate(&cls));
1873
1874	let cls = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
1875	let expected = uclass(&[]);
1876	assert_eq!(expected, unegate(&cls));
1877
1878	let cls = uclass(&[]);
1879	let expected = uclass(&[('`\x00`', '`\u{10FFFF}`')]);
1880	assert_eq!(expected, unegate(&cls));
1881
1882	let cls =
1883	uclass(&[('`\x00`', '`\u{10FFFD}`'), ('`\u{10FFFF}`', '`\u{10FFFF}`')]);
1884	let expected = uclass(&[('`\u{10FFFE}`', '`\u{10FFFE}`')]);
1885	assert_eq!(expected, unegate(&cls));
1886
1887	let cls = uclass(&[('`\x00`', '`\u{D7FF}`')]);
1888	let expected = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
1889	assert_eq!(expected, unegate(&cls));
1890
1891	let cls = uclass(&[('`\x00`', '`\u{D7FE}`')]);
1892	let expected = uclass(&[('`\u{D7FF}`', '`\u{10FFFF}`')]);
1893	assert_eq!(expected, unegate(&cls));
1894
1895	let cls = uclass(&[('`\u{E000}`', '`\u{10FFFF}`')]);
1896	let expected = uclass(&[('`\x00`', '`\u{D7FF}`')]);
1897	assert_eq!(expected, unegate(&cls));
1898
1899	let cls = uclass(&[('`\u{E001}`', '`\u{10FFFF}`')]);
1900	let expected = uclass(&[('`\x00`', '`\u{E000}`')]);
1901	assert_eq!(expected, unegate(&cls));
1902	}
1903
1904	#[test]
1905	fn class_negate_bytes() {
1906	let cls = bclass(&[(b'a', b'a')]);
1907	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x62`', b'`\xFF`')]);
1908	assert_eq!(expected, bnegate(&cls));
1909
1910	let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1911	let expected = bclass(&[(b'`\x00`', b'`\x60`'), (b'`\x63`', b'`\xFF`')]);
1912	assert_eq!(expected, bnegate(&cls));
1913
1914	let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1915	let expected = bclass(&[
1916	(b'`\x00`', b'`\x60`'),
1917	(b'`\x64`', b'`\x77`'),
1918	(b'`\x7B`', b'`\xFF`'),
1919	]);
1920	assert_eq!(expected, bnegate(&cls));
1921
1922	let cls = bclass(&[(b'`\x00`', b'a')]);
1923	let expected = bclass(&[(b'`\x62`', b'`\xFF`')]);
1924	assert_eq!(expected, bnegate(&cls));
1925
1926	let cls = bclass(&[(b'a', b'`\xFF`')]);
1927	let expected = bclass(&[(b'`\x00`', b'`\x60`')]);
1928	assert_eq!(expected, bnegate(&cls));
1929
1930	let cls = bclass(&[(b'`\x00`', b'`\xFF`')]);
1931	let expected = bclass(&[]);
1932	assert_eq!(expected, bnegate(&cls));
1933
1934	let cls = bclass(&[]);
1935	let expected = bclass(&[(b'`\x00`', b'`\xFF`')]);
1936	assert_eq!(expected, bnegate(&cls));
1937
1938	let cls = bclass(&[(b'`\x00`', b'`\xFD`'), (b'`\xFF`', b'`\xFF`')]);
1939	let expected = bclass(&[(b'`\xFE`', b'`\xFE`')]);
1940	assert_eq!(expected, bnegate(&cls));
1941	}
1942
1943	#[test]
1944	fn class_union_unicode() {
1945	let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1946	let cls2 = uclass(&[('a', 'z')]);
1947	let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1948	assert_eq!(expected, uunion(&cls1, &cls2));
1949	}
1950
1951	#[test]
1952	fn class_union_bytes() {
1953	let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1954	let cls2 = bclass(&[(b'a', b'z')]);
1955	let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1956	assert_eq!(expected, bunion(&cls1, &cls2));
1957	}
1958
1959	#[test]
1960	fn class_intersect_unicode() {
1961	let cls1 = uclass(&[]);
1962	let cls2 = uclass(&[('a', 'a')]);
1963	let expected = uclass(&[]);
1964	assert_eq!(expected, uintersect(&cls1, &cls2));
1965
1966	let cls1 = uclass(&[('a', 'a')]);
1967	let cls2 = uclass(&[('a', 'a')]);
1968	let expected = uclass(&[('a', 'a')]);
1969	assert_eq!(expected, uintersect(&cls1, &cls2));
1970
1971	let cls1 = uclass(&[('a', 'a')]);
1972	let cls2 = uclass(&[('b', 'b')]);
1973	let expected = uclass(&[]);
1974	assert_eq!(expected, uintersect(&cls1, &cls2));
1975
1976	let cls1 = uclass(&[('a', 'a')]);
1977	let cls2 = uclass(&[('a', 'c')]);
1978	let expected = uclass(&[('a', 'a')]);
1979	assert_eq!(expected, uintersect(&cls1, &cls2));
1980
1981	let cls1 = uclass(&[('a', 'b')]);
1982	let cls2 = uclass(&[('a', 'c')]);
1983	let expected = uclass(&[('a', 'b')]);
1984	assert_eq!(expected, uintersect(&cls1, &cls2));
1985
1986	let cls1 = uclass(&[('a', 'b')]);
1987	let cls2 = uclass(&[('b', 'c')]);
1988	let expected = uclass(&[('b', 'b')]);
1989	assert_eq!(expected, uintersect(&cls1, &cls2));
1990
1991	let cls1 = uclass(&[('a', 'b')]);
1992	let cls2 = uclass(&[('c', 'd')]);
1993	let expected = uclass(&[]);
1994	assert_eq!(expected, uintersect(&cls1, &cls2));
1995
1996	let cls1 = uclass(&[('b', 'c')]);
1997	let cls2 = uclass(&[('a', 'd')]);
1998	let expected = uclass(&[('b', 'c')]);
1999	assert_eq!(expected, uintersect(&cls1, &cls2));
2000
2001	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2002	let cls2 = uclass(&[('a', 'h')]);
2003	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2004	assert_eq!(expected, uintersect(&cls1, &cls2));
2005
2006	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2007	let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2008	let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2009	assert_eq!(expected, uintersect(&cls1, &cls2));
2010
2011	let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
2012	let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
2013	let expected = uclass(&[]);
2014	assert_eq!(expected, uintersect(&cls1, &cls2));
2015
2016	let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2017	let cls2 = uclass(&[('h', 'h')]);
2018	let expected = uclass(&[('h', 'h')]);
2019	assert_eq!(expected, uintersect(&cls1, &cls2));
2020
2021	let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2022	let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2023	let expected = uclass(&[]);
2024	assert_eq!(expected, uintersect(&cls1, &cls2));
2025
2026	let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2027	let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2028	let expected = uclass(&[('b', 'f')]);
2029	assert_eq!(expected, uintersect(&cls1, &cls2));
2030	}
2031
2032	#[test]
2033	fn class_intersect_bytes() {
2034	let cls1 = bclass(&[]);
2035	let cls2 = bclass(&[(b'a', b'a')]);
2036	let expected = bclass(&[]);
2037	assert_eq!(expected, bintersect(&cls1, &cls2));
2038
2039	let cls1 = bclass(&[(b'a', b'a')]);
2040	let cls2 = bclass(&[(b'a', b'a')]);
2041	let expected = bclass(&[(b'a', b'a')]);
2042	assert_eq!(expected, bintersect(&cls1, &cls2));
2043
2044	let cls1 = bclass(&[(b'a', b'a')]);
2045	let cls2 = bclass(&[(b'b', b'b')]);
2046	let expected = bclass(&[]);
2047	assert_eq!(expected, bintersect(&cls1, &cls2));
2048
2049	let cls1 = bclass(&[(b'a', b'a')]);
2050	let cls2 = bclass(&[(b'a', b'c')]);
2051	let expected = bclass(&[(b'a', b'a')]);
2052	assert_eq!(expected, bintersect(&cls1, &cls2));
2053
2054	let cls1 = bclass(&[(b'a', b'b')]);
2055	let cls2 = bclass(&[(b'a', b'c')]);
2056	let expected = bclass(&[(b'a', b'b')]);
2057	assert_eq!(expected, bintersect(&cls1, &cls2));
2058
2059	let cls1 = bclass(&[(b'a', b'b')]);
2060	let cls2 = bclass(&[(b'b', b'c')]);
2061	let expected = bclass(&[(b'b', b'b')]);
2062	assert_eq!(expected, bintersect(&cls1, &cls2));
2063
2064	let cls1 = bclass(&[(b'a', b'b')]);
2065	let cls2 = bclass(&[(b'c', b'd')]);
2066	let expected = bclass(&[]);
2067	assert_eq!(expected, bintersect(&cls1, &cls2));
2068
2069	let cls1 = bclass(&[(b'b', b'c')]);
2070	let cls2 = bclass(&[(b'a', b'd')]);
2071	let expected = bclass(&[(b'b', b'c')]);
2072	assert_eq!(expected, bintersect(&cls1, &cls2));
2073
2074	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2075	let cls2 = bclass(&[(b'a', b'h')]);
2076	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2077	assert_eq!(expected, bintersect(&cls1, &cls2));
2078
2079	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2080	let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2081	let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2082	assert_eq!(expected, bintersect(&cls1, &cls2));
2083
2084	let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
2085	let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
2086	let expected = bclass(&[]);
2087	assert_eq!(expected, bintersect(&cls1, &cls2));
2088
2089	let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2090	let cls2 = bclass(&[(b'h', b'h')]);
2091	let expected = bclass(&[(b'h', b'h')]);
2092	assert_eq!(expected, bintersect(&cls1, &cls2));
2093
2094	let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
2095	let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
2096	let expected = bclass(&[]);
2097	assert_eq!(expected, bintersect(&cls1, &cls2));
2098
2099	let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
2100	let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
2101	let expected = bclass(&[(b'b', b'f')]);
2102	assert_eq!(expected, bintersect(&cls1, &cls2));
2103	}
2104
2105	#[test]
2106	fn class_difference_unicode() {
2107	let cls1 = uclass(&[('a', 'a')]);
2108	let cls2 = uclass(&[('a', 'a')]);
2109	let expected = uclass(&[]);
2110	assert_eq!(expected, udifference(&cls1, &cls2));
2111
2112	let cls1 = uclass(&[('a', 'a')]);
2113	let cls2 = uclass(&[]);
2114	let expected = uclass(&[('a', 'a')]);
2115	assert_eq!(expected, udifference(&cls1, &cls2));
2116
2117	let cls1 = uclass(&[]);
2118	let cls2 = uclass(&[('a', 'a')]);
2119	let expected = uclass(&[]);
2120	assert_eq!(expected, udifference(&cls1, &cls2));
2121
2122	let cls1 = uclass(&[('a', 'z')]);
2123	let cls2 = uclass(&[('a', 'a')]);
2124	let expected = uclass(&[('b', 'z')]);
2125	assert_eq!(expected, udifference(&cls1, &cls2));
2126
2127	let cls1 = uclass(&[('a', 'z')]);
2128	let cls2 = uclass(&[('z', 'z')]);
2129	let expected = uclass(&[('a', 'y')]);
2130	assert_eq!(expected, udifference(&cls1, &cls2));
2131
2132	let cls1 = uclass(&[('a', 'z')]);
2133	let cls2 = uclass(&[('m', 'm')]);
2134	let expected = uclass(&[('a', 'l'), ('n', 'z')]);
2135	assert_eq!(expected, udifference(&cls1, &cls2));
2136
2137	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2138	let cls2 = uclass(&[('a', 'z')]);
2139	let expected = uclass(&[]);
2140	assert_eq!(expected, udifference(&cls1, &cls2));
2141
2142	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2143	let cls2 = uclass(&[('d', 'v')]);
2144	let expected = uclass(&[('a', 'c')]);
2145	assert_eq!(expected, udifference(&cls1, &cls2));
2146
2147	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2148	let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
2149	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2150	assert_eq!(expected, udifference(&cls1, &cls2));
2151
2152	let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2153	let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
2154	let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2155	assert_eq!(expected, udifference(&cls1, &cls2));
2156
2157	let cls1 = uclass(&[('x', 'z')]);
2158	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2159	let expected = uclass(&[('x', 'z')]);
2160	assert_eq!(expected, udifference(&cls1, &cls2));
2161
2162	let cls1 = uclass(&[('a', 'z')]);
2163	let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2164	let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
2165	assert_eq!(expected, udifference(&cls1, &cls2));
2166	}
2167
2168	#[test]
2169	fn class_difference_bytes() {
2170	let cls1 = bclass(&[(b'a', b'a')]);
2171	let cls2 = bclass(&[(b'a', b'a')]);
2172	let expected = bclass(&[]);
2173	assert_eq!(expected, bdifference(&cls1, &cls2));
2174
2175	let cls1 = bclass(&[(b'a', b'a')]);
2176	let cls2 = bclass(&[]);
2177	let expected = bclass(&[(b'a', b'a')]);
2178	assert_eq!(expected, bdifference(&cls1, &cls2));
2179
2180	let cls1 = bclass(&[]);
2181	let cls2 = bclass(&[(b'a', b'a')]);
2182	let expected = bclass(&[]);
2183	assert_eq!(expected, bdifference(&cls1, &cls2));
2184
2185	let cls1 = bclass(&[(b'a', b'z')]);
2186	let cls2 = bclass(&[(b'a', b'a')]);
2187	let expected = bclass(&[(b'b', b'z')]);
2188	assert_eq!(expected, bdifference(&cls1, &cls2));
2189
2190	let cls1 = bclass(&[(b'a', b'z')]);
2191	let cls2 = bclass(&[(b'z', b'z')]);
2192	let expected = bclass(&[(b'a', b'y')]);
2193	assert_eq!(expected, bdifference(&cls1, &cls2));
2194
2195	let cls1 = bclass(&[(b'a', b'z')]);
2196	let cls2 = bclass(&[(b'm', b'm')]);
2197	let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
2198	assert_eq!(expected, bdifference(&cls1, &cls2));
2199
2200	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2201	let cls2 = bclass(&[(b'a', b'z')]);
2202	let expected = bclass(&[]);
2203	assert_eq!(expected, bdifference(&cls1, &cls2));
2204
2205	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2206	let cls2 = bclass(&[(b'd', b'v')]);
2207	let expected = bclass(&[(b'a', b'c')]);
2208	assert_eq!(expected, bdifference(&cls1, &cls2));
2209
2210	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2211	let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
2212	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2213	assert_eq!(expected, bdifference(&cls1, &cls2));
2214
2215	let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2216	let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
2217	let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2218	assert_eq!(expected, bdifference(&cls1, &cls2));
2219
2220	let cls1 = bclass(&[(b'x', b'z')]);
2221	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2222	let expected = bclass(&[(b'x', b'z')]);
2223	assert_eq!(expected, bdifference(&cls1, &cls2));
2224
2225	let cls1 = bclass(&[(b'a', b'z')]);
2226	let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2227	let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
2228	assert_eq!(expected, bdifference(&cls1, &cls2));
2229	}
2230
2231	#[test]
2232	fn class_symmetric_difference_unicode() {
2233	let cls1 = uclass(&[('a', 'm')]);
2234	let cls2 = uclass(&[('g', 't')]);
2235	let expected = uclass(&[('a', 'f'), ('n', 't')]);
2236	assert_eq!(expected, usymdifference(&cls1, &cls2));
2237	}
2238
2239	#[test]
2240	fn class_symmetric_difference_bytes() {
2241	let cls1 = bclass(&[(b'a', b'm')]);
2242	let cls2 = bclass(&[(b'g', b't')]);
2243	let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2244	assert_eq!(expected, bsymdifference(&cls1, &cls2));
2245	}
2246
2247	#[test]
2248	#[should_panic]
2249	fn hir_byte_literal_non_ascii() {
2250	Hir::literal(Literal::Byte(b'a'));
2251	}
2252
2253	// We use a thread with an explicit stack size to test that our destructor
2254	// for Hir can handle arbitrarily sized expressions in constant stack
2255	// space. In case we run on a platform without threads (WASM?), we limit
2256	// this test to Windows/Unix.
2257	#[test]
2258	#[cfg(any(unix, windows))]
2259	fn no_stack_overflow_on_drop() {
2260	use std::thread;
2261
2262	let run = \|\| {
2263	let mut expr = Hir::empty();
2264	for _ in `0`..`100` {
2265	expr = Hir::group(Group {
2266	kind: GroupKind::NonCapturing,
2267	hir: Box::new(expr),
2268	});
2269	expr = Hir::repetition(Repetition {
2270	kind: RepetitionKind::ZeroOrOne,
2271	greedy: `true`,
2272	hir: Box::new(expr),
2273	});
2274
2275	expr = Hir {
2276	kind: HirKind::Concat(vec![expr]),
2277	info: HirInfo::new(),
2278	};
2279	expr = Hir {
2280	kind: HirKind::Alternation(vec![expr]),
2281	info: HirInfo::new(),
2282	};
2283	}
2284	assert!(!expr.kind.is_empty());
2285	};
2286
2287	// We run our test on a thread with a small stack size so we can
2288	// force the issue more easily.
2289	//
2290	// NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
2291	// for context on the specific stack size chosen here.
2292	thread::Builder::new()
2293	.stack_size(`16` << `10`)
2294	.spawn(run)
2295	.unwrap()
2296	.join()
2297	.unwrap();
2298	}
2299	}
2300