translate.rs source code [crates/regex-syntax-0.7.2/src/hir/translate.rs]

/*!
Defines a translator that converts an `Ast` to an `Hir`.
*/
4
5	use core::cell::{Cell, RefCell};
6
7	use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
8
9	use crate::{
10	ast::{self, Ast, Span, Visitor},
11	either::Either,
12	hir::{self, Error, ErrorKind, Hir, HirKind},
13	unicode::{self, ClassQuery},
14	};
15
16	type Result<T> = core::result::Result<T, Error>;
17
18	/// A builder for constructing an AST->HIR translator.
19	#[derive(Clone, Debug)]
20	pub struct TranslatorBuilder {
21	utf8: bool,
22	flags: Flags,
23	}
24
25	impl Default for TranslatorBuilder {
26	fn default() -> TranslatorBuilder {
27	TranslatorBuilder::new()
28	}
29	}
30
31	impl TranslatorBuilder {
32	/// Create a new translator builder with a default c onfiguration.
33	pub fn new() -> TranslatorBuilder {
34	TranslatorBuilder { utf8: `true`, flags: Flags::default() }
35	}
36
37	/// Build a translator using the current configuration.
38	pub fn build(&self) -> Translator {
39	Translator {
40	stack: RefCell::new(vec![]),
41	flags: Cell::new(self.flags),
42	utf8: self.utf8,
43	}
44	}
45
46	/// When disabled, translation will permit the construction of a regular
47	/// expression that may match invalid UTF-8.
48	///
49	/// When enabled (the default), the translator is guaranteed to produce an
50	/// expression that, for non-empty matches, will only ever produce spans
51	/// that are entirely valid UTF-8 (otherwise, the translator will return an
52	/// error).
53	///
54	/// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
55	/// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
56	/// syntax) will be allowed even though they can produce matches that split
57	/// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
58	/// matches, and it is expected that the regex engine itself must handle
59	/// these cases if necessary (perhaps by suppressing any zero-width matches
60	/// that split a codepoint).
61	pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
62	self.utf8 = yes;
63	self
64	}
65
66	/// Enable or disable the case insensitive flag (`i`) by default.
67	pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
68	self.flags.case_insensitive = if yes { Some(`true`) } else { None };
69	self
70	}
71
72	/// Enable or disable the multi-line matching flag (`m`) by default.
73	pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
74	self.flags.multi_line = if yes { Some(`true`) } else { None };
75	self
76	}
77
78	/// Enable or disable the "dot matches any character" flag (`s`) by
79	/// default.
80	pub fn dot_matches_new_line(
81	&mut self,
82	yes: bool,
83	) -> &mut TranslatorBuilder {
84	self.flags.dot_matches_new_line = if yes { Some(`true`) } else { None };
85	self
86	}
87
88	/// Enable or disable the CRLF mode flag (`R`) by default.
89	pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
90	self.flags.crlf = if yes { Some(`true`) } else { None };
91	self
92	}
93
94	/// Enable or disable the "swap greed" flag (`U`) by default.
95	pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
96	self.flags.swap_greed = if yes { Some(`true`) } else { None };
97	self
98	}
99
100	/// Enable or disable the Unicode flag (`u`) by default.
101	pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
102	self.flags.unicode = if yes { None } else { Some(`false`) };
103	self
104	}
105	}
106
107	/// A translator maps abstract syntax to a high level intermediate
108	/// representation.
109	///
110	/// A translator may be benefit from reuse. That is, a translator can translate
111	/// many abstract syntax trees.
112	///
113	/// A `Translator` can be configured in more detail via a
114	/// [`TranslatorBuilder`].
115	#[derive(Clone, Debug)]
116	pub struct Translator {
117	/// Our call stack, but on the heap.
118	stack: RefCell<Vec<HirFrame>>,
119	/// The current flag settings.
120	flags: Cell<Flags>,
121	/// Whether we're allowed to produce HIR that can match arbitrary bytes.
122	utf8: bool,
123	}
124
125	impl Translator {
126	/// Create a new translator using the default configuration.
127	pub fn new() -> Translator {
128	TranslatorBuilder::new().build()
129	}
130
131	/// Translate the given abstract syntax tree (AST) into a high level
132	/// intermediate representation (HIR).
133	///
134	/// If there was a problem doing the translation, then an HIR-specific
135	/// error is returned.
136	///
137	/// The original pattern string used to produce the `Ast` must* also be*
138	/// provided. The translator does not use the pattern string during any
139	/// correct translation, but is used for error reporting.
140	pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
141	ast::visit(ast, visitor:TranslatorI::new(self, pattern))
142	}
143	}
144
145	/// An HirFrame is a single stack frame, represented explicitly, which is
146	/// created for each item in the Ast that we traverse.
147	///
148	/// Note that technically, this type doesn't represent our entire stack
149	/// frame. In particular, the Ast visitor represents any state associated with
150	/// traversing the Ast itself.
151	#[derive(Clone, Debug)]
152	enum HirFrame {
153	/// An arbitrary HIR expression. These get pushed whenever we hit a base
154	/// case in the Ast. They get popped after an inductive (i.e., recursive)
155	/// step is complete.
156	Expr(Hir),
157	/// A literal that is being constructed, character by character, from the
158	/// AST. We need this because the AST gives each individual character its
159	/// own node. So as we see characters, we peek at the top-most HirFrame.
160	/// If it's a literal, then we add to it. Otherwise, we push a new literal.
161	/// When it comes time to pop it, we convert it to an Hir via Hir::literal.
162	Literal(Vec<u8>),
163	/// A Unicode character class. This frame is mutated as we descend into
164	/// the Ast of a character class (which is itself its own mini recursive
165	/// structure).
166	ClassUnicode(hir::ClassUnicode),
167	/// A byte-oriented character class. This frame is mutated as we descend
168	/// into the Ast of a character class (which is itself its own mini
169	/// recursive structure).
170	///
171	/// Byte character classes are created when Unicode mode (`u`) is disabled.
172	/// If `utf8` is enabled (the default), then a byte character is only
173	/// permitted to match ASCII text.
174	ClassBytes(hir::ClassBytes),
175	/// This is pushed whenever a repetition is observed. After visiting every
176	/// sub-expression in the repetition, the translator's stack is expected to
177	/// have this sentinel at the top.
178	///
179	/// This sentinel only exists to stop other things (like flattening
180	/// literals) from reaching across repetition operators.
181	Repetition,
182	/// This is pushed on to the stack upon first seeing any kind of capture,
183	/// indicated by parentheses (including non-capturing groups). It is popped
184	/// upon leaving a group.
185	Group {
186	/// The old active flags when this group was opened.
187	///
188	/// If this group sets flags, then the new active flags are set to the
189	/// result of merging the old flags with the flags introduced by this
190	/// group. If the group doesn't set any flags, then this is simply
191	/// equivalent to whatever flags were set when the group was opened.
192	///
193	/// When this group is popped, the active flags should be restored to
194	/// the flags set here.
195	///
196	/// The "active" flags correspond to whatever flags are set in the
197	/// Translator.
198	old_flags: Flags,
199	},
200	/// This is pushed whenever a concatenation is observed. After visiting
201	/// every sub-expression in the concatenation, the translator's stack is
202	/// popped until it sees a Concat frame.
203	Concat,
204	/// This is pushed whenever an alternation is observed. After visiting
205	/// every sub-expression in the alternation, the translator's stack is
206	/// popped until it sees an Alternation frame.
207	Alternation,
208	/// This is pushed immediately before each sub-expression in an
209	/// alternation. This separates the branches of an alternation on the
210	/// stack and prevents literal flattening from reaching across alternation
211	/// branches.
212	///
213	/// It is popped after each expression in a branch until an 'Alternation'
214	/// frame is observed when doing a post visit on an alternation.
215	AlternationBranch,
216	}
217
218	impl HirFrame {
219	/// Assert that the current stack frame is an Hir expression and return it.
220	fn unwrap_expr(self) -> Hir {
221	match self {
222	HirFrame::Expr(expr) => expr,
223	HirFrame::Literal(lit) => Hir::literal(lit),
224	_ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
225	}
226	}
227
228	/// Assert that the current stack frame is a Unicode class expression and
229	/// return it.
230	fn unwrap_class_unicode(self) -> hir::ClassUnicode {
231	match self {
232	HirFrame::ClassUnicode(cls) => cls,
233	_ => panic!(
234	"tried to unwrap Unicode class \
235	from HirFrame, got: {:?}",
236	self
237	),
238	}
239	}
240
241	/// Assert that the current stack frame is a byte class expression and
242	/// return it.
243	fn unwrap_class_bytes(self) -> hir::ClassBytes {
244	match self {
245	HirFrame::ClassBytes(cls) => cls,
246	_ => panic!(
247	"tried to unwrap byte class \
248	from HirFrame, got: {:?}",
249	self
250	),
251	}
252	}
253
254	/// Assert that the current stack frame is a repetition sentinel. If it
255	/// isn't, then panic.
256	fn unwrap_repetition(self) {
257	match self {
258	HirFrame::Repetition => {}
259	_ => {
260	panic!(
261	"tried to unwrap repetition from HirFrame, got: {:?}",
262	self
263	)
264	}
265	}
266	}
267
268	/// Assert that the current stack frame is a group indicator and return
269	/// its corresponding flags (the flags that were active at the time the
270	/// group was entered).
271	fn unwrap_group(self) -> Flags {
272	match self {
273	HirFrame::Group { old_flags } => old_flags,
274	_ => {
275	panic!("tried to unwrap group from HirFrame, got: {:?}", self)
276	}
277	}
278	}
279
280	/// Assert that the current stack frame is an alternation pipe sentinel. If
281	/// it isn't, then panic.
282	fn unwrap_alternation_pipe(self) {
283	match self {
284	HirFrame::AlternationBranch => {}
285	_ => {
286	panic!(
287	"tried to unwrap alt pipe from HirFrame, got: {:?}",
288	self
289	)
290	}
291	}
292	}
293	}
294
295	impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
296	type Output = Hir;
297	type Err = Error;
298
299	fn finish(self) -> Result<Hir> {
300	// ... otherwise, we should have exactly one HIR on the stack.
301	assert_eq!(self.trans().stack.borrow().len(), `1`);
302	Ok(self.pop().unwrap().unwrap_expr())
303	}
304
305	fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
306	match *ast {
307	Ast::Class(ast::Class::Bracketed(_)) => {
308	if self.flags().unicode() {
309	let cls = hir::ClassUnicode::empty();
310	self.push(HirFrame::ClassUnicode(cls));
311	} else {
312	let cls = hir::ClassBytes::empty();
313	self.push(HirFrame::ClassBytes(cls));
314	}
315	}
316	Ast::Repetition(_) => self.push(HirFrame::Repetition),
317	Ast::Group(ref x) => {
318	let old_flags = x
319	.flags()
320	.map(\|ast\| self.set_flags(ast))
321	.unwrap_or_else(\|\| self.flags());
322	self.push(HirFrame::Group { old_flags });
323	}
324	Ast::Concat(ref x) if x.asts.is_empty() => {}
325	Ast::Concat(_) => {
326	self.push(HirFrame::Concat);
327	}
328	Ast::Alternation(ref x) if x.asts.is_empty() => {}
329	Ast::Alternation(_) => {
330	self.push(HirFrame::Alternation);
331	self.push(HirFrame::AlternationBranch);
332	}
333	_ => {}
334	}
335	Ok(())
336	}
337
338	fn visit_post(&mut self, ast: &Ast) -> Result<()> {
339	match *ast {
340	Ast::Empty(_) => {
341	self.push(HirFrame::Expr(Hir::empty()));
342	}
343	Ast::Flags(ref x) => {
344	self.set_flags(&x.flags);
345	// Flags in the AST are generally considered directives and
346	// not actual sub-expressions. However, they can be used in
347	// the concrete syntax like `((?i))`, and we need some kind of
348	// indication of an expression there, and Empty is the correct
349	// choice.
350	//
351	// There can also be things like `(?i)+`, but we rule those out
352	// in the parser. In the future, we might allow them for
353	// consistency sake.
354	self.push(HirFrame::Expr(Hir::empty()));
355	}
356	Ast::Literal(ref x) => {
357	match self.ast_literal_to_scalar(x)? {
358	Either::Right(byte) => self.push_byte(byte),
359	Either::Left(ch) => {
360	if !self.flags().unicode() && ch.len_utf8() > `1` {
361	return Err(self
362	.error(x.span, ErrorKind::UnicodeNotAllowed));
363	}
364	match self.case_fold_char(x.span, ch)? {
365	None => self.push_char(ch),
366	Some(expr) => self.push(HirFrame::Expr(expr)),
367	}
368	}
369	}
370	// self.push(HirFrame::Expr(self.hir_literal(x)?));
371	}
372	Ast::Dot(span) => {
373	self.push(HirFrame::Expr(self.hir_dot(span)?));
374	}
375	Ast::Assertion(ref x) => {
376	self.push(HirFrame::Expr(self.hir_assertion(x)?));
377	}
378	Ast::Class(ast::Class::Perl(ref x)) => {
379	if self.flags().unicode() {
380	let cls = self.hir_perl_unicode_class(x)?;
381	let hcls = hir::Class::Unicode(cls);
382	self.push(HirFrame::Expr(Hir::class(hcls)));
383	} else {
384	let cls = self.hir_perl_byte_class(x)?;
385	let hcls = hir::Class::Bytes(cls);
386	self.push(HirFrame::Expr(Hir::class(hcls)));
387	}
388	}
389	Ast::Class(ast::Class::Unicode(ref x)) => {
390	let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
391	self.push(HirFrame::Expr(Hir::class(cls)));
392	}
393	Ast::Class(ast::Class::Bracketed(ref ast)) => {
394	if self.flags().unicode() {
395	let mut cls = self.pop().unwrap().unwrap_class_unicode();
396	self.unicode_fold_and_negate(
397	&ast.span,
398	ast.negated,
399	&mut cls,
400	)?;
401	let expr = Hir::class(hir::Class::Unicode(cls));
402	self.push(HirFrame::Expr(expr));
403	} else {
404	let mut cls = self.pop().unwrap().unwrap_class_bytes();
405	self.bytes_fold_and_negate(
406	&ast.span,
407	ast.negated,
408	&mut cls,
409	)?;
410	let expr = Hir::class(hir::Class::Bytes(cls));
411	self.push(HirFrame::Expr(expr));
412	}
413	}
414	Ast::Repetition(ref x) => {
415	let expr = self.pop().unwrap().unwrap_expr();
416	self.pop().unwrap().unwrap_repetition();
417	self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
418	}
419	Ast::Group(ref x) => {
420	let expr = self.pop().unwrap().unwrap_expr();
421	let old_flags = self.pop().unwrap().unwrap_group();
422	self.trans().flags.set(old_flags);
423	self.push(HirFrame::Expr(self.hir_capture(x, expr)));
424	}
425	Ast::Concat(_) => {
426	let mut exprs = vec![];
427	while let Some(expr) = self.pop_concat_expr() {
428	if !matches!(*expr.kind(), HirKind::Empty) {
429	exprs.push(expr);
430	}
431	}
432	exprs.reverse();
433	self.push(HirFrame::Expr(Hir::concat(exprs)));
434	}
435	Ast::Alternation(_) => {
436	let mut exprs = vec![];
437	while let Some(expr) = self.pop_alt_expr() {
438	self.pop().unwrap().unwrap_alternation_pipe();
439	exprs.push(expr);
440	}
441	exprs.reverse();
442	self.push(HirFrame::Expr(Hir::alternation(exprs)));
443	}
444	}
445	Ok(())
446	}
447
448	fn visit_alternation_in(&mut self) -> Result<()> {
449	self.push(HirFrame::AlternationBranch);
450	Ok(())
451	}
452
453	fn visit_class_set_item_pre(
454	&mut self,
455	ast: &ast::ClassSetItem,
456	) -> Result<()> {
457	match *ast {
458	ast::ClassSetItem::Bracketed(_) => {
459	if self.flags().unicode() {
460	let cls = hir::ClassUnicode::empty();
461	self.push(HirFrame::ClassUnicode(cls));
462	} else {
463	let cls = hir::ClassBytes::empty();
464	self.push(HirFrame::ClassBytes(cls));
465	}
466	}
467	// We needn't handle the Union case here since the visitor will
468	// do it for us.
469	_ => {}
470	}
471	Ok(())
472	}
473
474	fn visit_class_set_item_post(
475	&mut self,
476	ast: &ast::ClassSetItem,
477	) -> Result<()> {
478	match *ast {
479	ast::ClassSetItem::Empty(_) => {}
480	ast::ClassSetItem::Literal(ref x) => {
481	if self.flags().unicode() {
482	let mut cls = self.pop().unwrap().unwrap_class_unicode();
483	cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
484	self.push(HirFrame::ClassUnicode(cls));
485	} else {
486	let mut cls = self.pop().unwrap().unwrap_class_bytes();
487	let byte = self.class_literal_byte(x)?;
488	cls.push(hir::ClassBytesRange::new(byte, byte));
489	self.push(HirFrame::ClassBytes(cls));
490	}
491	}
492	ast::ClassSetItem::Range(ref x) => {
493	if self.flags().unicode() {
494	let mut cls = self.pop().unwrap().unwrap_class_unicode();
495	cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
496	self.push(HirFrame::ClassUnicode(cls));
497	} else {
498	let mut cls = self.pop().unwrap().unwrap_class_bytes();
499	let start = self.class_literal_byte(&x.start)?;
500	let end = self.class_literal_byte(&x.end)?;
501	cls.push(hir::ClassBytesRange::new(start, end));
502	self.push(HirFrame::ClassBytes(cls));
503	}
504	}
505	ast::ClassSetItem::Ascii(ref x) => {
506	if self.flags().unicode() {
507	let xcls = self.hir_ascii_unicode_class(x)?;
508	let mut cls = self.pop().unwrap().unwrap_class_unicode();
509	cls.union(&xcls);
510	self.push(HirFrame::ClassUnicode(cls));
511	} else {
512	let xcls = self.hir_ascii_byte_class(x)?;
513	let mut cls = self.pop().unwrap().unwrap_class_bytes();
514	cls.union(&xcls);
515	self.push(HirFrame::ClassBytes(cls));
516	}
517	}
518	ast::ClassSetItem::Unicode(ref x) => {
519	let xcls = self.hir_unicode_class(x)?;
520	let mut cls = self.pop().unwrap().unwrap_class_unicode();
521	cls.union(&xcls);
522	self.push(HirFrame::ClassUnicode(cls));
523	}
524	ast::ClassSetItem::Perl(ref x) => {
525	if self.flags().unicode() {
526	let xcls = self.hir_perl_unicode_class(x)?;
527	let mut cls = self.pop().unwrap().unwrap_class_unicode();
528	cls.union(&xcls);
529	self.push(HirFrame::ClassUnicode(cls));
530	} else {
531	let xcls = self.hir_perl_byte_class(x)?;
532	let mut cls = self.pop().unwrap().unwrap_class_bytes();
533	cls.union(&xcls);
534	self.push(HirFrame::ClassBytes(cls));
535	}
536	}
537	ast::ClassSetItem::Bracketed(ref ast) => {
538	if self.flags().unicode() {
539	let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
540	self.unicode_fold_and_negate(
541	&ast.span,
542	ast.negated,
543	&mut cls1,
544	)?;
545
546	let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
547	cls2.union(&cls1);
548	self.push(HirFrame::ClassUnicode(cls2));
549	} else {
550	let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
551	self.bytes_fold_and_negate(
552	&ast.span,
553	ast.negated,
554	&mut cls1,
555	)?;
556
557	let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
558	cls2.union(&cls1);
559	self.push(HirFrame::ClassBytes(cls2));
560	}
561	}
562	// This is handled automatically by the visitor.
563	ast::ClassSetItem::Union(_) => {}
564	}
565	Ok(())
566	}
567
568	fn visit_class_set_binary_op_pre(
569	&mut self,
570	_op: &ast::ClassSetBinaryOp,
571	) -> Result<()> {
572	if self.flags().unicode() {
573	let cls = hir::ClassUnicode::empty();
574	self.push(HirFrame::ClassUnicode(cls));
575	} else {
576	let cls = hir::ClassBytes::empty();
577	self.push(HirFrame::ClassBytes(cls));
578	}
579	Ok(())
580	}
581
582	fn visit_class_set_binary_op_in(
583	&mut self,
584	_op: &ast::ClassSetBinaryOp,
585	) -> Result<()> {
586	if self.flags().unicode() {
587	let cls = hir::ClassUnicode::empty();
588	self.push(HirFrame::ClassUnicode(cls));
589	} else {
590	let cls = hir::ClassBytes::empty();
591	self.push(HirFrame::ClassBytes(cls));
592	}
593	Ok(())
594	}
595
596	fn visit_class_set_binary_op_post(
597	&mut self,
598	op: &ast::ClassSetBinaryOp,
599	) -> Result<()> {
600	use crate::ast::ClassSetBinaryOpKind::*;
601
602	if self.flags().unicode() {
603	let mut rhs = self.pop().unwrap().unwrap_class_unicode();
604	let mut lhs = self.pop().unwrap().unwrap_class_unicode();
605	let mut cls = self.pop().unwrap().unwrap_class_unicode();
606	if self.flags().case_insensitive() {
607	rhs.try_case_fold_simple().map_err(\|_\| {
608	self.error(
609	op.rhs.span().clone(),
610	ErrorKind::UnicodeCaseUnavailable,
611	)
612	})?;
613	lhs.try_case_fold_simple().map_err(\|_\| {
614	self.error(
615	op.lhs.span().clone(),
616	ErrorKind::UnicodeCaseUnavailable,
617	)
618	})?;
619	}
620	match op.kind {
621	Intersection => lhs.intersect(&rhs),
622	Difference => lhs.difference(&rhs),
623	SymmetricDifference => lhs.symmetric_difference(&rhs),
624	}
625	cls.union(&lhs);
626	self.push(HirFrame::ClassUnicode(cls));
627	} else {
628	let mut rhs = self.pop().unwrap().unwrap_class_bytes();
629	let mut lhs = self.pop().unwrap().unwrap_class_bytes();
630	let mut cls = self.pop().unwrap().unwrap_class_bytes();
631	if self.flags().case_insensitive() {
632	rhs.case_fold_simple();
633	lhs.case_fold_simple();
634	}
635	match op.kind {
636	Intersection => lhs.intersect(&rhs),
637	Difference => lhs.difference(&rhs),
638	SymmetricDifference => lhs.symmetric_difference(&rhs),
639	}
640	cls.union(&lhs);
641	self.push(HirFrame::ClassBytes(cls));
642	}
643	Ok(())
644	}
645	}
646
647	/// The internal implementation of a translator.
648	///
649	/// This type is responsible for carrying around the original pattern string,
650	/// which is not tied to the internal state of a translator.
651	///
652	/// A TranslatorI exists for the time it takes to translate a single Ast.
653	#[derive(Clone, Debug)]
654	struct TranslatorI<'t, 'p> {
655	trans: &'t Translator,
656	pattern: &'p str,
657	}
658
659	impl<'t, 'p> TranslatorI<'t, 'p> {
660	/// Build a new internal translator.
661	fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
662	TranslatorI { trans, pattern }
663	}
664
665	/// Return a reference to the underlying translator.
666	fn trans(&self) -> &Translator {
667	&self.trans
668	}
669
670	/// Push the given frame on to the call stack.
671	fn push(&self, frame: HirFrame) {
672	self.trans().stack.borrow_mut().push(frame);
673	}
674
675	/// Push the given literal char on to the call stack.
676	///
677	/// If the top-most element of the stack is a literal, then the char
678	/// is appended to the end of that literal. Otherwise, a new literal
679	/// containing just the given char is pushed to the top of the stack.
680	fn push_char(&self, ch: char) {
681	let mut buf = [`0`; `4`];
682	let bytes = ch.encode_utf8(&mut buf).as_bytes();
683	let mut stack = self.trans().stack.borrow_mut();
684	if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
685	literal.extend_from_slice(bytes);
686	} else {
687	stack.push(HirFrame::Literal(bytes.to_vec()));
688	}
689	}
690
691	/// Push the given literal byte on to the call stack.
692	///
693	/// If the top-most element of the stack is a literal, then the byte
694	/// is appended to the end of that literal. Otherwise, a new literal
695	/// containing just the given byte is pushed to the top of the stack.
696	fn push_byte(&self, byte: u8) {
697	let mut stack = self.trans().stack.borrow_mut();
698	if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
699	literal.push(byte);
700	} else {
701	stack.push(HirFrame::Literal(vec![byte]));
702	}
703	}
704
705	/// Pop the top of the call stack. If the call stack is empty, return None.
706	fn pop(&self) -> Option<HirFrame> {
707	self.trans().stack.borrow_mut().pop()
708	}
709
710	/// Pop an HIR expression from the top of the stack for a concatenation.
711	///
712	/// This returns None if the stack is empty or when a concat frame is seen.
713	/// Otherwise, it panics if it could not find an HIR expression.
714	fn pop_concat_expr(&self) -> Option<Hir> {
715	let frame = self.pop()?;
716	match frame {
717	HirFrame::Concat => None,
718	HirFrame::Expr(expr) => Some(expr),
719	HirFrame::Literal(lit) => Some(Hir::literal(lit)),
720	HirFrame::ClassUnicode(_) => {
721	unreachable!("expected expr or concat, got Unicode class")
722	}
723	HirFrame::ClassBytes(_) => {
724	unreachable!("expected expr or concat, got byte class")
725	}
726	HirFrame::Repetition => {
727	unreachable!("expected expr or concat, got repetition")
728	}
729	HirFrame::Group { .. } => {
730	unreachable!("expected expr or concat, got group")
731	}
732	HirFrame::Alternation => {
733	unreachable!("expected expr or concat, got alt marker")
734	}
735	HirFrame::AlternationBranch => {
736	unreachable!("expected expr or concat, got alt branch marker")
737	}
738	}
739	}
740
741	/// Pop an HIR expression from the top of the stack for an alternation.
742	///
743	/// This returns None if the stack is empty or when an alternation frame is
744	/// seen. Otherwise, it panics if it could not find an HIR expression.
745	fn pop_alt_expr(&self) -> Option<Hir> {
746	let frame = self.pop()?;
747	match frame {
748	HirFrame::Alternation => None,
749	HirFrame::Expr(expr) => Some(expr),
750	HirFrame::Literal(lit) => Some(Hir::literal(lit)),
751	HirFrame::ClassUnicode(_) => {
752	unreachable!("expected expr or alt, got Unicode class")
753	}
754	HirFrame::ClassBytes(_) => {
755	unreachable!("expected expr or alt, got byte class")
756	}
757	HirFrame::Repetition => {
758	unreachable!("expected expr or alt, got repetition")
759	}
760	HirFrame::Group { .. } => {
761	unreachable!("expected expr or alt, got group")
762	}
763	HirFrame::Concat => {
764	unreachable!("expected expr or alt, got concat marker")
765	}
766	HirFrame::AlternationBranch => {
767	unreachable!("expected expr or alt, got alt branch marker")
768	}
769	}
770	}
771
772	/// Create a new error with the given span and error type.
773	fn error(&self, span: Span, kind: ErrorKind) -> Error {
774	Error { kind, pattern: self.pattern.to_string(), span }
775	}
776
777	/// Return a copy of the active flags.
778	fn flags(&self) -> Flags {
779	self.trans().flags.get()
780	}
781
782	/// Set the flags of this translator from the flags set in the given AST.
783	/// Then, return the old flags.
784	fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
785	let old_flags = self.flags();
786	let mut new_flags = Flags::from_ast(ast_flags);
787	new_flags.merge(&old_flags);
788	self.trans().flags.set(new_flags);
789	old_flags
790	}
791
792	/// Convert an Ast literal to its scalar representation.
793	///
794	/// When Unicode mode is enabled, then this always succeeds and returns a
795	/// `char` (Unicode scalar value).
796	///
797	/// When Unicode mode is disabled, then a `char` will still be returned
798	/// whenever possible. A byte is returned only when invalid UTF-8 is
799	/// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
800	/// will result in an error when invalid UTF-8 is not allowed.
801	fn ast_literal_to_scalar(
802	&self,
803	lit: &ast::Literal,
804	) -> Result<Either<char, u8>> {
805	if self.flags().unicode() {
806	return Ok(Either::Left(lit.c));
807	}
808	let byte = match lit.byte() {
809	None => return Ok(Either::Left(lit.c)),
810	Some(byte) => byte,
811	};
812	if byte <= `0x7F` {
813	return Ok(Either::Left(char::try_from(byte).unwrap()));
814	}
815	if self.trans().utf8 {
816	return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
817	}
818	Ok(Either::Right(byte))
819	}
820
821	fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
822	if !self.flags().case_insensitive() {
823	return Ok(None);
824	}
825	if self.flags().unicode() {
826	// If case folding won't do anything, then don't bother trying.
827	let map = unicode::SimpleCaseFolder::new()
828	.map(\|f\| f.overlaps(c, c))
829	.map_err(\|_\| {
830	self.error(span, ErrorKind::UnicodeCaseUnavailable)
831	})?;
832	if !map {
833	return Ok(None);
834	}
835	let mut cls =
836	hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
837	c, c,
838	)]);
839	cls.try_case_fold_simple().map_err(\|_\| {
840	self.error(span, ErrorKind::UnicodeCaseUnavailable)
841	})?;
842	Ok(Some(Hir::class(hir::Class::Unicode(cls))))
843	} else {
844	if c.len_utf8() > `1` {
845	return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
846	}
847	// If case folding won't do anything, then don't bother trying.
848	match c {
849	'A'..='Z' \| 'a'..='z' => {}
850	_ => return Ok(None),
851	}
852	let mut cls =
853	hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
854	// OK because 'c.len_utf8() == 1' which in turn implies
855	// that 'c' is ASCII.
856	u8::try_from(c).unwrap(),
857	u8::try_from(c).unwrap(),
858	)]);
859	cls.case_fold_simple();
860	Ok(Some(Hir::class(hir::Class::Bytes(cls))))
861	}
862	}
863
864	fn hir_dot(&self, span: Span) -> Result<Hir> {
865	if !self.flags().unicode() && self.trans().utf8 {
866	return Err(self.error(span, ErrorKind::InvalidUtf8));
867	}
868	Ok(Hir::dot(self.flags().dot()))
869	}
870
871	fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
872	let unicode = self.flags().unicode();
873	let multi_line = self.flags().multi_line();
874	let crlf = self.flags().crlf();
875	Ok(match asst.kind {
876	ast::AssertionKind::StartLine => Hir::look(if multi_line {
877	if crlf {
878	hir::Look::StartCRLF
879	} else {
880	hir::Look::StartLF
881	}
882	} else {
883	hir::Look::Start
884	}),
885	ast::AssertionKind::EndLine => Hir::look(if multi_line {
886	if crlf {
887	hir::Look::EndCRLF
888	} else {
889	hir::Look::EndLF
890	}
891	} else {
892	hir::Look::End
893	}),
894	ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
895	ast::AssertionKind::EndText => Hir::look(hir::Look::End),
896	ast::AssertionKind::WordBoundary => Hir::look(if unicode {
897	hir::Look::WordUnicode
898	} else {
899	hir::Look::WordAscii
900	}),
901	ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
902	hir::Look::WordUnicodeNegate
903	} else {
904	hir::Look::WordAsciiNegate
905	}),
906	})
907	}
908
909	fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
910	let (index, name) = match group.kind {
911	ast::GroupKind::CaptureIndex(index) => (index, None),
912	ast::GroupKind::CaptureName { ref name, .. } => {
913	(name.index, Some(name.name.clone().into_boxed_str()))
914	}
915	// The HIR doesn't need to use non-capturing groups, since the way
916	// in which the data type is defined handles this automatically.
917	ast::GroupKind::NonCapturing(_) => return expr,
918	};
919	Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
920	}
921
922	fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
923	let (min, max) = match rep.op.kind {
924	ast::RepetitionKind::ZeroOrOne => (`0`, Some(`1`)),
925	ast::RepetitionKind::ZeroOrMore => (`0`, None),
926	ast::RepetitionKind::OneOrMore => (`1`, None),
927	ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
928	(m, Some(m))
929	}
930	ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
931	(m, None)
932	}
933	ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
934	m,
935	n,
936	)) => (m, Some(n)),
937	};
938	let greedy =
939	if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
940	Hir::repetition(hir::Repetition {
941	min,
942	max,
943	greedy,
944	sub: Box::new(expr),
945	})
946	}
947
948	fn hir_unicode_class(
949	&self,
950	ast_class: &ast::ClassUnicode,
951	) -> Result<hir::ClassUnicode> {
952	use crate::ast::ClassUnicodeKind::*;
953
954	if !self.flags().unicode() {
955	return Err(
956	self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
957	);
958	}
959	let query = match ast_class.kind {
960	OneLetter(name) => ClassQuery::OneLetter(name),
961	Named(ref name) => ClassQuery::Binary(name),
962	NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
963	property_name: name,
964	property_value: value,
965	},
966	};
967	let mut result = self.convert_unicode_class_error(
968	&ast_class.span,
969	unicode::class(query),
970	);
971	if let Ok(ref mut class) = result {
972	self.unicode_fold_and_negate(
973	&ast_class.span,
974	ast_class.negated,
975	class,
976	)?;
977	}
978	result
979	}
980
981	fn hir_ascii_unicode_class(
982	&self,
983	ast: &ast::ClassAscii,
984	) -> Result<hir::ClassUnicode> {
985	let mut cls = hir::ClassUnicode::new(
986	ascii_class_as_chars(&ast.kind)
987	.map(\|(s, e)\| hir::ClassUnicodeRange::new(s, e)),
988	);
989	self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
990	Ok(cls)
991	}
992
993	fn hir_ascii_byte_class(
994	&self,
995	ast: &ast::ClassAscii,
996	) -> Result<hir::ClassBytes> {
997	let mut cls = hir::ClassBytes::new(
998	ascii_class(&ast.kind)
999	.map(\|(s, e)\| hir::ClassBytesRange::new(s, e)),
1000	);
1001	self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1002	Ok(cls)
1003	}
1004
1005	fn hir_perl_unicode_class(
1006	&self,
1007	ast_class: &ast::ClassPerl,
1008	) -> Result<hir::ClassUnicode> {
1009	use crate::ast::ClassPerlKind::*;
1010
1011	assert!(self.flags().unicode());
1012	let result = match ast_class.kind {
1013	Digit => unicode::perl_digit(),
1014	Space => unicode::perl_space(),
1015	Word => unicode::perl_word(),
1016	};
1017	let mut class =
1018	self.convert_unicode_class_error(&ast_class.span, result)?;
1019	// We needn't apply case folding here because the Perl Unicode classes
1020	// are already closed under Unicode simple case folding.
1021	if ast_class.negated {
1022	class.negate();
1023	}
1024	Ok(class)
1025	}
1026
1027	fn hir_perl_byte_class(
1028	&self,
1029	ast_class: &ast::ClassPerl,
1030	) -> Result<hir::ClassBytes> {
1031	use crate::ast::ClassPerlKind::*;
1032
1033	assert!(!self.flags().unicode());
1034	let mut class = match ast_class.kind {
1035	Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
1036	Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
1037	Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
1038	};
1039	// We needn't apply case folding here because the Perl ASCII classes
1040	// are already closed (under ASCII case folding).
1041	if ast_class.negated {
1042	class.negate();
1043	}
1044	// Negating a Perl byte class is likely to cause it to match invalid
1045	// UTF-8. That's only OK if the translator is configured to allow such
1046	// things.
1047	if self.trans().utf8 && !class.is_ascii() {
1048	return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
1049	}
1050	Ok(class)
1051	}
1052
1053	/// Converts the given Unicode specific error to an HIR translation error.
1054	///
1055	/// The span given should approximate the position at which an error would
1056	/// occur.
1057	fn convert_unicode_class_error(
1058	&self,
1059	span: &Span,
1060	result: core::result::Result<hir::ClassUnicode, unicode::Error>,
1061	) -> Result<hir::ClassUnicode> {
1062	result.map_err(\|err\| {
1063	let sp = span.clone();
1064	match err {
1065	unicode::Error::PropertyNotFound => {
1066	self.error(sp, ErrorKind::UnicodePropertyNotFound)
1067	}
1068	unicode::Error::PropertyValueNotFound => {
1069	self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
1070	}
1071	unicode::Error::PerlClassNotFound => {
1072	self.error(sp, ErrorKind::UnicodePerlClassNotFound)
1073	}
1074	}
1075	})
1076	}
1077
1078	fn unicode_fold_and_negate(
1079	&self,
1080	span: &Span,
1081	negated: bool,
1082	class: &mut hir::ClassUnicode,
1083	) -> Result<()> {
1084	// Note that we must apply case folding before negation!
1085	// Consider `(?i)[^x]`. If we applied negation first, then
1086	// the result would be the character class that matched any
1087	// Unicode scalar value.
1088	if self.flags().case_insensitive() {
1089	class.try_case_fold_simple().map_err(\|_\| {
1090	self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
1091	})?;
1092	}
1093	if negated {
1094	class.negate();
1095	}
1096	Ok(())
1097	}
1098
1099	fn bytes_fold_and_negate(
1100	&self,
1101	span: &Span,
1102	negated: bool,
1103	class: &mut hir::ClassBytes,
1104	) -> Result<()> {
1105	// Note that we must apply case folding before negation!
1106	// Consider `(?i)[^x]`. If we applied negation first, then
1107	// the result would be the character class that matched any
1108	// Unicode scalar value.
1109	if self.flags().case_insensitive() {
1110	class.case_fold_simple();
1111	}
1112	if negated {
1113	class.negate();
1114	}
1115	if self.trans().utf8 && !class.is_ascii() {
1116	return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
1117	}
1118	Ok(())
1119	}
1120
1121	/// Return a scalar byte value suitable for use as a literal in a byte
1122	/// character class.
1123	fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
1124	match self.ast_literal_to_scalar(ast)? {
1125	Either::Right(byte) => Ok(byte),
1126	Either::Left(ch) => {
1127	let cp = u32::from(ch);
1128	if cp <= `0x7F` {
1129	Ok(u8::try_from(cp).unwrap())
1130	} else {
1131	// We can't feasibly support Unicode in
1132	// byte oriented classes. Byte classes don't
1133	// do Unicode case folding.
1134	Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
1135	}
1136	}
1137	}
1138	}
1139	}
1140
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent (`None`), present but
/// disabled (`Some(false)`) or present but enabled (`Some(true)`). The
/// accessor methods on this type decide what an absent flag means.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    /// Set from `ast::Flag::CaseInsensitive`; read as disabled when absent.
    case_insensitive: Option<bool>,
    /// Set from `ast::Flag::MultiLine`; read as disabled when absent.
    multi_line: Option<bool>,
    /// Set from `ast::Flag::DotMatchesNewLine`; read as disabled when absent.
    dot_matches_new_line: Option<bool>,
    /// Set from `ast::Flag::SwapGreed`; read as disabled when absent.
    swap_greed: Option<bool>,
    /// Set from `ast::Flag::Unicode`; read as *enabled* when absent.
    unicode: Option<bool>,
    /// Set from `ast::Flag::CRLF`; read as disabled when absent.
    crlf: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1157
1158	impl Flags {
1159	fn from_ast(ast: &ast::Flags) -> Flags {
1160	let mut flags = Flags::default();
1161	let mut enable = `true`;
1162	for item in &ast.items {
1163	match item.kind {
1164	ast::FlagsItemKind::Negation => {
1165	enable = `false`;
1166	}
1167	ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
1168	flags.case_insensitive = Some(enable);
1169	}
1170	ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
1171	flags.multi_line = Some(enable);
1172	}
1173	ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
1174	flags.dot_matches_new_line = Some(enable);
1175	}
1176	ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
1177	flags.swap_greed = Some(enable);
1178	}
1179	ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
1180	flags.unicode = Some(enable);
1181	}
1182	ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
1183	flags.crlf = Some(enable);
1184	}
1185	ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
1186	}
1187	}
1188	flags
1189	}
1190
1191	fn merge(&mut self, previous: &Flags) {
1192	if self.case_insensitive.is_none() {
1193	self.case_insensitive = previous.case_insensitive;
1194	}
1195	if self.multi_line.is_none() {
1196	self.multi_line = previous.multi_line;
1197	}
1198	if self.dot_matches_new_line.is_none() {
1199	self.dot_matches_new_line = previous.dot_matches_new_line;
1200	}
1201	if self.swap_greed.is_none() {
1202	self.swap_greed = previous.swap_greed;
1203	}
1204	if self.unicode.is_none() {
1205	self.unicode = previous.unicode;
1206	}
1207	if self.crlf.is_none() {
1208	self.crlf = previous.crlf;
1209	}
1210	}
1211
1212	fn dot(&self) -> hir::Dot {
1213	if self.dot_matches_new_line() {
1214	if self.unicode() {
1215	hir::Dot::AnyChar
1216	} else {
1217	hir::Dot::AnyByte
1218	}
1219	} else {
1220	if self.unicode() {
1221	if self.crlf() {
1222	hir::Dot::AnyCharExceptCRLF
1223	} else {
1224	hir::Dot::AnyCharExceptLF
1225	}
1226	} else {
1227	if self.crlf() {
1228	hir::Dot::AnyByteExceptCRLF
1229	} else {
1230	hir::Dot::AnyByteExceptLF
1231	}
1232	}
1233	}
1234	}
1235
1236	fn case_insensitive(&self) -> bool {
1237	self.case_insensitive.unwrap_or(`false`)
1238	}
1239
1240	fn multi_line(&self) -> bool {
1241	self.multi_line.unwrap_or(`false`)
1242	}
1243
1244	fn dot_matches_new_line(&self) -> bool {
1245	self.dot_matches_new_line.unwrap_or(`false`)
1246	}
1247
1248	fn swap_greed(&self) -> bool {
1249	self.swap_greed.unwrap_or(`false`)
1250	}
1251
1252	fn unicode(&self) -> bool {
1253	self.unicode.unwrap_or(`true`)
1254	}
1255
1256	fn crlf(&self) -> bool {
1257	self.crlf.unwrap_or(`false`)
1258	}
1259	}
1260
1261	fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1262	let ranges: Vec<_> = ascii_classimpl Iterator(kind)
1263	.map(\|(s: u8, e: u8)\| hir::ClassBytesRange::new(start:s, end:e))
1264	.collect();
1265	hir::ClassBytes::new(ranges)
1266	}
1267
1268	fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
1269	use crate::ast::ClassAsciiKind::*;
1270
1271	let slice: &'static [(u8, u8)] = match *kind {
1272	Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1273	Alpha => &[(b'A', b'Z'), (b'a', b'z')],
1274	Ascii => &[(b'`\x00`', b'`\x7F`')],
1275	Blank => &[(b'`\t`', b'`\t`'), (b' ', b' ')],
1276	Cntrl => &[(b'`\x00`', b'`\x1F`'), (b'`\x7F`', b'`\x7F`')],
1277	Digit => &[(b'0', b'9')],
1278	Graph => &[(b'!', b'~')],
1279	Lower => &[(b'a', b'z')],
1280	Print => &[(b' ', b'~')],
1281	Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1282	Space => &[
1283	(b'`\t`', b'`\t`'),
1284	(b'`\n`', b'`\n`'),
1285	(b'`\x0B`', b'`\x0B`'),
1286	(b'`\x0C`', b'`\x0C`'),
1287	(b'`\r`', b'`\r`'),
1288	(b' ', b' '),
1289	],
1290	Upper => &[(b'A', b'Z')],
1291	Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1292	Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1293	};
1294	slice.iter().copied()
1295	}
1296
1297	fn ascii_class_as_chars(
1298	kind: &ast::ClassAsciiKind,
1299	) -> impl Iterator<Item = (char, char)> {
1300	ascii_class(kind).map(\|(s: u8, e: u8)\| (char::from(s), char::from(e)))
1301	}
1302
1303	#[cfg(test)]
1304	mod tests {
1305	use crate::{
1306	ast::{self, parse::ParserBuilder, Ast, Position, Span},
1307	hir::{self, Hir, HirKind, Look, Properties},
1308	unicode::{self, ClassQuery},
1309	};
1310
1311	use super::*;
1312
1313	// We create these errors to compare with real hir::Errors in the tests.
1314	// We define equality between TestError and hir::Error to disregard the
1315	// pattern string in hir::Error, which is annoying to provide in tests.
1316	#[derive(Clone, Debug)]
1317	struct TestError {
1318	span: Span,
1319	kind: hir::ErrorKind,
1320	}
1321
1322	impl PartialEq<hir::Error> for TestError {
1323	fn eq(&self, other: &hir::Error) -> bool {
1324	self.span == other.span && self.kind == other.kind
1325	}
1326	}
1327
1328	impl PartialEq<TestError> for hir::Error {
1329	fn eq(&self, other: &TestError) -> bool {
1330	self.span == other.span && self.kind == other.kind
1331	}
1332	}
1333
1334	fn parse(pattern: &str) -> Ast {
1335	ParserBuilder::new().octal(`true`).build().parse(pattern).unwrap()
1336	}
1337
1338	fn t(pattern: &str) -> Hir {
1339	TranslatorBuilder::new()
1340	.utf8(`true`)
1341	.build()
1342	.translate(pattern, &parse(pattern))
1343	.unwrap()
1344	}
1345
1346	fn t_err(pattern: &str) -> hir::Error {
1347	TranslatorBuilder::new()
1348	.utf8(`true`)
1349	.build()
1350	.translate(pattern, &parse(pattern))
1351	.unwrap_err()
1352	}
1353
1354	fn t_bytes(pattern: &str) -> Hir {
1355	TranslatorBuilder::new()
1356	.utf8(`false`)
1357	.build()
1358	.translate(pattern, &parse(pattern))
1359	.unwrap()
1360	}
1361
1362	fn props(pattern: &str) -> Properties {
1363	t(pattern).properties().clone()
1364	}
1365
1366	fn props_bytes(pattern: &str) -> Properties {
1367	t_bytes(pattern).properties().clone()
1368	}
1369
1370	fn hir_lit(s: &str) -> Hir {
1371	hir_blit(s.as_bytes())
1372	}
1373
1374	fn hir_blit(s: &[u8]) -> Hir {
1375	Hir::literal(s)
1376	}
1377
1378	fn hir_capture(index: u32, expr: Hir) -> Hir {
1379	Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
1380	}
1381
1382	fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
1383	Hir::capture(hir::Capture {
1384	index,
1385	name: Some(name.into()),
1386	sub: Box::new(expr),
1387	})
1388	}
1389
1390	fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1391	Hir::repetition(hir::Repetition {
1392	min: `0`,
1393	max: Some(`1`),
1394	greedy,
1395	sub: Box::new(expr),
1396	})
1397	}
1398
1399	fn hir_star(greedy: bool, expr: Hir) -> Hir {
1400	Hir::repetition(hir::Repetition {
1401	min: `0`,
1402	max: None,
1403	greedy,
1404	sub: Box::new(expr),
1405	})
1406	}
1407
1408	fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1409	Hir::repetition(hir::Repetition {
1410	min: `1`,
1411	max: None,
1412	greedy,
1413	sub: Box::new(expr),
1414	})
1415	}
1416
1417	fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
1418	Hir::repetition(hir::Repetition {
1419	min,
1420	max,
1421	greedy,
1422	sub: Box::new(expr),
1423	})
1424	}
1425
1426	fn hir_alt(alts: Vec<Hir>) -> Hir {
1427	Hir::alternation(alts)
1428	}
1429
1430	fn hir_cat(exprs: Vec<Hir>) -> Hir {
1431	Hir::concat(exprs)
1432	}
1433
1434	#[allow(dead_code)]
1435	fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
1436	Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1437	}
1438
1439	#[allow(dead_code)]
1440	fn hir_uclass_perl_word() -> Hir {
1441	Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
1442	}
1443
1444	fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
1445	Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
1446	ascii_class_as_chars(kind)
1447	.map(\|(s, e)\| hir::ClassUnicodeRange::new(s, e)),
1448	)))
1449	}
1450
1451	fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
1452	Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
1453	ascii_class(kind).map(\|(s, e)\| hir::ClassBytesRange::new(s, e)),
1454	)))
1455	}
1456
1457	fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1458	Hir::class(uclass(ranges))
1459	}
1460
1461	fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1462	Hir::class(bclass(ranges))
1463	}
1464
1465	fn hir_case_fold(expr: Hir) -> Hir {
1466	match expr.into_kind() {
1467	HirKind::Class(mut cls) => {
1468	cls.case_fold_simple();
1469	Hir::class(cls)
1470	}
1471	_ => panic!("cannot case fold non-class Hir expr"),
1472	}
1473	}
1474
1475	fn hir_negate(expr: Hir) -> Hir {
1476	match expr.into_kind() {
1477	HirKind::Class(mut cls) => {
1478	cls.negate();
1479	Hir::class(cls)
1480	}
1481	_ => panic!("cannot negate non-class Hir expr"),
1482	}
1483	}
1484
1485	fn uclass(ranges: &[(char, char)]) -> hir::Class {
1486	let ranges: Vec<hir::ClassUnicodeRange> = ranges
1487	.iter()
1488	.map(\|&(s, e)\| hir::ClassUnicodeRange::new(s, e))
1489	.collect();
1490	hir::Class::Unicode(hir::ClassUnicode::new(ranges))
1491	}
1492
1493	fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
1494	let ranges: Vec<hir::ClassBytesRange> = ranges
1495	.iter()
1496	.map(\|&(s, e)\| hir::ClassBytesRange::new(s, e))
1497	.collect();
1498	hir::Class::Bytes(hir::ClassBytes::new(ranges))
1499	}
1500
/// Applies simple case folding to `cls` and wraps the result in an `Hir`.
#[cfg(feature = "unicode-case")]
fn class_case_fold(cls: hir::Class) -> Hir {
    let mut folded = cls;
    folded.case_fold_simple();
    Hir::class(folded)
}
1506
1507	fn class_negate(mut cls: hir::Class) -> Hir {
1508	cls.negate();
1509	Hir::class(cls)
1510	}
1511
1512	#[allow(dead_code)]
1513	fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1514	use crate::hir::Class::{Bytes, Unicode};
1515
1516	match (expr1.into_kind(), expr2.into_kind()) {
1517	(HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1518	c1.union(&c2);
1519	Hir::class(hir::Class::Unicode(c1))
1520	}
1521	(HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1522	c1.union(&c2);
1523	Hir::class(hir::Class::Bytes(c1))
1524	}
1525	_ => panic!("cannot union non-class Hir exprs"),
1526	}
1527	}
1528
1529	#[allow(dead_code)]
1530	fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1531	use crate::hir::Class::{Bytes, Unicode};
1532
1533	match (expr1.into_kind(), expr2.into_kind()) {
1534	(HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1535	c1.difference(&c2);
1536	Hir::class(hir::Class::Unicode(c1))
1537	}
1538	(HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1539	c1.difference(&c2);
1540	Hir::class(hir::Class::Bytes(c1))
1541	}
1542	_ => panic!("cannot difference non-class Hir exprs"),
1543	}
1544	}
1545
1546	fn hir_look(look: hir::Look) -> Hir {
1547	Hir::look(look)
1548	}
1549
/// Empty patterns, empty groups and empty alternation branches.
#[test]
fn empty() {
    let e = Hir::empty;
    let cap = |sub: Hir| hir_capture(1, sub);

    assert_eq!(t(""), e());
    assert_eq!(t("(?i)"), e());
    assert_eq!(t("()"), cap(e()));
    assert_eq!(t("(?:)"), e());
    assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", e()));
    assert_eq!(t("|"), hir_alt(vec![e(), e()]));

    let two_groups = hir_alt(vec![hir_capture(1, e()), hir_capture(2, e())]);
    assert_eq!(t("()|()"), two_groups);

    assert_eq!(t("(|b)"), cap(hir_alt(vec![e(), hir_lit("b")])));
    assert_eq!(t("(a|)"), cap(hir_alt(vec![hir_lit("a"), e()])));
    assert_eq!(
        t("(a||c)"),
        cap(hir_alt(vec![hir_lit("a"), e(), hir_lit("c")]))
    );
    assert_eq!(t("(||)"), cap(hir_alt(vec![e(), e(), e()])));
}
1588
/// Plain literals, both in Unicode mode and in byte mode.
#[test]
fn literal() {
    // Every pattern below sits on line 1, so only the offset and the
    // column of a position vary.
    let at = |offset: usize, column: usize| Position::new(offset, 1, column);

    assert_eq!(t("a"), hir_lit("a"));
    assert_eq!(t("(?-u)a"), hir_lit("a"));
    assert_eq!(t("☃"), hir_lit("☃"));
    assert_eq!(t("abcd"), hir_lit("abcd"));

    assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
    assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
    assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    let non_ascii_without_unicode = TestError {
        kind: hir::ErrorKind::UnicodeNotAllowed,
        span: Span::new(at(5, 6), at(8, 7)),
    };
    assert_eq!(t_err("(?-u)☃"), non_ascii_without_unicode);

    let raw_byte_in_utf8_mode = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(at(5, 6), at(9, 10)),
    };
    assert_eq!(t_err(r"(?-u)\xFF"), raw_byte_in_utf8_mode);
}
1622
/// Literals under the case insensitive flag, in Unicode and byte mode.
#[test]
fn literal_case_insensitive() {
    #[cfg(feature = "unicode-case")]
    {
        // Unicode class holding exactly the two given characters.
        let pair = |upper: char, lower: char| {
            hir_uclass(&[(upper, upper), (lower, lower)])
        };

        assert_eq!(t("(?i)a"), pair('A', 'a'));
        assert_eq!(t("(?i:a)"), pair('A', 'a'));
        assert_eq!(
            t("a(?i)a(?-i)a"),
            hir_cat(vec![hir_lit("a"), pair('A', 'a'), hir_lit("a")])
        );
        assert_eq!(
            t("(?i)ab@c"),
            hir_cat(vec![
                pair('A', 'a'),
                pair('B', 'b'),
                hir_lit("@"),
                pair('C', 'c'),
            ])
        );
        assert_eq!(
            t("(?i)β"),
            hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')])
        );
    }

    // Byte class holding exactly the two given bytes.
    let bpair =
        |upper: u8, lower: u8| hir_bclass(&[(upper, upper), (lower, lower)]);

    assert_eq!(t("(?i-u)a"), bpair(b'A', b'a'));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![hir_lit("a"), bpair(b'A', b'a'), hir_lit("a")])
    );
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            bpair(b'A', b'a'),
            bpair(b'B', b'b'),
            hir_lit("@"),
            bpair(b'C', b'c'),
        ])
    );

    assert_eq!(t_bytes("(?i-u)a"), bpair(b'A', b'a'));
    assert_eq!(t_bytes("(?i-u)\x61"), bpair(b'A', b'a'));
    assert_eq!(t_bytes(r"(?i-u)\x61"), bpair(b'A', b'a'));
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    let expected = TestError {
        kind: hir::ErrorKind::UnicodeNotAllowed,
        span: Span::new(Position::new(6, 1, 7), Position::new(8, 1, 8)),
    };
    assert_eq!(t_err("(?i-u)β"), expected);
}
1699
/// Translation of `.` under the `s`, `R` and `u` flags.
#[test]
fn dot() {
    const MAX_CHAR: char = '\u{10FFFF}';
    const MAX_BYTE: u8 = b'\xFF';

    assert_eq!(t("."), hir_uclass(&[('\0', '\t'), ('\x0B', MAX_CHAR)]));
    assert_eq!(
        t("(?R)."),
        hir_uclass(&[('\0', '\t'), ('\x0B', '\x0C'), ('\x0E', MAX_CHAR)])
    );
    assert_eq!(t("(?s)."), hir_uclass(&[('\0', MAX_CHAR)]));
    assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', MAX_CHAR)]));

    assert_eq!(
        t_bytes("(?-u)."),
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', MAX_BYTE)])
    );
    assert_eq!(
        t_bytes("(?R-u)."),
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\x0C'), (b'\x0E', MAX_BYTE)])
    );
    assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', MAX_BYTE)]));
    assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', MAX_BYTE)]));

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    // The error span covers just the one-byte `.` at `offset`; everything
    // is on line 1, where the column is the offset plus one.
    let invalid_utf8_at = |offset: usize| TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(
            Position::new(offset, 1, offset + 1),
            Position::new(offset + 1, 1, offset + 2),
        ),
    };
    assert_eq!(t_err("(?-u)."), invalid_utf8_at(5));
    assert_eq!(t_err("(?R-u)."), invalid_utf8_at(6));
    assert_eq!(t_err("(?s-u)."), invalid_utf8_at(6));
    assert_eq!(t_err("(?Rs-u)."), invalid_utf8_at(7));
}
1773
/// Anchors and word boundaries map onto the expected `Look` variants.
#[test]
fn assertions() {
    let cases = [
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        (r"\b", hir::Look::WordUnicode),
        (r"\B", hir::Look::WordUnicodeNegate),
        (r"(?-u)\b", hir::Look::WordAscii),
        (r"(?-u)\B", hir::Look::WordAsciiNegate),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look), "pattern: {:?}", pattern);
    }
}
1790
/// Capturing, named and non-capturing groups, including capture indices.
#[test]
fn group() {
    let lit = hir_lit;
    let cap = hir_capture;
    let named = hir_capture_name;

    assert_eq!(t("(a)"), cap(1, lit("a")));
    assert_eq!(
        t("(a)(b)"),
        hir_cat(vec![cap(1, lit("a")), cap(2, lit("b"))])
    );
    assert_eq!(
        t("(a)|(b)"),
        hir_alt(vec![cap(1, lit("a")), cap(2, lit("b"))])
    );
    assert_eq!(t("(?P<foo>)"), named(1, "foo", Hir::empty()));
    assert_eq!(t("(?P<foo>a)"), named(1, "foo", lit("a")));
    assert_eq!(
        t("(?P<foo>a)(?P<bar>b)"),
        hir_cat(vec![named(1, "foo", lit("a")), named(2, "bar", lit("b"))])
    );
    assert_eq!(t("(?:)"), Hir::empty());
    assert_eq!(t("(?:a)"), lit("a"));
    assert_eq!(t("(?:a)(b)"), hir_cat(vec![lit("a"), cap(1, lit("b"))]));
    assert_eq!(
        t("(a)(?:b)(c)"),
        hir_cat(vec![cap(1, lit("a")), lit("b"), cap(2, lit("c"))])
    );
    assert_eq!(
        t("(a)(?P<foo>b)(c)"),
        hir_cat(vec![
            cap(1, lit("a")),
            named(2, "foo", lit("b")),
            cap(3, lit("c")),
        ])
    );
    assert_eq!(t("()"), cap(1, Hir::empty()));
    assert_eq!(t("((?i))"), cap(1, Hir::empty()));
    assert_eq!(t("((?x))"), cap(1, Hir::empty()));
    assert_eq!(t("(((?x)))"), cap(1, cap(2, Hir::empty())));
}
1847
/// Line anchors under every combination of the `m` and `R` flags.
#[test]
fn line_anchors() {
    let cases = [
        // No flags.
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        // Multi-line only.
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        // CRLF only.
        (r"(?R)\A", hir::Look::Start),
        (r"(?R)\z", hir::Look::End),
        ("(?R)^", hir::Look::Start),
        ("(?R)$", hir::Look::End),
        // CRLF together with multi-line.
        (r"(?Rm)\A", hir::Look::Start),
        (r"(?Rm)\z", hir::Look::End),
        ("(?Rm)^", hir::Look::StartCRLF),
        ("(?Rm)$", hir::Look::EndCRLF),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look), "pattern: {:?}", pattern);
    }
}
1870
/// Flags set inline, set on a group, and toggled part way through a
/// pattern.
#[test]
fn flags() {
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i:a)a"),
        hir_cat(
            vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),]
        )
    );
    assert_eq!(
        t("(?i-u:a)β"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("β"),
        ])
    );
    assert_eq!(
        t("(?:(?i-u)a)b"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("b"),
        ])
    );
    assert_eq!(
        t("((?i-u)a)b"),
        hir_cat(vec![
            hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
            hir_lit("b"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?-i:a)a"),
        hir_cat(
            vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
        )
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::StartLF),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::StartLF),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::Start),
        ])
    );
    // Swap-greed: under `U`, `a*` is lazy and `a*?` is greedy; after `-U`
    // the usual meaning is restored. The pattern needs all four `*`
    // operators for the four star repetitions expected below.
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            hir_star(false, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(false, hir_lit("a")),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![
            hir_cat(vec![
                hir_lit("a"),
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
            ]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![
            hir_cat(vec![
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
                hir_lit("a"),
            ]),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
        ])
    );
}
1958
/// Every escaped meta character translates to its literal character.
#[test]
fn escape() {
    // The pattern escapes, in order: \ . + * ? ( ) | [ ] { } ^ $ #
    // The expected literal is the same sequence without the escapes, so
    // the pattern must include `\(\)` to account for the `()` in it.
    assert_eq!(
        t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
        hir_lit(r"\.+*?()|[]{}^$#")
    );
}
1966
/// Repetition operators, greedy and lazy, and their binding strength.
#[test]
fn repetition() {
    let a = || hir_lit("a");
    let b = || hir_lit("b");

    assert_eq!(t("a?"), hir_quest(true, a()));
    assert_eq!(t("a*"), hir_star(true, a()));
    assert_eq!(t("a+"), hir_plus(true, a()));
    assert_eq!(t("a??"), hir_quest(false, a()));
    assert_eq!(t("a*?"), hir_star(false, a()));
    assert_eq!(t("a+?"), hir_plus(false, a()));

    assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), a()));
    assert_eq!(t("a{1,}"), hir_range(true, 1, None, a()));
    assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), a()));
    assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), a()));
    assert_eq!(t("a{1,}?"), hir_range(false, 1, None, a()));
    assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), a()));

    // The operator applies to the nearest atom only.
    assert_eq!(t("ab?"), hir_cat(vec![a(), hir_quest(true, b())]));
    assert_eq!(
        t("(ab)?"),
        hir_quest(true, hir_capture(1, hir_lit("ab")))
    );
    assert_eq!(t("a|b?"), hir_alt(vec![a(), hir_quest(true, b())]));
}
1993
/// Concatenations and alternations of assertions, with and without
/// (nested) capture groups.
#[test]
fn cat_alt() {
    let start = || hir_look(hir::Look::Start);
    let end = || hir_look(hir::Look::End);
    let word = || hir_look(hir::Look::WordUnicode);
    let not_word = || hir_look(hir::Look::WordUnicodeNegate);

    // The three concatenations that recur in the patterns below.
    let start_end = || hir_cat(vec![start(), end()]);
    let end_word = || hir_cat(vec![end(), word()]);
    let word_not_word = || hir_cat(vec![word(), not_word()]);

    assert_eq!(t("(^$)"), hir_capture(1, start_end()));
    assert_eq!(t("^|$"), hir_alt(vec![start(), end()]));
    assert_eq!(t(r"^|$|\b"), hir_alt(vec![start(), end(), word()]));
    assert_eq!(
        t(r"^$|$\b|\b\B"),
        hir_alt(vec![start_end(), end_word(), word_not_word()])
    );
    assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![start(), end()])));
    assert_eq!(
        t(r"(^|$|\b)"),
        hir_capture(1, hir_alt(vec![start(), end(), word()]))
    );
    assert_eq!(
        t(r"(^$|$\b|\b\B)"),
        hir_capture(
            1,
            hir_alt(vec![start_end(), end_word(), word_not_word()])
        )
    );

    let innermost = hir_capture(3, word_not_word());
    let middle = hir_capture(2, hir_alt(vec![end_word(), innermost]));
    assert_eq!(
        t(r"(^$|($\b|(\b\B)))"),
        hir_capture(1, hir_alt(vec![start_end(), middle]))
    );
}
2045
// Checks that an alternation made up only of classes, such as
// '[a-z]|[A-Z]', is translated into the single class '[A-Za-z]' that is
// the union of its branches. The exception is an alternation in which
// some branches match invalid UTF-8 while others match non-ASCII Unicode.
#[test]
fn cat_class_flattened() {
    let ascii_letters = hir_uclass(&[('A', 'Z'), ('a', 'z')]);
    assert_eq!(t(r"[a-z]|[A-Z]"), ascii_letters);

    // Combining all of the letter properties should give us the one giant
    // letter property.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)
            \p{Lowercase_Letter}
            |\p{Uppercase_Letter}
            |\p{Titlecase_Letter}
            |\p{Modifier_Letter}
            |\p{Other_Letter}
        "),
        hir_uclass_query(ClassQuery::Binary("letter"))
    );

    // Byte classes that can truly match invalid UTF-8 cannot be combined
    // with Unicode classes.
    let kept_apart = hir_alt(vec![
        hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
        hir_bclass(&[(b'\x90', b'\xFF')]),
        hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
    ]);
    assert_eq!(t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"), kept_apart);

    // Byte classes on their own can be combined, even if some are ASCII
    // and others are invalid UTF-8.
    let combined =
        hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]);
    assert_eq!(t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"), combined);
}
2084
/// Each named ASCII class, plus negation, case folding and byte mode.
#[test]
fn class_ascii() {
    use crate::ast::ClassAsciiKind as Kind;

    // In Unicode mode, every named class translates to the Unicode class
    // holding exactly its ASCII ranges.
    let named = [
        ("[[:alnum:]]", Kind::Alnum),
        ("[[:alpha:]]", Kind::Alpha),
        ("[[:ascii:]]", Kind::Ascii),
        ("[[:blank:]]", Kind::Blank),
        ("[[:cntrl:]]", Kind::Cntrl),
        ("[[:digit:]]", Kind::Digit),
        ("[[:graph:]]", Kind::Graph),
        ("[[:lower:]]", Kind::Lower),
        ("[[:print:]]", Kind::Print),
        ("[[:punct:]]", Kind::Punct),
        ("[[:space:]]", Kind::Space),
        ("[[:upper:]]", Kind::Upper),
        ("[[:word:]]", Kind::Word),
        ("[[:xdigit:]]", Kind::Xdigit),
    ];
    for (pattern, kind) in named.iter() {
        assert_eq!(
            t(pattern),
            hir_ascii_uclass(kind),
            "pattern: {:?}",
            pattern
        );
    }

    assert_eq!(
        t("[[:^lower:]]"),
        hir_negate(hir_ascii_uclass(&Kind::Lower))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[[:lower:]]"),
        hir_uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ])
    );

    assert_eq!(t("(?-u)[[:lower:]]"), hir_ascii_bclass(&Kind::Lower));
    assert_eq!(
        t("(?i-u)[[:lower:]]"),
        hir_case_fold(hir_ascii_bclass(&Kind::Lower))
    );

    // A negated byte class is not ASCII-only and so is rejected in UTF-8
    // mode. The span covers the ten bytes of `[:^lower:]`.
    let invalid_utf8 = |start: usize| TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(
            Position::new(start, 1, start + 1),
            Position::new(start + 10, 1, start + 11),
        ),
    };
    assert_eq!(t_err("(?-u)[[:^lower:]]"), invalid_utf8(6));
    assert_eq!(t_err("(?i-u)[[:^lower:]]"), invalid_utf8(7));
}
2189
#[test]
fn class_ascii_multiple() {
    // Two POSIX classes inside one bracketed class must be unioned, in both
    // Unicode and byte mode.
    // See: https://github.com/rust-lang/regex/issues/680
    assert_eq!(
        t("[[:alnum:][:^ascii:]]"),
        hir_union(
            hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
            hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
        ),
    );
    assert_eq!(
        t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
        hir_union(
            hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
            hir_bclass(&[(0x80, 0xFF)]),
        ),
    );
}
2208
#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl_unicode() {
    // Builders for the two property-backed expectations used throughout.
    let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
    let space = || hir_uclass_query(ClassQuery::Binary("space"));

    // Unicode
    assert_eq!(t(r"\d"), digit());
    assert_eq!(t(r"\s"), space());
    assert_eq!(t(r"\w"), hir_uclass_perl_word());
    // The expected classes are the same with the `i` flag set.
    #[cfg(feature = "unicode-case")]
    {
        assert_eq!(t(r"(?i)\d"), digit());
        assert_eq!(t(r"(?i)\s"), space());
        assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
    }

    // Unicode, negated
    assert_eq!(t(r"\D"), hir_negate(digit()));
    assert_eq!(t(r"\S"), hir_negate(space()));
    assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
    #[cfg(feature = "unicode-case")]
    {
        assert_eq!(t(r"(?i)\D"), hir_negate(digit()));
        assert_eq!(t(r"(?i)\S"), hir_negate(space()));
        assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
    }
}
2252
#[test]
fn class_perl_ascii() {
    // ASCII only: each Perl class under `(?-u)` paired with the ASCII kind
    // it must translate to. The `i` flag does not change the expectation.
    let classes = [
        (r"(?-u)\d", ast::ClassAsciiKind::Digit),
        (r"(?-u)\s", ast::ClassAsciiKind::Space),
        (r"(?-u)\w", ast::ClassAsciiKind::Word),
        (r"(?i-u)\d", ast::ClassAsciiKind::Digit),
        (r"(?i-u)\s", ast::ClassAsciiKind::Space),
        (r"(?i-u)\w", ast::ClassAsciiKind::Word),
    ];
    for &(pattern, ref kind) in classes.iter() {
        assert_eq!(
            t(pattern),
            hir_ascii_bclass(kind),
            "pattern: {}",
            pattern
        );
    }

    // Negated patterns, the ASCII kind being negated, and the byte offsets
    // of the escape itself (used for the expected error span below).
    let negated = [
        (r"(?-u)\D", ast::ClassAsciiKind::Digit, 5, 7),
        (r"(?-u)\S", ast::ClassAsciiKind::Space, 5, 7),
        (r"(?-u)\W", ast::ClassAsciiKind::Word, 5, 7),
        (r"(?i-u)\D", ast::ClassAsciiKind::Digit, 6, 8),
        (r"(?i-u)\S", ast::ClassAsciiKind::Space, 6, 8),
        (r"(?i-u)\W", ast::ClassAsciiKind::Word, 6, 8),
    ];

    // ASCII only, negated
    for &(pattern, ref kind, _, _) in negated.iter() {
        assert_eq!(
            t_bytes(pattern),
            hir_negate(hir_ascii_bclass(kind)),
            "pattern: {}",
            pattern
        );
    }

    // ASCII only, negated, with UTF-8 mode enabled.
    // In this case, negating any Perl class results in an error because
    // all such classes can match invalid UTF-8.
    for &(pattern, _, start, end) in negated.iter() {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind: hir::ErrorKind::InvalidUtf8,
                // Single-line patterns: column is always offset + 1.
                span: Span::new(
                    Position::new(start, 1, start + 1),
                    Position::new(end, 1, end + 1),
                ),
            },
            "pattern: {}",
            pattern
        );
    }
}
2371
#[test]
#[cfg(not(feature = "unicode-perl"))]
fn class_perl_word_disabled() {
    // Compiled only when `unicode-perl` is off: `\w` must be reported as
    // unavailable, with the span covering the whole two-byte escape.
    assert_eq!(
        t_err(r"\w"),
        TestError {
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(2, 1, 3)
            ),
        }
    );
}
2386
#[test]
#[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
fn class_perl_space_disabled() {
    // Compiled only when both `unicode-perl` and `unicode-bool` are off:
    // `\s` must be reported as unavailable, spanning the whole escape.
    assert_eq!(
        t_err(r"\s"),
        TestError {
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(2, 1, 3)
            ),
        }
    );
}
2401
#[test]
#[cfg(all(
    not(feature = "unicode-perl"),
    not(feature = "unicode-gencat")
))]
fn class_perl_digit_disabled() {
    // Compiled only when both `unicode-perl` and `unicode-gencat` are off:
    // `\d` must be reported as unavailable, spanning the whole escape.
    assert_eq!(
        t_err(r"\d"),
        TestError {
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(2, 1, 3)
            ),
        }
    );
}
2419
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_gencat() {
    // Different spellings (one-letter form, long name, odd case/spacing,
    // explicit `gc:`/`gc=` prefix) paired with the query they must equal.
    let lookups = [
        (r"\pZ", "Z"),
        (r"\pz", "Z"),
        (r"\p{Separator}", "Z"),
        (r"\p{se PaRa ToR}", "Z"),
        (r"\p{gc:Separator}", "Z"),
        (r"\p{gc=Separator}", "Z"),
        (r"\p{Other}", "Other"),
        (r"\pC", "Other"),
    ];
    for &(pattern, name) in lookups.iter() {
        assert_eq!(
            t(pattern),
            hir_uclass_query(ClassQuery::Binary(name)),
            "pattern: {}",
            pattern
        );
    }

    // Negated forms, including the `!=` operator.
    let negated = [r"\PZ", r"\P{separator}", r"\P{gc!=separator}"];
    for &pattern in negated.iter() {
        assert_eq!(
            t(pattern),
            hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))),
            "pattern: {}",
            pattern
        );
    }

    // `any`, `assigned` and `ascii`, with and without the `gc:` prefix.
    let special = [
        (r"\p{any}", "Any"),
        (r"\p{assigned}", "Assigned"),
        (r"\p{ascii}", "ASCII"),
        (r"\p{gc:any}", "Any"),
        (r"\p{gc:assigned}", "Assigned"),
        (r"\p{gc:ascii}", "ASCII"),
    ];
    for &(pattern, name) in special.iter() {
        assert_eq!(
            t(pattern),
            hir_uclass_query(ClassQuery::Binary(name)),
            "pattern: {}",
            pattern
        );
    }

    // Failures: (pattern, expected error kind, span start, span end).
    let failures = vec![
        (r"(?-u)\pZ", hir::ErrorKind::UnicodeNotAllowed, 5, 8),
        (
            r"(?-u)\p{Separator}",
            hir::ErrorKind::UnicodeNotAllowed,
            5,
            18,
        ),
        (r"\pE", hir::ErrorKind::UnicodePropertyNotFound, 0, 3),
        (r"\p{Foo}", hir::ErrorKind::UnicodePropertyNotFound, 0, 7),
        (
            r"\p{gc:Foo}",
            hir::ErrorKind::UnicodePropertyValueNotFound,
            0,
            10,
        ),
    ];
    for (pattern, kind, start, end) in failures {
        assert_eq!(
            t_err(pattern),
            TestError {
                kind,
                // Single-line patterns: column is always offset + 1.
                span: Span::new(
                    Position::new(start, 1, start + 1),
                    Position::new(end, 1, end + 1)
                ),
            },
            "pattern: {}",
            pattern
        );
    }
}
2533
#[test]
#[cfg(not(feature = "unicode-gencat"))]
fn class_unicode_gencat_disabled() {
    // Compiled only when `unicode-gencat` is off: general category lookups
    // must fail with "property not found", spanning the whole `\p{...}`.
    assert_eq!(
        t_err(r"\p{Separator}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(13, 1, 14)
            ),
        }
    );

    assert_eq!(
        t_err(r"\p{Any}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(7, 1, 8)
            ),
        }
    );
}
2559
#[test]
#[cfg(feature = "unicode-script")]
fn class_unicode_script() {
    // A script name resolves to the corresponding class query.
    assert_eq!(
        t(r"\p{Greek}"),
        hir_uclass_query(ClassQuery::Binary("Greek"))
    );
    // With `(?i)` the expected class is case folded; for `\P` the negation
    // is applied outside the case folding.
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\p{Greek}"),
        hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\P{Greek}"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "Greek"
        ))))
    );

    // Unknown values for `sc`/`scx` yield "property value not found".
    assert_eq!(
        t_err(r"\p{sc:Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(10, 1, 11)
            ),
        }
    );
    assert_eq!(
        t_err(r"\p{scx:Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(11, 1, 12)
            ),
        }
    );
}
2601
#[test]
#[cfg(not(feature = "unicode-script"))]
fn class_unicode_script_disabled() {
    // Compiled only when `unicode-script` is off: script lookups must fail
    // with "property not found", spanning the whole `\p{...}`.
    assert_eq!(
        t_err(r"\p{Greek}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(9, 1, 10)
            ),
        }
    );

    assert_eq!(
        t_err(r"\p{scx:Greek}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(13, 1, 14)
            ),
        }
    );
}
2627
#[test]
#[cfg(feature = "unicode-age")]
fn class_unicode_age() {
    // An unknown value for the `age` property yields "property value not
    // found", spanning the whole `\p{...}`.
    assert_eq!(
        t_err(r"\p{age:Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(11, 1, 12)
            ),
        }
    );
}
2642
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_any_empty() {
    // The complement of `any` is expected to be the empty Unicode class.
    let translated = t(r"\P{any}");
    assert_eq!(translated, hir_uclass(&[]));
}
2648
#[test]
#[cfg(not(feature = "unicode-age"))]
fn class_unicode_age_disabled() {
    // Compiled only when `unicode-age` is off: even a well-formed age
    // lookup must fail with "property not found".
    assert_eq!(
        t_err(r"\p{age:3.0}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(11, 1, 12)
            ),
        }
    );
}
2663
#[test]
fn class_bracketed() {
    // Simple items and ranges. A single-character class is expected to
    // come back as a literal rather than a class.
    assert_eq!(t("[a]"), hir_lit("a"));
    assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
    assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
    assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
    assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
    assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
    assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
    // Same as above, but with a real newline character in the pattern.
    assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));

    // Perl and Unicode classes nested in brackets; a double negation
    // (`[^\D]`, `[^\PZ]`) must equal the plain class.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\pZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\p{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\PZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\P{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(all(
        feature = "unicode-case",
        any(feature = "unicode-perl", feature = "unicode-gencat")
    ))]
    assert_eq!(
        t(r"(?i)[^\D]"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\P{greek}]"),
        hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
    );

    // Byte classes.
    assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
    assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
    assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));

    // Case insensitivity, Unicode and ASCII-only.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[k]"),
        hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[β]"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );
    assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));

    // Negated classes.
    assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
    assert_eq!(
        t_bytes("(?-u)[^a]"),
        class_negate(bclass(&[(b'a', b'a')]))
    );
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(
        t(r"[^\d]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\pZ]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\p{separator}]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\p{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[\P{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );

    // Test some weird cases.
    assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));

    assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
    assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));

    assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
    assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));

    assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
    assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));

    // A negated byte class is rejected by `t_err`; the span covers the
    // bracketed class only, not the leading flags.
    assert_eq!(
        t_err("(?-u)[^a]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(9, 1, 10)
            ),
        }
    );
    // A class and its complement, negated together, leave nothing.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
}
2803
#[test]
fn class_bracketed_union() {
    assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));

    #[cfg(feature = "unicode-gencat")]
    {
        assert_eq!(
            t(r"[a\pZb]"),
            hir_union(
                hir_uclass(&[('a', 'b')]),
                hir_uclass_query(ClassQuery::Binary("separator")),
            )
        );
    }

    #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
    {
        assert_eq!(
            t(r"[\pZ\p{Greek}]"),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator")),
            )
        );
    }

    // Everything below needs age, general category and script data; the
    // case-insensitive variants additionally need `unicode-case`.
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    {
        // Builders for the sub-expressions shared by the expectations.
        let age_3_0 = || {
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            })
        };
        let greek_or_separator = || {
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator")),
            )
        };

        assert_eq!(
            t(r"[\p{age:3.0}\pZ\p{Greek}]"),
            hir_union(age_3_0(), greek_or_separator())
        );
        assert_eq!(
            t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
            hir_union(
                age_3_0(),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("cyrillic")),
                    greek_or_separator(),
                ),
            )
        );

        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
            hir_case_fold(hir_union(age_3_0(), greek_or_separator()))
        );
        assert_eq!(
            t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_union(age_3_0(), greek_or_separator()))
        );
        #[cfg(feature = "unicode-case")]
        assert_eq!(
            t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
            hir_negate(hir_case_fold(hir_union(
                age_3_0(),
                greek_or_separator()
            )))
        );
    }
}
2920
#[test]
fn class_bracketed_nested() {
    // A nested negated class unioned with items it already covers.
    for &pattern in [r"[a[^c]]", r"[a-b[^c]]"].iter() {
        assert_eq!(t(pattern), class_negate(uclass(&[('c', 'c')])));
    }
    assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));

    // Negating the whole union.
    for &pattern in [r"[^a[^c]]", r"[^a-b[^c]]"].iter() {
        assert_eq!(t(pattern), hir_uclass(&[('c', 'c')]));
    }

    // The same shapes under `(?i)`.
    #[cfg(feature = "unicode-case")]
    {
        for &pattern in [r"(?i)[a[^c]]", r"(?i)[a-b[^c]]"].iter() {
            assert_eq!(
                t(pattern),
                hir_negate(class_case_fold(uclass(&[('c', 'c')])))
            );
        }
        for &pattern in [r"(?i)[^a[^c]]", r"(?i)[^a-b[^c]]"].iter() {
            assert_eq!(t(pattern), hir_uclass(&[('C', 'C'), ('c', 'c')]));
        }
    }

    // Negating a union that covers everything yields the empty class.
    assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]));
}
2953
#[test]
fn class_bracketed_intersect() {
    // Unicode mode: (pattern, lower bound, upper bound) of the single
    // range expected to remain after intersection.
    let unicode = [
        ("[abc&&b-c]", 'b', 'c'),
        ("[abc&&[b-c]]", 'b', 'c'),
        ("[[abc]&&[b-c]]", 'b', 'c'),
        ("[a-z&&b-y&&c-x]", 'c', 'x'),
        ("[c-da-b&&a-d]", 'a', 'd'),
        ("[a-d&&c-da-b]", 'a', 'd'),
        (r"[a-z&&a-c]", 'a', 'c'),
        (r"[[a-z&&a-c]]", 'a', 'c'),
    ];
    for &(pattern, lo, hi) in unicode.iter() {
        assert_eq!(t(pattern), hir_uclass(&[(lo, hi)]));
    }
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));

    // Byte mode.
    let bytes = [
        ("(?-u)[abc&&b-c]", b'b', b'c'),
        ("(?-u)[abc&&[b-c]]", b'b', b'c'),
        ("(?-u)[[abc]&&[b-c]]", b'b', b'c'),
        ("(?-u)[a-z&&b-y&&c-x]", b'c', b'x'),
        ("(?-u)[c-da-b&&a-d]", b'a', b'd'),
        ("(?-u)[a-d&&c-da-b]", b'a', b'd'),
    ];
    for &(pattern, lo, hi) in bytes.iter() {
        assert_eq!(t(pattern), hir_bclass(&[(lo, hi)]));
    }

    // Unicode mode, case insensitive.
    #[cfg(feature = "unicode-case")]
    {
        let folded = [
            ("(?i)[abc&&b-c]", 'b', 'c'),
            ("(?i)[abc&&[b-c]]", 'b', 'c'),
            ("(?i)[[abc]&&[b-c]]", 'b', 'c'),
            ("(?i)[a-z&&b-y&&c-x]", 'c', 'x'),
            ("(?i)[c-da-b&&a-d]", 'a', 'd'),
            ("(?i)[a-d&&c-da-b]", 'a', 'd'),
        ];
        for &(pattern, lo, hi) in folded.iter() {
            assert_eq!(t(pattern), hir_case_fold(hir_uclass(&[(lo, hi)])));
        }
    }

    // Byte mode, case insensitive.
    let folded_bytes = [
        ("(?i-u)[abc&&b-c]", b'b', b'c'),
        ("(?i-u)[abc&&[b-c]]", b'b', b'c'),
        ("(?i-u)[[abc]&&[b-c]]", b'b', b'c'),
        ("(?i-u)[a-z&&b-y&&c-x]", b'c', b'x'),
        ("(?i-u)[c-da-b&&a-d]", b'a', b'd'),
        ("(?i-u)[a-d&&c-da-b]", b'a', b'd'),
    ];
    for &(pattern, lo, hi) in folded_bytes.iter() {
        assert_eq!(t(pattern), hir_case_fold(hir_bclass(&[(lo, hi)])));
    }

    // In `[a^]`, `^` does not need to be escaped, so it makes sense that
    // `^` is also allowed to be unescaped after `&&`.
    assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
    // `]` needs to be escaped after `&&` since it's not at start of class.
    assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
    assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
    // Test precedence.
    let remaining = hir_uclass(&[('a', 'b'), ('h', 'w')]);
    assert_eq!(t(r"[a-w&&[^c-g]z]"), remaining);
}
3043
#[test]
fn class_bracketed_intersect_negate() {
    // Unicode mode. `\w` and `\d` need Unicode Perl class data here, hence
    // the feature gates on the assertions that use them.
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^\w&&\d]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^[\w&&\d]]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^[^\w&&\d]]"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(feature = "unicode-perl")]
    assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));

    // Byte mode. These only involve ASCII classes, so none of them is
    // gated on `unicode-perl`; the first one used to be, unlike its
    // siblings, which needlessly skipped it in reduced-feature builds.
    assert_eq!(
        t_bytes(r"(?-u)[^\w&&\d]"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[a-z&&a-c]]"),
        hir_negate(hir_bclass(&[(b'a', b'c')]))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[\w&&\d]]"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[^\w&&\d]]"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
    );
    assert_eq!(
        t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
    );
}
3087
#[test]
fn class_bracketed_difference() {
    // `--` subtracts the right-hand class from the left-hand one.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\pL--[:ascii:]]"),
        hir_difference(
            hir_uclass_query(ClassQuery::Binary("letter")),
            hir_uclass(&[('\0', '\x7F')])
        )
    );

    // ASCII letters minus lowercase leaves only the uppercase range.
    assert_eq!(
        t(r"(?-u)[[:alpha:]--[:lower:]]"),
        hir_bclass(&[(b'A', b'Z')])
    );
}
3104
#[test]
fn class_bracketed_symmetric_difference() {
    // `~~` keeps what is in exactly one of the two operands.
    #[cfg(feature = "unicode-script")]
    assert_eq!(
        t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
        hir_uclass(&[
            ('\u{0342}', '\u{0342}'),
            ('\u{0345}', '\u{0345}'),
            ('\u{1DC0}', '\u{1DC1}'),
        ])
    );
    // The overlap `c-g` is removed from `a-g` and `c-j`.
    assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));

    assert_eq!(
        t(r"(?-u)[a-g~~c-j]"),
        hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
    );
}
3123
#[test]
fn ignore_whitespace() {
    // Under `(?x)`, whitespace and `#` comments may appear between the
    // pieces of an escape or a counted repetition without changing the
    // result. The continuation lines of the multi-line patterns below
    // start at column 0; their leading whitespace is not significant
    // in this mode.
    assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
    assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
{ # comment
53 # comment
} #comment"),
        hir_lit("S")
    );

    assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
53 # comment"),
        hir_lit("S")
    );
    assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)\p # comment
{ # comment
Separator # comment
} # comment"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );

    assert_eq!(
        t(r"(?x)a # comment
{ # comment
5 # comment
, # comment
10 # comment
} # comment"),
        hir_range(true, 5, Some(10), hir_lit("a"))
    );

    // An escaped space is kept as a literal space.
    assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
}
3165
#[test]
fn analysis_is_utf8() {
    // Positive examples.
    let reported_utf8 = [
        r"a",
        r"ab",
        r"(?-u)a",
        r"(?-u)ab",
        r"\xFF",
        r"\xFF\xFF",
        r"[^a]",
        r"[^a][^a]",
        r"\b",
        r"\B",
        r"(?-u)\b",
        r"(?-u)\B",
    ];
    for &pattern in reported_utf8.iter() {
        assert!(props_bytes(pattern).is_utf8());
    }

    // Negative examples.
    let reported_not_utf8 = [
        r"(?-u)\xFF",
        r"(?-u)\xFF\xFF",
        r"(?-u)[^a]",
        r"(?-u)[^a][^a]",
    ];
    for &pattern in reported_not_utf8.iter() {
        assert!(!props_bytes(pattern).is_utf8());
    }
}
3188
#[test]
fn analysis_captures_len() {
    // (expected number of explicit capture groups, pattern)
    let cases: &[(usize, &str)] = &[
        (0, r"a"),
        (0, r"(?:a)"),
        (0, r"(?i-u:a)"),
        (0, r"(?i-u)a"),
        (1, r"(a)"),
        (1, r"(?P<foo>a)"),
        (1, r"()"),
        (1, r"()a"),
        (1, r"(a)+"),
        (2, r"(a)(b)"),
        (2, r"(a)|(b)"),
        (2, r"((a))"),
        (1, r"([a&&b])"),
    ];
    for &(expected, pattern) in cases {
        assert_eq!(
            expected,
            props(pattern).explicit_captures_len(),
            "pattern: {}",
            pattern
        );
    }
}
3205
#[test]
fn analysis_static_captures_len() {
    let len =
        |pattern: &str| props(pattern).static_explicit_captures_len();

    // (expected static capture count, pattern). `None` is expected where
    // the assertions treat the count as not fixed, e.g. when alternation
    // branches differ or a group sits under an optional repetition.
    let cases: &[(Option<usize>, &str)] = &[
        (Some(0), r""),
        (Some(0), r"foo|bar"),
        (None, r"(foo)|bar"),
        (None, r"foo|(bar)"),
        (Some(1), r"(foo|bar)"),
        (Some(1), r"(a|b|c|d|e|f)"),
        (Some(1), r"(a)|(b)|(c)|(d)|(e)|(f)"),
        (Some(2), r"(a)(b)|(c)(d)|(e)(f)"),
        (Some(6), r"(a)(b)(c)(d)(e)(f)"),
        (Some(3), r"(a)(b)(extra)|(a)(b)()"),
        (Some(3), r"(a)(b)((?:extra)?)"),
        (None, r"(a)(b)(extra)?"),
        (Some(1), r"(foo)|(bar)"),
        (Some(2), r"(foo)(bar)"),
        (Some(2), r"(foo)+(bar)"),
        (None, r"(foo)*(bar)"),
        (Some(0), r"(foo)?{0}"),
        (None, r"(foo)?{1}"),
        (Some(1), r"(foo){1}"),
        (Some(1), r"(foo){1,}"),
        (Some(1), r"(foo){1,}?"),
        (None, r"(foo){1,}??"),
        (None, r"(foo){0,}"),
        (Some(1), r"(foo)(?:bar)"),
        (Some(2), r"(foo(?:bar)+)(?:baz(boo))"),
        (Some(2), r"(?P<bar>foo)(?:bar)(bal|loon)"),
        (
            Some(2),
            r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#,
        ),
    ];
    for &(expected, pattern) in cases {
        assert_eq!(expected, len(pattern), "pattern: {}", pattern);
    }
}
3240
#[test]
fn analysis_is_all_assertions() {
    // Positive examples: patterns made only of look-around assertions
    // (optionally grouped, repeated or alternated). Each must report a
    // non-empty look set and a minimum length of zero.
    let only_assertions = [
        r"\b",
        r"\B",
        r"^",
        r"$",
        r"\A",
        r"\z",
        r"$^\z\A\b\B",
        r"$|^|\z|\A|\b|\B",
        r"^$|$^",
        r"((\b)+())*^",
    ];
    for &pattern in only_assertions.iter() {
        let p = props(pattern);
        assert!(!p.look_set().is_empty(), "pattern: {}", pattern);
        assert_eq!(p.minimum_len(), Some(0), "pattern: {}", pattern);
    }

    // Negative examples: an assertion followed by a literal still has a
    // non-empty look set, but its minimum length is no longer zero.
    let p = props(r"^a");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(1));
}
3289
#[test]
fn analysis_look_set_prefix_any() {
    // Only one of the two leading alternatives is a word boundary; the
    // "any" prefix look-set is still expected to report it.
    let pattern = r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))";
    let properties = props(pattern);
    let has_ascii_word_boundary =
        properties.look_set_prefix_any().contains(Look::WordAscii);
    assert!(has_ascii_word_boundary);
}
3295
#[test]
fn analysis_is_anchored() {
    // A pattern is treated as start-anchored when `Look::Start` appears in
    // its prefix look-set, and as end-anchored when `Look::End` appears in
    // its suffix look-set.
    let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
    let is_end = |p| props(p).look_set_suffix().contains(Look::End);

    // Positive examples.
    assert!(is_start(r"^"));
    assert!(is_end(r"$"));

    assert!(is_start(r"^^"));
    assert!(is_end(r"$$"));

    assert!(is_start(r"^$"));
    assert!(is_end(r"^$"));

    assert!(is_start(r"^foo"));
    assert!(is_end(r"foo$"));

    assert!(is_start(r"^foo|^bar"));
    assert!(is_end(r"foo$|bar$"));

    assert!(is_start(r"^(foo|bar)"));
    assert!(is_end(r"(foo|bar)$"));

    // Repetitions that require at least one iteration keep the anchor.
    assert!(is_start(r"^+"));
    assert!(is_end(r"$+"));
    assert!(is_start(r"^++"));
    assert!(is_end(r"$++"));
    assert!(is_start(r"(^)+"));
    assert!(is_end(r"($)+"));

    // Both anchors are zero-width, so `$^` is expected to be anchored at
    // both ends. The second assertion previously repeated `is_start`; the
    // end-anchored expectation is already implied by the `$^|^$` case below.
    assert!(is_start(r"$^"));
    assert!(is_end(r"$^"));
    assert!(is_start(r"$^|^$"));
    assert!(is_end(r"$^|^$"));

    // Other zero-width assertions next to the anchor do not hide it.
    assert!(is_start(r"\b^"));
    assert!(is_end(r"$\b"));
    assert!(is_start(r"^(?m:^)"));
    assert!(is_end(r"(?m:$)$"));
    assert!(is_start(r"(?m:^)^"));
    assert!(is_end(r"$(?m:$)"));

    // Negative examples.
    // Multi-line anchors are line anchors, not `Look::Start`/`Look::End`.
    assert!(!is_start(r"(?m)^"));
    assert!(!is_end(r"(?m)$"));
    assert!(!is_start(r"(?m:^$)|$^"));
    assert!(!is_end(r"(?m:^$)|$^"));
    assert!(!is_start(r"$^|(?m:^$)"));
    assert!(!is_end(r"$^|(?m:^$)"));

    assert!(!is_start(r"a^"));
    assert!(!is_start(r"$a"));

    assert!(!is_end(r"a^"));
    assert!(!is_end(r"$a"));

    // Every alternation branch must be anchored.
    assert!(!is_start(r"^foo|bar"));
    assert!(!is_end(r"foo|bar$"));

    // Repetitions that may match zero times drop the anchor.
    assert!(!is_start(r"^*"));
    assert!(!is_end(r"$*"));
    assert!(!is_start(r"^*+"));
    assert!(!is_end(r"$*+"));
    assert!(!is_start(r"^+*"));
    assert!(!is_end(r"$+*"));
    assert!(!is_start(r"(^)*"));
    assert!(!is_end(r"($)*"));
}
3365
#[test]
fn analysis_is_any_anchored() {
    // These predicates consult the pattern's full look-set, i.e. whether
    // the anchor occurs anywhere in the pattern.
    let has_start = |pattern| props(pattern).look_set().contains(Look::Start);
    let has_end = |pattern| props(pattern).look_set().contains(Look::End);

    // Positive examples.
    for pattern in [r"^", r"\A"].iter() {
        assert!(has_start(pattern), "pattern: {:?}", pattern);
    }
    for pattern in [r"$", r"\z"].iter() {
        assert!(has_end(pattern), "pattern: {:?}", pattern);
    }

    // Negative examples.
    for pattern in [r"(?m)^", r"$"].iter() {
        assert!(!has_start(pattern), "pattern: {:?}", pattern);
    }
    for pattern in [r"(?m)$", r"^"].iter() {
        assert!(!has_end(pattern), "pattern: {:?}", pattern);
    }
}
3383
#[test]
fn analysis_can_empty() {
    // Positive examples: patterns that can match the empty string, which is
    // observed here as a minimum match length of exactly zero.
    let assert_empty =
        |p| assert_eq!(Some(0), props_bytes(p).minimum_len());
    assert_empty(r"");
    assert_empty(r"()");
    assert_empty(r"()*");
    assert_empty(r"()+");
    assert_empty(r"()?");
    assert_empty(r"a*");
    assert_empty(r"a?");
    assert_empty(r"a{0}");
    assert_empty(r"a{0,}");
    assert_empty(r"a{0,1}");
    assert_empty(r"a{0,10}");
    #[cfg(feature = "unicode-gencat")]
    assert_empty(r"\pL*");
    assert_empty(r"a*|b");
    assert_empty(r"b|a*");
    assert_empty(r"a|");
    assert_empty(r"|a");
    assert_empty(r"a||b");
    // Every piece of this concatenation is optional. Without the `*`
    // operators the pattern would require at least `a` and `abcd`, and so
    // could never have a minimum length of zero.
    assert_empty(r"a*a?(abcd)*");
    assert_empty(r"^");
    assert_empty(r"$");
    assert_empty(r"(?m)^");
    assert_empty(r"(?m)$");
    assert_empty(r"\A");
    assert_empty(r"\z");
    assert_empty(r"\B");
    assert_empty(r"(?-u)\B");
    assert_empty(r"\b");
    assert_empty(r"(?-u)\b");

    // Negative examples: the minimum length is anything other than
    // `Some(0)`, whether a positive length or `None`.
    let assert_non_empty =
        |p| assert_ne!(Some(0), props_bytes(p).minimum_len());
    assert_non_empty(r"a+");
    assert_non_empty(r"a{1}");
    assert_non_empty(r"a{1,}");
    assert_non_empty(r"a{1,2}");
    assert_non_empty(r"a{1,10}");
    assert_non_empty(r"b|a");
    assert_non_empty(r"aa+(abcd)");
    #[cfg(feature = "unicode-gencat")]
    assert_non_empty(r"\P{any}");
    // Character classes that end up containing nothing at all.
    assert_non_empty(r"[a--a]");
    assert_non_empty(r"[a&&b]");
}
3434
#[test]
fn analysis_is_literal() {
    // Positive examples: flags, non-capturing groups and single-character
    // classes do not stop a pattern from being reported as a literal.
    let literal = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"(?:a)",
        r"foo(?:a)",
        r"(?:a)foo",
        r"[a]",
    ];
    for pattern in literal.iter() {
        assert!(props(pattern).is_literal(), "pattern: {:?}", pattern);
    }

    // Negative examples: empty patterns, assertions, alternations, capture
    // groups, repetitions and multi-character classes.
    let not_literal = [
        r"",
        r"^",
        r"a|b",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
    ];
    for pattern in not_literal.iter() {
        assert!(!props(pattern).is_literal(), "pattern: {:?}", pattern);
    }
}
3457
#[test]
fn analysis_is_alternation_literal() {
    // Positive examples: a plain literal, or an alternation whose branches
    // all remain literals after translation.
    let alternation_literal = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"foo|bar",
        r"foo|bar|baz",
        r"[a]",
        r"(?:ab)|cd",
        r"ab|(?:cd)",
    ];
    for pattern in alternation_literal.iter() {
        assert!(
            props(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }

    // Negative examples. Alternations of single characters such as `a|b`
    // are listed here as well: they are expected not to qualify.
    let not_alternation_literal = [
        r"",
        r"^",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
        r"[ab]|b",
        r"a|[ab]",
        r"(a)|b",
        r"a|(b)",
        r"a|b",
        r"a|b|c",
        r"[a]|b",
        r"a|[b]",
        r"(?:a)|b",
        r"a|(?:b)",
        r"(?:z|xx)@|xx",
    ];
    for pattern in not_alternation_literal.iter() {
        assert!(
            !props(pattern).is_alternation_literal(),
            "pattern: {:?}",
            pattern
        );
    }
}
3491
// Checks that the smart Hir::concat constructor simplifies the given
// expressions in the way we expect.
#[test]
fn smart_concat() {
    // An empty pattern and an empty non-capturing group both translate to
    // the empty Hir.
    for pattern in ["", "(?:)"].iter() {
        assert_eq!(t(pattern), Hir::empty());
    }

    // Neighbouring literals, including those wrapped in non-capturing
    // groups, end up as one literal.
    let merged_literals = [
        ("abc", "abc"),
        ("(?:foo)(?:bar)", "foobar"),
        ("quux(?:foo)(?:bar)baz", "quuxfoobarbaz"),
    ];
    for &(pattern, literal) in merged_literals.iter() {
        assert_eq!(t(pattern), hir_lit(literal));
    }

    // A look-around in the middle splits the literal in two, regardless of
    // how the surrounding groups are nested.
    let split_by_start_anchor = hir_cat(vec![
        hir_lit("foobar"),
        hir_look(hir::Look::Start),
        hir_lit("bazquux"),
    ]);
    assert_eq!(t("foo(?:bar^baz)quux"), split_by_start_anchor);
    assert_eq!(t("foo(?:ba(?:r^b)az)quux"), split_by_start_anchor);
}
3518
// Checks that the smart Hir::alternation constructor simplifies the given
// expressions in the way we expect.
#[test]
fn smart_alternation() {
    // The class `[A-Z]`, which shows up in several expectations below.
    let upper = || hir_uclass(&[('A', 'Z')]);
    // An alternation whose branches are exactly the given literals.
    let lit_alt = |words: &[&str]| {
        hir_alt(words.iter().map(|word| hir_lit(word)).collect())
    };

    // Nested alternations inside non-capturing groups are flattened.
    assert_eq!(t("(?:foo)|(?:bar)"), lit_alt(&["foo", "bar"]));
    assert_eq!(
        t("quux|(?:abc|def|xyz)|baz"),
        lit_alt(&["quux", "abc", "def", "xyz", "baz"])
    );
    assert_eq!(
        t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
        lit_alt(&["quux", "abc", "def", "mno", "xyz", "baz"])
    );

    // An alternation of single characters becomes a character class.
    let single_chars = hir_uclass(&[('a', 'f'), ('x', 'z')]);
    assert_eq!(t("a|b|c|d|e|f|x|y|z"), single_chars);

    // Tests that we lift common prefixes out of an alternation.
    let foo_or_quux = hir_cat(vec![upper(), lit_alt(&["foo", "quux"])]);
    assert_eq!(t("[A-Z]foo|[A-Z]quux"), foo_or_quux);

    let class_or_quux = hir_cat(vec![
        upper(),
        hir_alt(vec![upper(), hir_lit("quux")]),
    ]);
    assert_eq!(t("[A-Z][A-Z]|[A-Z]quux"), class_or_quux);

    let optional_quux = hir_cat(vec![
        upper(),
        upper(),
        hir_alt(vec![Hir::empty(), hir_lit("quux")]),
    ]);
    assert_eq!(t("[A-Z][A-Z]|[A-Z][A-Z]quux"), optional_quux);

    let foo_or_foobar = hir_cat(vec![upper(), lit_alt(&["foo", "foobar"])]);
    assert_eq!(t("[A-Z]foo|[A-Z]foobar"), foo_or_foobar);
}
3583	}
3584