1/*!
2Defines a translator that converts an `Ast` to an `Hir`.
3*/
4
5use core::cell::{Cell, RefCell};
6
7use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
8
9use crate::{
10 ast::{self, Ast, Span, Visitor},
11 either::Either,
12 hir::{self, Error, ErrorKind, Hir, HirKind},
13 unicode::{self, ClassQuery},
14};
15
16type Result<T> = core::result::Result<T, Error>;
17
/// A builder for constructing an AST->HIR translator.
///
/// The settings stored here are copied into every [`Translator`] produced by
/// [`TranslatorBuilder::build`].
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    /// Whether translation must only produce expressions that match valid
    /// UTF-8. Enabled by default.
    utf8: bool,
    /// The byte that `.` refuses to match when the `s` flag is disabled.
    /// Defaults to `\n`.
    line_terminator: u8,
    /// The flag settings that a freshly built translator starts out with.
    flags: Flags,
}
25
26impl Default for TranslatorBuilder {
27 fn default() -> TranslatorBuilder {
28 TranslatorBuilder::new()
29 }
30}
31
32impl TranslatorBuilder {
33 /// Create a new translator builder with a default c onfiguration.
34 pub fn new() -> TranslatorBuilder {
35 TranslatorBuilder {
36 utf8: true,
37 line_terminator: b'\n',
38 flags: Flags::default(),
39 }
40 }
41
42 /// Build a translator using the current configuration.
43 pub fn build(&self) -> Translator {
44 Translator {
45 stack: RefCell::new(vec![]),
46 flags: Cell::new(self.flags),
47 utf8: self.utf8,
48 line_terminator: self.line_terminator,
49 }
50 }
51
52 /// When disabled, translation will permit the construction of a regular
53 /// expression that may match invalid UTF-8.
54 ///
55 /// When enabled (the default), the translator is guaranteed to produce an
56 /// expression that, for non-empty matches, will only ever produce spans
57 /// that are entirely valid UTF-8 (otherwise, the translator will return an
58 /// error).
59 ///
60 /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
61 /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
62 /// syntax) will be allowed even though they can produce matches that split
63 /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
64 /// matches, and it is expected that the regex engine itself must handle
65 /// these cases if necessary (perhaps by suppressing any zero-width matches
66 /// that split a codepoint).
67 pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
68 self.utf8 = yes;
69 self
70 }
71
72 /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
73 ///
74 /// Namely, instead of `.` (by default) matching everything except for `\n`,
75 /// this will cause `.` to match everything except for the byte given.
76 ///
77 /// If `.` is used in a context where Unicode mode is enabled and this byte
78 /// isn't ASCII, then an error will be returned. When Unicode mode is
79 /// disabled, then any byte is permitted, but will return an error if UTF-8
80 /// mode is enabled and it is a non-ASCII byte.
81 ///
82 /// In short, any ASCII value for a line terminator is always okay. But a
83 /// non-ASCII byte might result in an error depending on whether Unicode
84 /// mode or UTF-8 mode are enabled.
85 ///
86 /// Note that if `R` mode is enabled then it always takes precedence and
87 /// the line terminator will be treated as `\r` and `\n` simultaneously.
88 ///
89 /// Note also that this *doesn't* impact the look-around assertions
90 /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
91 /// configuration in the regex engine itself.
92 pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder {
93 self.line_terminator = byte;
94 self
95 }
96
97 /// Enable or disable the case insensitive flag (`i`) by default.
98 pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
99 self.flags.case_insensitive = if yes { Some(true) } else { None };
100 self
101 }
102
103 /// Enable or disable the multi-line matching flag (`m`) by default.
104 pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
105 self.flags.multi_line = if yes { Some(true) } else { None };
106 self
107 }
108
109 /// Enable or disable the "dot matches any character" flag (`s`) by
110 /// default.
111 pub fn dot_matches_new_line(
112 &mut self,
113 yes: bool,
114 ) -> &mut TranslatorBuilder {
115 self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
116 self
117 }
118
119 /// Enable or disable the CRLF mode flag (`R`) by default.
120 pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
121 self.flags.crlf = if yes { Some(true) } else { None };
122 self
123 }
124
125 /// Enable or disable the "swap greed" flag (`U`) by default.
126 pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
127 self.flags.swap_greed = if yes { Some(true) } else { None };
128 self
129 }
130
131 /// Enable or disable the Unicode flag (`u`) by default.
132 pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
133 self.flags.unicode = if yes { None } else { Some(false) };
134 self
135 }
136}
137
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a single translator can
/// translate many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`].
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings. These change during a translation as flag
    /// directives and groups in the pattern are visited.
    flags: Cell<Flags>,
    /// When true, translation fails with an `InvalidUtf8` error instead of
    /// producing HIR that could match arbitrary (non-UTF-8) bytes. When
    /// false, such HIR is permitted.
    utf8: bool,
    /// The line terminator to use for `.`.
    line_terminator: u8,
}
157
158impl Translator {
159 /// Create a new translator using the default configuration.
160 pub fn new() -> Translator {
161 TranslatorBuilder::new().build()
162 }
163
164 /// Translate the given abstract syntax tree (AST) into a high level
165 /// intermediate representation (HIR).
166 ///
167 /// If there was a problem doing the translation, then an HIR-specific
168 /// error is returned.
169 ///
170 /// The original pattern string used to produce the `Ast` *must* also be
171 /// provided. The translator does not use the pattern string during any
172 /// correct translation, but is used for error reporting.
173 pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
174 ast::visit(ast, visitor:TranslatorI::new(self, pattern))
175 }
176}
177
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
///
/// Frames fall into two groups: frames that carry a value under construction
/// (`Expr`, `Literal`, `ClassUnicode`, `ClassBytes`) and marker frames that
/// only record where a composite expression started (`Repetition`, `Group`,
/// `Concat`, `Alternation`, `AlternationBranch`).
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A literal that is being constructed, character by character, from the
    /// AST. We need this because the AST gives each individual character its
    /// own node. So as we see characters, we peek at the top-most HirFrame.
    /// If it's a literal, then we add to it. Otherwise, we push a new literal.
    /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
    ///
    /// The bytes are the UTF-8 encoding of pushed chars, or raw bytes pushed
    /// one at a time.
    Literal(Vec<u8>),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `utf8` is enabled (the default), then a byte character is only
    /// permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed whenever a repetition is observed. After visiting every
    /// sub-expression in the repetition, the translator's stack is expected to
    /// have this sentinel directly beneath the repeated expression.
    ///
    /// This sentinel only exists to stop other things (like flattening
    /// literals) from reaching across repetition operators.
    Repetition,
    /// This is pushed on to the stack upon first seeing any kind of capture,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group. If the group doesn't set any flags, then this is simply
        /// equivalent to whatever flags were set when the group was opened.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Flags,
    },
    /// This is pushed whenever a concatenation is observed. After visiting
    /// every sub-expression in the concatenation, the translator's stack is
    /// popped until it sees a Concat frame.
    Concat,
    /// This is pushed whenever an alternation is observed. After visiting
    /// every sub-expression in the alternation, the translator's stack is
    /// popped until it sees an Alternation frame.
    Alternation,
    /// This is pushed immediately before each sub-expression in an
    /// alternation. This separates the branches of an alternation on the
    /// stack and prevents literal flattening from reaching across alternation
    /// branches.
    ///
    /// When doing a post visit on an alternation, one of these markers is
    /// popped after each branch expression, until the 'Alternation' frame is
    /// observed.
    AlternationBranch,
}
250
251impl HirFrame {
252 /// Assert that the current stack frame is an Hir expression and return it.
253 fn unwrap_expr(self) -> Hir {
254 match self {
255 HirFrame::Expr(expr) => expr,
256 HirFrame::Literal(lit) => Hir::literal(lit),
257 _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
258 }
259 }
260
261 /// Assert that the current stack frame is a Unicode class expression and
262 /// return it.
263 fn unwrap_class_unicode(self) -> hir::ClassUnicode {
264 match self {
265 HirFrame::ClassUnicode(cls) => cls,
266 _ => panic!(
267 "tried to unwrap Unicode class \
268 from HirFrame, got: {:?}",
269 self
270 ),
271 }
272 }
273
274 /// Assert that the current stack frame is a byte class expression and
275 /// return it.
276 fn unwrap_class_bytes(self) -> hir::ClassBytes {
277 match self {
278 HirFrame::ClassBytes(cls) => cls,
279 _ => panic!(
280 "tried to unwrap byte class \
281 from HirFrame, got: {:?}",
282 self
283 ),
284 }
285 }
286
287 /// Assert that the current stack frame is a repetition sentinel. If it
288 /// isn't, then panic.
289 fn unwrap_repetition(self) {
290 match self {
291 HirFrame::Repetition => {}
292 _ => {
293 panic!(
294 "tried to unwrap repetition from HirFrame, got: {:?}",
295 self
296 )
297 }
298 }
299 }
300
301 /// Assert that the current stack frame is a group indicator and return
302 /// its corresponding flags (the flags that were active at the time the
303 /// group was entered).
304 fn unwrap_group(self) -> Flags {
305 match self {
306 HirFrame::Group { old_flags } => old_flags,
307 _ => {
308 panic!("tried to unwrap group from HirFrame, got: {:?}", self)
309 }
310 }
311 }
312
313 /// Assert that the current stack frame is an alternation pipe sentinel. If
314 /// it isn't, then panic.
315 fn unwrap_alternation_pipe(self) {
316 match self {
317 HirFrame::AlternationBranch => {}
318 _ => {
319 panic!(
320 "tried to unwrap alt pipe from HirFrame, got: {:?}",
321 self
322 )
323 }
324 }
325 }
326}
327
328impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
329 type Output = Hir;
330 type Err = Error;
331
332 fn finish(self) -> Result<Hir> {
333 // ... otherwise, we should have exactly one HIR on the stack.
334 assert_eq!(self.trans().stack.borrow().len(), 1);
335 Ok(self.pop().unwrap().unwrap_expr())
336 }
337
338 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
339 match *ast {
340 Ast::ClassBracketed(_) => {
341 if self.flags().unicode() {
342 let cls = hir::ClassUnicode::empty();
343 self.push(HirFrame::ClassUnicode(cls));
344 } else {
345 let cls = hir::ClassBytes::empty();
346 self.push(HirFrame::ClassBytes(cls));
347 }
348 }
349 Ast::Repetition(_) => self.push(HirFrame::Repetition),
350 Ast::Group(ref x) => {
351 let old_flags = x
352 .flags()
353 .map(|ast| self.set_flags(ast))
354 .unwrap_or_else(|| self.flags());
355 self.push(HirFrame::Group { old_flags });
356 }
357 Ast::Concat(_) => {
358 self.push(HirFrame::Concat);
359 }
360 Ast::Alternation(ref x) => {
361 self.push(HirFrame::Alternation);
362 if !x.asts.is_empty() {
363 self.push(HirFrame::AlternationBranch);
364 }
365 }
366 _ => {}
367 }
368 Ok(())
369 }
370
371 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
372 match *ast {
373 Ast::Empty(_) => {
374 self.push(HirFrame::Expr(Hir::empty()));
375 }
376 Ast::Flags(ref x) => {
377 self.set_flags(&x.flags);
378 // Flags in the AST are generally considered directives and
379 // not actual sub-expressions. However, they can be used in
380 // the concrete syntax like `((?i))`, and we need some kind of
381 // indication of an expression there, and Empty is the correct
382 // choice.
383 //
384 // There can also be things like `(?i)+`, but we rule those out
385 // in the parser. In the future, we might allow them for
386 // consistency sake.
387 self.push(HirFrame::Expr(Hir::empty()));
388 }
389 Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? {
390 Either::Right(byte) => self.push_byte(byte),
391 Either::Left(ch) => match self.case_fold_char(x.span, ch)? {
392 None => self.push_char(ch),
393 Some(expr) => self.push(HirFrame::Expr(expr)),
394 },
395 },
396 Ast::Dot(ref span) => {
397 self.push(HirFrame::Expr(self.hir_dot(**span)?));
398 }
399 Ast::Assertion(ref x) => {
400 self.push(HirFrame::Expr(self.hir_assertion(x)?));
401 }
402 Ast::ClassPerl(ref x) => {
403 if self.flags().unicode() {
404 let cls = self.hir_perl_unicode_class(x)?;
405 let hcls = hir::Class::Unicode(cls);
406 self.push(HirFrame::Expr(Hir::class(hcls)));
407 } else {
408 let cls = self.hir_perl_byte_class(x)?;
409 let hcls = hir::Class::Bytes(cls);
410 self.push(HirFrame::Expr(Hir::class(hcls)));
411 }
412 }
413 Ast::ClassUnicode(ref x) => {
414 let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
415 self.push(HirFrame::Expr(Hir::class(cls)));
416 }
417 Ast::ClassBracketed(ref ast) => {
418 if self.flags().unicode() {
419 let mut cls = self.pop().unwrap().unwrap_class_unicode();
420 self.unicode_fold_and_negate(
421 &ast.span,
422 ast.negated,
423 &mut cls,
424 )?;
425 let expr = Hir::class(hir::Class::Unicode(cls));
426 self.push(HirFrame::Expr(expr));
427 } else {
428 let mut cls = self.pop().unwrap().unwrap_class_bytes();
429 self.bytes_fold_and_negate(
430 &ast.span,
431 ast.negated,
432 &mut cls,
433 )?;
434 let expr = Hir::class(hir::Class::Bytes(cls));
435 self.push(HirFrame::Expr(expr));
436 }
437 }
438 Ast::Repetition(ref x) => {
439 let expr = self.pop().unwrap().unwrap_expr();
440 self.pop().unwrap().unwrap_repetition();
441 self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
442 }
443 Ast::Group(ref x) => {
444 let expr = self.pop().unwrap().unwrap_expr();
445 let old_flags = self.pop().unwrap().unwrap_group();
446 self.trans().flags.set(old_flags);
447 self.push(HirFrame::Expr(self.hir_capture(x, expr)));
448 }
449 Ast::Concat(_) => {
450 let mut exprs = vec![];
451 while let Some(expr) = self.pop_concat_expr() {
452 if !matches!(*expr.kind(), HirKind::Empty) {
453 exprs.push(expr);
454 }
455 }
456 exprs.reverse();
457 self.push(HirFrame::Expr(Hir::concat(exprs)));
458 }
459 Ast::Alternation(_) => {
460 let mut exprs = vec![];
461 while let Some(expr) = self.pop_alt_expr() {
462 self.pop().unwrap().unwrap_alternation_pipe();
463 exprs.push(expr);
464 }
465 exprs.reverse();
466 self.push(HirFrame::Expr(Hir::alternation(exprs)));
467 }
468 }
469 Ok(())
470 }
471
472 fn visit_alternation_in(&mut self) -> Result<()> {
473 self.push(HirFrame::AlternationBranch);
474 Ok(())
475 }
476
477 fn visit_class_set_item_pre(
478 &mut self,
479 ast: &ast::ClassSetItem,
480 ) -> Result<()> {
481 match *ast {
482 ast::ClassSetItem::Bracketed(_) => {
483 if self.flags().unicode() {
484 let cls = hir::ClassUnicode::empty();
485 self.push(HirFrame::ClassUnicode(cls));
486 } else {
487 let cls = hir::ClassBytes::empty();
488 self.push(HirFrame::ClassBytes(cls));
489 }
490 }
491 // We needn't handle the Union case here since the visitor will
492 // do it for us.
493 _ => {}
494 }
495 Ok(())
496 }
497
498 fn visit_class_set_item_post(
499 &mut self,
500 ast: &ast::ClassSetItem,
501 ) -> Result<()> {
502 match *ast {
503 ast::ClassSetItem::Empty(_) => {}
504 ast::ClassSetItem::Literal(ref x) => {
505 if self.flags().unicode() {
506 let mut cls = self.pop().unwrap().unwrap_class_unicode();
507 cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
508 self.push(HirFrame::ClassUnicode(cls));
509 } else {
510 let mut cls = self.pop().unwrap().unwrap_class_bytes();
511 let byte = self.class_literal_byte(x)?;
512 cls.push(hir::ClassBytesRange::new(byte, byte));
513 self.push(HirFrame::ClassBytes(cls));
514 }
515 }
516 ast::ClassSetItem::Range(ref x) => {
517 if self.flags().unicode() {
518 let mut cls = self.pop().unwrap().unwrap_class_unicode();
519 cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
520 self.push(HirFrame::ClassUnicode(cls));
521 } else {
522 let mut cls = self.pop().unwrap().unwrap_class_bytes();
523 let start = self.class_literal_byte(&x.start)?;
524 let end = self.class_literal_byte(&x.end)?;
525 cls.push(hir::ClassBytesRange::new(start, end));
526 self.push(HirFrame::ClassBytes(cls));
527 }
528 }
529 ast::ClassSetItem::Ascii(ref x) => {
530 if self.flags().unicode() {
531 let xcls = self.hir_ascii_unicode_class(x)?;
532 let mut cls = self.pop().unwrap().unwrap_class_unicode();
533 cls.union(&xcls);
534 self.push(HirFrame::ClassUnicode(cls));
535 } else {
536 let xcls = self.hir_ascii_byte_class(x)?;
537 let mut cls = self.pop().unwrap().unwrap_class_bytes();
538 cls.union(&xcls);
539 self.push(HirFrame::ClassBytes(cls));
540 }
541 }
542 ast::ClassSetItem::Unicode(ref x) => {
543 let xcls = self.hir_unicode_class(x)?;
544 let mut cls = self.pop().unwrap().unwrap_class_unicode();
545 cls.union(&xcls);
546 self.push(HirFrame::ClassUnicode(cls));
547 }
548 ast::ClassSetItem::Perl(ref x) => {
549 if self.flags().unicode() {
550 let xcls = self.hir_perl_unicode_class(x)?;
551 let mut cls = self.pop().unwrap().unwrap_class_unicode();
552 cls.union(&xcls);
553 self.push(HirFrame::ClassUnicode(cls));
554 } else {
555 let xcls = self.hir_perl_byte_class(x)?;
556 let mut cls = self.pop().unwrap().unwrap_class_bytes();
557 cls.union(&xcls);
558 self.push(HirFrame::ClassBytes(cls));
559 }
560 }
561 ast::ClassSetItem::Bracketed(ref ast) => {
562 if self.flags().unicode() {
563 let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
564 self.unicode_fold_and_negate(
565 &ast.span,
566 ast.negated,
567 &mut cls1,
568 )?;
569
570 let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
571 cls2.union(&cls1);
572 self.push(HirFrame::ClassUnicode(cls2));
573 } else {
574 let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
575 self.bytes_fold_and_negate(
576 &ast.span,
577 ast.negated,
578 &mut cls1,
579 )?;
580
581 let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
582 cls2.union(&cls1);
583 self.push(HirFrame::ClassBytes(cls2));
584 }
585 }
586 // This is handled automatically by the visitor.
587 ast::ClassSetItem::Union(_) => {}
588 }
589 Ok(())
590 }
591
592 fn visit_class_set_binary_op_pre(
593 &mut self,
594 _op: &ast::ClassSetBinaryOp,
595 ) -> Result<()> {
596 if self.flags().unicode() {
597 let cls = hir::ClassUnicode::empty();
598 self.push(HirFrame::ClassUnicode(cls));
599 } else {
600 let cls = hir::ClassBytes::empty();
601 self.push(HirFrame::ClassBytes(cls));
602 }
603 Ok(())
604 }
605
606 fn visit_class_set_binary_op_in(
607 &mut self,
608 _op: &ast::ClassSetBinaryOp,
609 ) -> Result<()> {
610 if self.flags().unicode() {
611 let cls = hir::ClassUnicode::empty();
612 self.push(HirFrame::ClassUnicode(cls));
613 } else {
614 let cls = hir::ClassBytes::empty();
615 self.push(HirFrame::ClassBytes(cls));
616 }
617 Ok(())
618 }
619
620 fn visit_class_set_binary_op_post(
621 &mut self,
622 op: &ast::ClassSetBinaryOp,
623 ) -> Result<()> {
624 use crate::ast::ClassSetBinaryOpKind::*;
625
626 if self.flags().unicode() {
627 let mut rhs = self.pop().unwrap().unwrap_class_unicode();
628 let mut lhs = self.pop().unwrap().unwrap_class_unicode();
629 let mut cls = self.pop().unwrap().unwrap_class_unicode();
630 if self.flags().case_insensitive() {
631 rhs.try_case_fold_simple().map_err(|_| {
632 self.error(
633 op.rhs.span().clone(),
634 ErrorKind::UnicodeCaseUnavailable,
635 )
636 })?;
637 lhs.try_case_fold_simple().map_err(|_| {
638 self.error(
639 op.lhs.span().clone(),
640 ErrorKind::UnicodeCaseUnavailable,
641 )
642 })?;
643 }
644 match op.kind {
645 Intersection => lhs.intersect(&rhs),
646 Difference => lhs.difference(&rhs),
647 SymmetricDifference => lhs.symmetric_difference(&rhs),
648 }
649 cls.union(&lhs);
650 self.push(HirFrame::ClassUnicode(cls));
651 } else {
652 let mut rhs = self.pop().unwrap().unwrap_class_bytes();
653 let mut lhs = self.pop().unwrap().unwrap_class_bytes();
654 let mut cls = self.pop().unwrap().unwrap_class_bytes();
655 if self.flags().case_insensitive() {
656 rhs.case_fold_simple();
657 lhs.case_fold_simple();
658 }
659 match op.kind {
660 Intersection => lhs.intersect(&rhs),
661 Difference => lhs.difference(&rhs),
662 SymmetricDifference => lhs.symmetric_difference(&rhs),
663 }
664 cls.union(&lhs);
665 self.push(HirFrame::ClassBytes(cls));
666 }
667 Ok(())
668 }
669}
670
/// The internal implementation of a translator.
///
/// This type is responsible for carrying around the original pattern string,
/// which is not tied to the internal state of a translator.
///
/// A TranslatorI exists for the time it takes to translate a single Ast.
#[derive(Clone, Debug)]
struct TranslatorI<'t, 'p> {
    /// The translator whose stack and flags are read and updated during the
    /// traversal.
    trans: &'t Translator,
    /// The pattern string the Ast was produced from. It is copied into every
    /// error that is created.
    pattern: &'p str,
}
682
683impl<'t, 'p> TranslatorI<'t, 'p> {
684 /// Build a new internal translator.
685 fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
686 TranslatorI { trans, pattern }
687 }
688
689 /// Return a reference to the underlying translator.
690 fn trans(&self) -> &Translator {
691 &self.trans
692 }
693
694 /// Push the given frame on to the call stack.
695 fn push(&self, frame: HirFrame) {
696 self.trans().stack.borrow_mut().push(frame);
697 }
698
699 /// Push the given literal char on to the call stack.
700 ///
701 /// If the top-most element of the stack is a literal, then the char
702 /// is appended to the end of that literal. Otherwise, a new literal
703 /// containing just the given char is pushed to the top of the stack.
704 fn push_char(&self, ch: char) {
705 let mut buf = [0; 4];
706 let bytes = ch.encode_utf8(&mut buf).as_bytes();
707 let mut stack = self.trans().stack.borrow_mut();
708 if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
709 literal.extend_from_slice(bytes);
710 } else {
711 stack.push(HirFrame::Literal(bytes.to_vec()));
712 }
713 }
714
715 /// Push the given literal byte on to the call stack.
716 ///
717 /// If the top-most element of the stack is a literal, then the byte
718 /// is appended to the end of that literal. Otherwise, a new literal
719 /// containing just the given byte is pushed to the top of the stack.
720 fn push_byte(&self, byte: u8) {
721 let mut stack = self.trans().stack.borrow_mut();
722 if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
723 literal.push(byte);
724 } else {
725 stack.push(HirFrame::Literal(vec![byte]));
726 }
727 }
728
729 /// Pop the top of the call stack. If the call stack is empty, return None.
730 fn pop(&self) -> Option<HirFrame> {
731 self.trans().stack.borrow_mut().pop()
732 }
733
734 /// Pop an HIR expression from the top of the stack for a concatenation.
735 ///
736 /// This returns None if the stack is empty or when a concat frame is seen.
737 /// Otherwise, it panics if it could not find an HIR expression.
738 fn pop_concat_expr(&self) -> Option<Hir> {
739 let frame = self.pop()?;
740 match frame {
741 HirFrame::Concat => None,
742 HirFrame::Expr(expr) => Some(expr),
743 HirFrame::Literal(lit) => Some(Hir::literal(lit)),
744 HirFrame::ClassUnicode(_) => {
745 unreachable!("expected expr or concat, got Unicode class")
746 }
747 HirFrame::ClassBytes(_) => {
748 unreachable!("expected expr or concat, got byte class")
749 }
750 HirFrame::Repetition => {
751 unreachable!("expected expr or concat, got repetition")
752 }
753 HirFrame::Group { .. } => {
754 unreachable!("expected expr or concat, got group")
755 }
756 HirFrame::Alternation => {
757 unreachable!("expected expr or concat, got alt marker")
758 }
759 HirFrame::AlternationBranch => {
760 unreachable!("expected expr or concat, got alt branch marker")
761 }
762 }
763 }
764
765 /// Pop an HIR expression from the top of the stack for an alternation.
766 ///
767 /// This returns None if the stack is empty or when an alternation frame is
768 /// seen. Otherwise, it panics if it could not find an HIR expression.
769 fn pop_alt_expr(&self) -> Option<Hir> {
770 let frame = self.pop()?;
771 match frame {
772 HirFrame::Alternation => None,
773 HirFrame::Expr(expr) => Some(expr),
774 HirFrame::Literal(lit) => Some(Hir::literal(lit)),
775 HirFrame::ClassUnicode(_) => {
776 unreachable!("expected expr or alt, got Unicode class")
777 }
778 HirFrame::ClassBytes(_) => {
779 unreachable!("expected expr or alt, got byte class")
780 }
781 HirFrame::Repetition => {
782 unreachable!("expected expr or alt, got repetition")
783 }
784 HirFrame::Group { .. } => {
785 unreachable!("expected expr or alt, got group")
786 }
787 HirFrame::Concat => {
788 unreachable!("expected expr or alt, got concat marker")
789 }
790 HirFrame::AlternationBranch => {
791 unreachable!("expected expr or alt, got alt branch marker")
792 }
793 }
794 }
795
796 /// Create a new error with the given span and error type.
797 fn error(&self, span: Span, kind: ErrorKind) -> Error {
798 Error { kind, pattern: self.pattern.to_string(), span }
799 }
800
801 /// Return a copy of the active flags.
802 fn flags(&self) -> Flags {
803 self.trans().flags.get()
804 }
805
806 /// Set the flags of this translator from the flags set in the given AST.
807 /// Then, return the old flags.
808 fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
809 let old_flags = self.flags();
810 let mut new_flags = Flags::from_ast(ast_flags);
811 new_flags.merge(&old_flags);
812 self.trans().flags.set(new_flags);
813 old_flags
814 }
815
816 /// Convert an Ast literal to its scalar representation.
817 ///
818 /// When Unicode mode is enabled, then this always succeeds and returns a
819 /// `char` (Unicode scalar value).
820 ///
821 /// When Unicode mode is disabled, then a `char` will still be returned
822 /// whenever possible. A byte is returned only when invalid UTF-8 is
823 /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
824 /// will result in an error when invalid UTF-8 is not allowed.
825 fn ast_literal_to_scalar(
826 &self,
827 lit: &ast::Literal,
828 ) -> Result<Either<char, u8>> {
829 if self.flags().unicode() {
830 return Ok(Either::Left(lit.c));
831 }
832 let byte = match lit.byte() {
833 None => return Ok(Either::Left(lit.c)),
834 Some(byte) => byte,
835 };
836 if byte <= 0x7F {
837 return Ok(Either::Left(char::try_from(byte).unwrap()));
838 }
839 if self.trans().utf8 {
840 return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
841 }
842 Ok(Either::Right(byte))
843 }
844
845 fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
846 if !self.flags().case_insensitive() {
847 return Ok(None);
848 }
849 if self.flags().unicode() {
850 // If case folding won't do anything, then don't bother trying.
851 let map = unicode::SimpleCaseFolder::new()
852 .map(|f| f.overlaps(c, c))
853 .map_err(|_| {
854 self.error(span, ErrorKind::UnicodeCaseUnavailable)
855 })?;
856 if !map {
857 return Ok(None);
858 }
859 let mut cls =
860 hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
861 c, c,
862 )]);
863 cls.try_case_fold_simple().map_err(|_| {
864 self.error(span, ErrorKind::UnicodeCaseUnavailable)
865 })?;
866 Ok(Some(Hir::class(hir::Class::Unicode(cls))))
867 } else {
868 if !c.is_ascii() {
869 return Ok(None);
870 }
871 // If case folding won't do anything, then don't bother trying.
872 match c {
873 'A'..='Z' | 'a'..='z' => {}
874 _ => return Ok(None),
875 }
876 let mut cls =
877 hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
878 // OK because 'c.len_utf8() == 1' which in turn implies
879 // that 'c' is ASCII.
880 u8::try_from(c).unwrap(),
881 u8::try_from(c).unwrap(),
882 )]);
883 cls.case_fold_simple();
884 Ok(Some(Hir::class(hir::Class::Bytes(cls))))
885 }
886 }
887
888 fn hir_dot(&self, span: Span) -> Result<Hir> {
889 let (utf8, lineterm, flags) =
890 (self.trans().utf8, self.trans().line_terminator, self.flags());
891 if utf8 && (!flags.unicode() || !lineterm.is_ascii()) {
892 return Err(self.error(span, ErrorKind::InvalidUtf8));
893 }
894 let dot = if flags.dot_matches_new_line() {
895 if flags.unicode() {
896 hir::Dot::AnyChar
897 } else {
898 hir::Dot::AnyByte
899 }
900 } else {
901 if flags.unicode() {
902 if flags.crlf() {
903 hir::Dot::AnyCharExceptCRLF
904 } else {
905 if !lineterm.is_ascii() {
906 return Err(
907 self.error(span, ErrorKind::InvalidLineTerminator)
908 );
909 }
910 hir::Dot::AnyCharExcept(char::from(lineterm))
911 }
912 } else {
913 if flags.crlf() {
914 hir::Dot::AnyByteExceptCRLF
915 } else {
916 hir::Dot::AnyByteExcept(lineterm)
917 }
918 }
919 };
920 Ok(Hir::dot(dot))
921 }
922
923 fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
924 let unicode = self.flags().unicode();
925 let multi_line = self.flags().multi_line();
926 let crlf = self.flags().crlf();
927 Ok(match asst.kind {
928 ast::AssertionKind::StartLine => Hir::look(if multi_line {
929 if crlf {
930 hir::Look::StartCRLF
931 } else {
932 hir::Look::StartLF
933 }
934 } else {
935 hir::Look::Start
936 }),
937 ast::AssertionKind::EndLine => Hir::look(if multi_line {
938 if crlf {
939 hir::Look::EndCRLF
940 } else {
941 hir::Look::EndLF
942 }
943 } else {
944 hir::Look::End
945 }),
946 ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
947 ast::AssertionKind::EndText => Hir::look(hir::Look::End),
948 ast::AssertionKind::WordBoundary => Hir::look(if unicode {
949 hir::Look::WordUnicode
950 } else {
951 hir::Look::WordAscii
952 }),
953 ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
954 hir::Look::WordUnicodeNegate
955 } else {
956 hir::Look::WordAsciiNegate
957 }),
958 ast::AssertionKind::WordBoundaryStart
959 | ast::AssertionKind::WordBoundaryStartAngle => {
960 Hir::look(if unicode {
961 hir::Look::WordStartUnicode
962 } else {
963 hir::Look::WordStartAscii
964 })
965 }
966 ast::AssertionKind::WordBoundaryEnd
967 | ast::AssertionKind::WordBoundaryEndAngle => {
968 Hir::look(if unicode {
969 hir::Look::WordEndUnicode
970 } else {
971 hir::Look::WordEndAscii
972 })
973 }
974 ast::AssertionKind::WordBoundaryStartHalf => {
975 Hir::look(if unicode {
976 hir::Look::WordStartHalfUnicode
977 } else {
978 hir::Look::WordStartHalfAscii
979 })
980 }
981 ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode {
982 hir::Look::WordEndHalfUnicode
983 } else {
984 hir::Look::WordEndHalfAscii
985 }),
986 })
987 }
988
989 fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
990 let (index, name) = match group.kind {
991 ast::GroupKind::CaptureIndex(index) => (index, None),
992 ast::GroupKind::CaptureName { ref name, .. } => {
993 (name.index, Some(name.name.clone().into_boxed_str()))
994 }
995 // The HIR doesn't need to use non-capturing groups, since the way
996 // in which the data type is defined handles this automatically.
997 ast::GroupKind::NonCapturing(_) => return expr,
998 };
999 Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
1000 }
1001
1002 fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
1003 let (min, max) = match rep.op.kind {
1004 ast::RepetitionKind::ZeroOrOne => (0, Some(1)),
1005 ast::RepetitionKind::ZeroOrMore => (0, None),
1006 ast::RepetitionKind::OneOrMore => (1, None),
1007 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
1008 (m, Some(m))
1009 }
1010 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
1011 (m, None)
1012 }
1013 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
1014 m,
1015 n,
1016 )) => (m, Some(n)),
1017 };
1018 let greedy =
1019 if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
1020 Hir::repetition(hir::Repetition {
1021 min,
1022 max,
1023 greedy,
1024 sub: Box::new(expr),
1025 })
1026 }
1027
1028 fn hir_unicode_class(
1029 &self,
1030 ast_class: &ast::ClassUnicode,
1031 ) -> Result<hir::ClassUnicode> {
1032 use crate::ast::ClassUnicodeKind::*;
1033
1034 if !self.flags().unicode() {
1035 return Err(
1036 self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
1037 );
1038 }
1039 let query = match ast_class.kind {
1040 OneLetter(name) => ClassQuery::OneLetter(name),
1041 Named(ref name) => ClassQuery::Binary(name),
1042 NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
1043 property_name: name,
1044 property_value: value,
1045 },
1046 };
1047 let mut result = self.convert_unicode_class_error(
1048 &ast_class.span,
1049 unicode::class(query),
1050 );
1051 if let Ok(ref mut class) = result {
1052 self.unicode_fold_and_negate(
1053 &ast_class.span,
1054 ast_class.negated,
1055 class,
1056 )?;
1057 }
1058 result
1059 }
1060
1061 fn hir_ascii_unicode_class(
1062 &self,
1063 ast: &ast::ClassAscii,
1064 ) -> Result<hir::ClassUnicode> {
1065 let mut cls = hir::ClassUnicode::new(
1066 ascii_class_as_chars(&ast.kind)
1067 .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1068 );
1069 self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1070 Ok(cls)
1071 }
1072
1073 fn hir_ascii_byte_class(
1074 &self,
1075 ast: &ast::ClassAscii,
1076 ) -> Result<hir::ClassBytes> {
1077 let mut cls = hir::ClassBytes::new(
1078 ascii_class(&ast.kind)
1079 .map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1080 );
1081 self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1082 Ok(cls)
1083 }
1084
1085 fn hir_perl_unicode_class(
1086 &self,
1087 ast_class: &ast::ClassPerl,
1088 ) -> Result<hir::ClassUnicode> {
1089 use crate::ast::ClassPerlKind::*;
1090
1091 assert!(self.flags().unicode());
1092 let result = match ast_class.kind {
1093 Digit => unicode::perl_digit(),
1094 Space => unicode::perl_space(),
1095 Word => unicode::perl_word(),
1096 };
1097 let mut class =
1098 self.convert_unicode_class_error(&ast_class.span, result)?;
1099 // We needn't apply case folding here because the Perl Unicode classes
1100 // are already closed under Unicode simple case folding.
1101 if ast_class.negated {
1102 class.negate();
1103 }
1104 Ok(class)
1105 }
1106
1107 fn hir_perl_byte_class(
1108 &self,
1109 ast_class: &ast::ClassPerl,
1110 ) -> Result<hir::ClassBytes> {
1111 use crate::ast::ClassPerlKind::*;
1112
1113 assert!(!self.flags().unicode());
1114 let mut class = match ast_class.kind {
1115 Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
1116 Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
1117 Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
1118 };
1119 // We needn't apply case folding here because the Perl ASCII classes
1120 // are already closed (under ASCII case folding).
1121 if ast_class.negated {
1122 class.negate();
1123 }
1124 // Negating a Perl byte class is likely to cause it to match invalid
1125 // UTF-8. That's only OK if the translator is configured to allow such
1126 // things.
1127 if self.trans().utf8 && !class.is_ascii() {
1128 return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
1129 }
1130 Ok(class)
1131 }
1132
1133 /// Converts the given Unicode specific error to an HIR translation error.
1134 ///
1135 /// The span given should approximate the position at which an error would
1136 /// occur.
1137 fn convert_unicode_class_error(
1138 &self,
1139 span: &Span,
1140 result: core::result::Result<hir::ClassUnicode, unicode::Error>,
1141 ) -> Result<hir::ClassUnicode> {
1142 result.map_err(|err| {
1143 let sp = span.clone();
1144 match err {
1145 unicode::Error::PropertyNotFound => {
1146 self.error(sp, ErrorKind::UnicodePropertyNotFound)
1147 }
1148 unicode::Error::PropertyValueNotFound => {
1149 self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
1150 }
1151 unicode::Error::PerlClassNotFound => {
1152 self.error(sp, ErrorKind::UnicodePerlClassNotFound)
1153 }
1154 }
1155 })
1156 }
1157
1158 fn unicode_fold_and_negate(
1159 &self,
1160 span: &Span,
1161 negated: bool,
1162 class: &mut hir::ClassUnicode,
1163 ) -> Result<()> {
1164 // Note that we must apply case folding before negation!
1165 // Consider `(?i)[^x]`. If we applied negation first, then
1166 // the result would be the character class that matched any
1167 // Unicode scalar value.
1168 if self.flags().case_insensitive() {
1169 class.try_case_fold_simple().map_err(|_| {
1170 self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
1171 })?;
1172 }
1173 if negated {
1174 class.negate();
1175 }
1176 Ok(())
1177 }
1178
1179 fn bytes_fold_and_negate(
1180 &self,
1181 span: &Span,
1182 negated: bool,
1183 class: &mut hir::ClassBytes,
1184 ) -> Result<()> {
1185 // Note that we must apply case folding before negation!
1186 // Consider `(?i)[^x]`. If we applied negation first, then
1187 // the result would be the character class that matched any
1188 // Unicode scalar value.
1189 if self.flags().case_insensitive() {
1190 class.case_fold_simple();
1191 }
1192 if negated {
1193 class.negate();
1194 }
1195 if self.trans().utf8 && !class.is_ascii() {
1196 return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
1197 }
1198 Ok(())
1199 }
1200
1201 /// Return a scalar byte value suitable for use as a literal in a byte
1202 /// character class.
1203 fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
1204 match self.ast_literal_to_scalar(ast)? {
1205 Either::Right(byte) => Ok(byte),
1206 Either::Left(ch) => {
1207 if ch.is_ascii() {
1208 Ok(u8::try_from(ch).unwrap())
1209 } else {
1210 // We can't feasibly support Unicode in
1211 // byte oriented classes. Byte classes don't
1212 // do Unicode case folding.
1213 Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
1214 }
1215 }
1216 }
1217 }
1218}
1219
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent, present but disabled or
/// present but enabled.
///
/// These states are encoded as `None`, `Some(false)` and `Some(true)`
/// respectively. The accessor methods on this type collapse an absent flag
/// into its default value.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    /// State of the case insensitive flag. Treated as disabled when absent.
    case_insensitive: Option<bool>,
    /// State of the multi-line flag. Treated as disabled when absent.
    multi_line: Option<bool>,
    /// State of the "dot matches new line" flag. Treated as disabled when
    /// absent.
    dot_matches_new_line: Option<bool>,
    /// State of the swap-greed flag. Treated as disabled when absent.
    swap_greed: Option<bool>,
    /// State of the Unicode flag. Unlike the other flags, this one is
    /// treated as enabled when absent.
    unicode: Option<bool>,
    /// State of the CRLF flag. Treated as disabled when absent.
    crlf: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1236
1237impl Flags {
1238 fn from_ast(ast: &ast::Flags) -> Flags {
1239 let mut flags = Flags::default();
1240 let mut enable = true;
1241 for item in &ast.items {
1242 match item.kind {
1243 ast::FlagsItemKind::Negation => {
1244 enable = false;
1245 }
1246 ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
1247 flags.case_insensitive = Some(enable);
1248 }
1249 ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
1250 flags.multi_line = Some(enable);
1251 }
1252 ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
1253 flags.dot_matches_new_line = Some(enable);
1254 }
1255 ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
1256 flags.swap_greed = Some(enable);
1257 }
1258 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
1259 flags.unicode = Some(enable);
1260 }
1261 ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
1262 flags.crlf = Some(enable);
1263 }
1264 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
1265 }
1266 }
1267 flags
1268 }
1269
1270 fn merge(&mut self, previous: &Flags) {
1271 if self.case_insensitive.is_none() {
1272 self.case_insensitive = previous.case_insensitive;
1273 }
1274 if self.multi_line.is_none() {
1275 self.multi_line = previous.multi_line;
1276 }
1277 if self.dot_matches_new_line.is_none() {
1278 self.dot_matches_new_line = previous.dot_matches_new_line;
1279 }
1280 if self.swap_greed.is_none() {
1281 self.swap_greed = previous.swap_greed;
1282 }
1283 if self.unicode.is_none() {
1284 self.unicode = previous.unicode;
1285 }
1286 if self.crlf.is_none() {
1287 self.crlf = previous.crlf;
1288 }
1289 }
1290
1291 fn case_insensitive(&self) -> bool {
1292 self.case_insensitive.unwrap_or(false)
1293 }
1294
1295 fn multi_line(&self) -> bool {
1296 self.multi_line.unwrap_or(false)
1297 }
1298
1299 fn dot_matches_new_line(&self) -> bool {
1300 self.dot_matches_new_line.unwrap_or(false)
1301 }
1302
1303 fn swap_greed(&self) -> bool {
1304 self.swap_greed.unwrap_or(false)
1305 }
1306
1307 fn unicode(&self) -> bool {
1308 self.unicode.unwrap_or(true)
1309 }
1310
1311 fn crlf(&self) -> bool {
1312 self.crlf.unwrap_or(false)
1313 }
1314}
1315
1316fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1317 let ranges: Vec<_> = ascii_classimpl Iterator(kind)
1318 .map(|(s: u8, e: u8)| hir::ClassBytesRange::new(start:s, end:e))
1319 .collect();
1320 hir::ClassBytes::new(ranges)
1321}
1322
1323fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
1324 use crate::ast::ClassAsciiKind::*;
1325
1326 let slice: &'static [(u8, u8)] = match *kind {
1327 Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1328 Alpha => &[(b'A', b'Z'), (b'a', b'z')],
1329 Ascii => &[(b'\x00', b'\x7F')],
1330 Blank => &[(b'\t', b'\t'), (b' ', b' ')],
1331 Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1332 Digit => &[(b'0', b'9')],
1333 Graph => &[(b'!', b'~')],
1334 Lower => &[(b'a', b'z')],
1335 Print => &[(b' ', b'~')],
1336 Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1337 Space => &[
1338 (b'\t', b'\t'),
1339 (b'\n', b'\n'),
1340 (b'\x0B', b'\x0B'),
1341 (b'\x0C', b'\x0C'),
1342 (b'\r', b'\r'),
1343 (b' ', b' '),
1344 ],
1345 Upper => &[(b'A', b'Z')],
1346 Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1347 Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1348 };
1349 slice.iter().copied()
1350}
1351
1352fn ascii_class_as_chars(
1353 kind: &ast::ClassAsciiKind,
1354) -> impl Iterator<Item = (char, char)> {
1355 ascii_class(kind).map(|(s: u8, e: u8)| (char::from(s), char::from(e)))
1356}
1357
1358#[cfg(test)]
1359mod tests {
1360 use crate::{
1361 ast::{self, parse::ParserBuilder, Ast, Position, Span},
1362 hir::{self, Hir, HirKind, Look, Properties},
1363 unicode::{self, ClassQuery},
1364 };
1365
1366 use super::*;
1367
// We create these errors to compare with real hir::Errors in the tests.
// We define equality between TestError and hir::Error to disregard the
// pattern string in hir::Error, which is annoying to provide in tests.
#[derive(Clone, Debug)]
struct TestError {
    // The span the error is expected to be reported at.
    span: Span,
    // The kind of error that is expected.
    kind: hir::ErrorKind,
}
1376
1377 impl PartialEq<hir::Error> for TestError {
1378 fn eq(&self, other: &hir::Error) -> bool {
1379 self.span == other.span && self.kind == other.kind
1380 }
1381 }
1382
1383 impl PartialEq<TestError> for hir::Error {
1384 fn eq(&self, other: &TestError) -> bool {
1385 self.span == other.span && self.kind == other.kind
1386 }
1387 }
1388
1389 fn parse(pattern: &str) -> Ast {
1390 ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
1391 }
1392
1393 fn t(pattern: &str) -> Hir {
1394 TranslatorBuilder::new()
1395 .utf8(true)
1396 .build()
1397 .translate(pattern, &parse(pattern))
1398 .unwrap()
1399 }
1400
1401 fn t_err(pattern: &str) -> hir::Error {
1402 TranslatorBuilder::new()
1403 .utf8(true)
1404 .build()
1405 .translate(pattern, &parse(pattern))
1406 .unwrap_err()
1407 }
1408
1409 fn t_bytes(pattern: &str) -> Hir {
1410 TranslatorBuilder::new()
1411 .utf8(false)
1412 .build()
1413 .translate(pattern, &parse(pattern))
1414 .unwrap()
1415 }
1416
1417 fn props(pattern: &str) -> Properties {
1418 t(pattern).properties().clone()
1419 }
1420
1421 fn props_bytes(pattern: &str) -> Properties {
1422 t_bytes(pattern).properties().clone()
1423 }
1424
1425 fn hir_lit(s: &str) -> Hir {
1426 hir_blit(s.as_bytes())
1427 }
1428
// Builds a literal `Hir` from the given bytes.
fn hir_blit(s: &[u8]) -> Hir {
    Hir::literal(s)
}
1432
1433 fn hir_capture(index: u32, expr: Hir) -> Hir {
1434 Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
1435 }
1436
1437 fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
1438 Hir::capture(hir::Capture {
1439 index,
1440 name: Some(name.into()),
1441 sub: Box::new(expr),
1442 })
1443 }
1444
1445 fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1446 Hir::repetition(hir::Repetition {
1447 min: 0,
1448 max: Some(1),
1449 greedy,
1450 sub: Box::new(expr),
1451 })
1452 }
1453
1454 fn hir_star(greedy: bool, expr: Hir) -> Hir {
1455 Hir::repetition(hir::Repetition {
1456 min: 0,
1457 max: None,
1458 greedy,
1459 sub: Box::new(expr),
1460 })
1461 }
1462
1463 fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1464 Hir::repetition(hir::Repetition {
1465 min: 1,
1466 max: None,
1467 greedy,
1468 sub: Box::new(expr),
1469 })
1470 }
1471
1472 fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
1473 Hir::repetition(hir::Repetition {
1474 min,
1475 max,
1476 greedy,
1477 sub: Box::new(expr),
1478 })
1479 }
1480
// Builds an alternation of the given expressions.
fn hir_alt(alts: Vec<Hir>) -> Hir {
    Hir::alternation(alts)
}
1484
// Builds a concatenation of the given expressions.
fn hir_cat(exprs: Vec<Hir>) -> Hir {
    Hir::concat(exprs)
}
1488
1489 #[allow(dead_code)]
1490 fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
1491 Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1492 }
1493
1494 #[allow(dead_code)]
1495 fn hir_uclass_perl_word() -> Hir {
1496 Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
1497 }
1498
1499 fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
1500 Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
1501 ascii_class_as_chars(kind)
1502 .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1503 )))
1504 }
1505
1506 fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
1507 Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
1508 ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1509 )))
1510 }
1511
1512 fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1513 Hir::class(uclass(ranges))
1514 }
1515
1516 fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1517 Hir::class(bclass(ranges))
1518 }
1519
1520 fn hir_case_fold(expr: Hir) -> Hir {
1521 match expr.into_kind() {
1522 HirKind::Class(mut cls) => {
1523 cls.case_fold_simple();
1524 Hir::class(cls)
1525 }
1526 _ => panic!("cannot case fold non-class Hir expr"),
1527 }
1528 }
1529
1530 fn hir_negate(expr: Hir) -> Hir {
1531 match expr.into_kind() {
1532 HirKind::Class(mut cls) => {
1533 cls.negate();
1534 Hir::class(cls)
1535 }
1536 _ => panic!("cannot negate non-class Hir expr"),
1537 }
1538 }
1539
1540 fn uclass(ranges: &[(char, char)]) -> hir::Class {
1541 let ranges: Vec<hir::ClassUnicodeRange> = ranges
1542 .iter()
1543 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1544 .collect();
1545 hir::Class::Unicode(hir::ClassUnicode::new(ranges))
1546 }
1547
1548 fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
1549 let ranges: Vec<hir::ClassBytesRange> = ranges
1550 .iter()
1551 .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1552 .collect();
1553 hir::Class::Bytes(hir::ClassBytes::new(ranges))
1554 }
1555
// Applies simple case folding to the given class and wraps it in an `Hir`.
// Only compiled when the `unicode-case` feature is enabled.
#[cfg(feature = "unicode-case")]
fn class_case_fold(mut cls: hir::Class) -> Hir {
    cls.case_fold_simple();
    Hir::class(cls)
}
1561
// Negates the given class and wraps it in an `Hir`.
fn class_negate(mut cls: hir::Class) -> Hir {
    cls.negate();
    Hir::class(cls)
}
1566
1567 #[allow(dead_code)]
1568 fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1569 use crate::hir::Class::{Bytes, Unicode};
1570
1571 match (expr1.into_kind(), expr2.into_kind()) {
1572 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1573 c1.union(&c2);
1574 Hir::class(hir::Class::Unicode(c1))
1575 }
1576 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1577 c1.union(&c2);
1578 Hir::class(hir::Class::Bytes(c1))
1579 }
1580 _ => panic!("cannot union non-class Hir exprs"),
1581 }
1582 }
1583
1584 #[allow(dead_code)]
1585 fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1586 use crate::hir::Class::{Bytes, Unicode};
1587
1588 match (expr1.into_kind(), expr2.into_kind()) {
1589 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1590 c1.difference(&c2);
1591 Hir::class(hir::Class::Unicode(c1))
1592 }
1593 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1594 c1.difference(&c2);
1595 Hir::class(hir::Class::Bytes(c1))
1596 }
1597 _ => panic!("cannot difference non-class Hir exprs"),
1598 }
1599 }
1600
// Builds a look-around `Hir` for the given assertion.
fn hir_look(look: hir::Look) -> Hir {
    Hir::look(look)
}
1604
// Checks how empty sub-expressions are translated: on their own, inside
// groups and as branches of an alternation.
#[test]
fn empty() {
    assert_eq!(t(""), Hir::empty());
    assert_eq!(t("(?i)"), Hir::empty());
    assert_eq!(t("()"), hir_capture(1, Hir::empty()));
    assert_eq!(t("(?:)"), Hir::empty());
    assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", Hir::empty()));
    assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
    assert_eq!(
        t("()|()"),
        hir_alt(vec![
            hir_capture(1, Hir::empty()),
            hir_capture(2, Hir::empty()),
        ])
    );
    assert_eq!(
        t("(|b)"),
        hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),]))
    );
    assert_eq!(
        t("(a|)"),
        hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),]))
    );
    assert_eq!(
        t("(a||c)"),
        hir_capture(
            1,
            hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),])
        )
    );
    assert_eq!(
        t("(||)"),
        hir_capture(
            1,
            hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),])
        )
    );
}
1643
// Checks the translation of literals, with and without Unicode mode, and
// that a non-UTF-8 byte literal is rejected when UTF-8 mode is enabled.
#[test]
fn literal() {
    assert_eq!(t("a"), hir_lit("a"));
    assert_eq!(t("(?-u)a"), hir_lit("a"));
    assert_eq!(t("☃"), hir_lit("☃"));
    assert_eq!(t("abcd"), hir_lit("abcd"));

    assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
    assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
    assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    assert_eq!(t("(?-u)☃"), hir_lit("☃"));
    assert_eq!(
        t_err(r"(?-u)\xFF"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(9, 1, 10)
            ),
        }
    );
}
1668
// Checks the translation of literals under the case insensitive flag, in
// both Unicode and byte oriented mode. Assertions that need Unicode case
// folding tables are gated on the `unicode-case` feature.
#[test]
fn literal_case_insensitive() {
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)ab@c"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_uclass(&[('B', 'B'), ('b', 'b')]),
            hir_lit("@"),
            hir_uclass(&[('C', 'C'), ('c', 'c')]),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)β"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );

    assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("a"),
        ])
    );
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
            hir_lit("@"),
            hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
        ])
    );

    assert_eq!(
        t_bytes("(?i-u)a"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(
        t_bytes("(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(
        t_bytes(r"(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    assert_eq!(t("(?i-u)β"), hir_lit("β"),);
}
1736
// Checks the translation of `.` under the `s`, `R` and `u` flags, and that
// a non-Unicode `.` is rejected when UTF-8 mode is enabled.
#[test]
fn dot() {
    assert_eq!(
        t("."),
        hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
    );
    assert_eq!(
        t("(?R)."),
        hir_uclass(&[
            ('\0', '\t'),
            ('\x0B', '\x0C'),
            ('\x0E', '\u{10FFFF}'),
        ])
    );
    assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
    assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
    assert_eq!(
        t_bytes("(?-u)."),
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
    );
    assert_eq!(
        t_bytes("(?R-u)."),
        hir_bclass(&[
            (b'\0', b'\t'),
            (b'\x0B', b'\x0C'),
            (b'\x0E', b'\xFF'),
        ])
    );
    assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));
    assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),]));

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    assert_eq!(
        t_err("(?-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(6, 1, 7)
            ),
        }
    );
    assert_eq!(
        t_err("(?R-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(7, 1, 8)
            ),
        }
    );
    assert_eq!(
        t_err("(?s-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(7, 1, 8)
            ),
        }
    );
    assert_eq!(
        t_err("(?Rs-u)."),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(7, 1, 8),
                Position::new(8, 1, 9)
            ),
        }
    );
}
1810
// Checks the translation of anchors (with and without multi-line mode) and
// of word boundaries (with and without Unicode mode).
#[test]
fn assertions() {
    assert_eq!(t("^"), hir_look(hir::Look::Start));
    assert_eq!(t("$"), hir_look(hir::Look::End));
    assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
    assert_eq!(t(r"\z"), hir_look(hir::Look::End));
    assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
    assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));
    assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
    assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));

    assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode));
    assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate));
    assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii));
    assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate));
}
1827
// Checks the translation of capturing, named and non-capturing groups,
// including the capture indices assigned when they are mixed.
#[test]
fn group() {
    assert_eq!(t("(a)"), hir_capture(1, hir_lit("a")));
    assert_eq!(
        t("(a)(b)"),
        hir_cat(vec![
            hir_capture(1, hir_lit("a")),
            hir_capture(2, hir_lit("b")),
        ])
    );
    assert_eq!(
        t("(a)|(b)"),
        hir_alt(vec![
            hir_capture(1, hir_lit("a")),
            hir_capture(2, hir_lit("b")),
        ])
    );
    assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", Hir::empty()));
    assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", hir_lit("a")));
    assert_eq!(
        t("(?P<foo>a)(?P<bar>b)"),
        hir_cat(vec![
            hir_capture_name(1, "foo", hir_lit("a")),
            hir_capture_name(2, "bar", hir_lit("b")),
        ])
    );
    assert_eq!(t("(?:)"), Hir::empty());
    assert_eq!(t("(?:a)"), hir_lit("a"));
    assert_eq!(
        t("(?:a)(b)"),
        hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),])
    );
    assert_eq!(
        t("(a)(?:b)(c)"),
        hir_cat(vec![
            hir_capture(1, hir_lit("a")),
            hir_lit("b"),
            hir_capture(2, hir_lit("c")),
        ])
    );
    assert_eq!(
        t("(a)(?P<foo>b)(c)"),
        hir_cat(vec![
            hir_capture(1, hir_lit("a")),
            hir_capture_name(2, "foo", hir_lit("b")),
            hir_capture(3, hir_lit("c")),
        ])
    );
    assert_eq!(t("()"), hir_capture(1, Hir::empty()));
    assert_eq!(t("((?i))"), hir_capture(1, Hir::empty()));
    assert_eq!(t("((?x))"), hir_capture(1, Hir::empty()));
    assert_eq!(
        t("(((?x)))"),
        hir_capture(1, hir_capture(2, Hir::empty()))
    );
}
1884
// Checks the translation of `^`, `$`, `\A` and `\z` for every combination
// of the multi-line (`m`) and CRLF (`R`) flags.
#[test]
fn line_anchors() {
    assert_eq!(t("^"), hir_look(hir::Look::Start));
    assert_eq!(t("$"), hir_look(hir::Look::End));
    assert_eq!(t(r"\A"), hir_look(hir::Look::Start));
    assert_eq!(t(r"\z"), hir_look(hir::Look::End));

    assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start));
    assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End));
    assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF));
    assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF));

    assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start));
    assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End));
    assert_eq!(t("(?R)^"), hir_look(hir::Look::Start));
    assert_eq!(t("(?R)$"), hir_look(hir::Look::End));

    assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start));
    assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End));
    assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF));
    assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF));
}
1907
// Checks how flags are scoped: flags set inside a group stay inside that
// group, and flags set inline apply to what follows them. Assertions that
// need Unicode case folding tables are gated on the `unicode-case` feature.
#[test]
fn flags() {
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i:a)a"),
        hir_cat(
            vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),]
        )
    );
    assert_eq!(
        t("(?i-u:a)β"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("β"),
        ])
    );
    assert_eq!(
        t("(?:(?i-u)a)b"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("b"),
        ])
    );
    assert_eq!(
        t("((?i-u)a)b"),
        hir_cat(vec![
            hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
            hir_lit("b"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?-i:a)a"),
        hir_cat(
            vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
        )
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::StartLF),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::StartLF),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::Start),
        ])
    );
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            hir_star(false, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(false, hir_lit("a")),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![
            hir_cat(vec![
                hir_lit("a"),
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
            ]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![
            hir_cat(vec![
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
                hir_lit("a"),
            ]),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
        ])
    );
}
1995
// Checks that escaped meta characters are translated into a plain literal.
#[test]
fn escape() {
    assert_eq!(
        t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
        hir_lit(r"\.+*?()|[]{}^$#")
    );
}
2003
// Checks the translation of every repetition operator, greedy and lazy,
// and how repetition binds relative to concatenation, groups and
// alternation.
#[test]
fn repetition() {
    assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
    assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
    assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
    assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
    assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
    assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));

    assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),));
    assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),));
    assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),));
    assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),));
    assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),));
    assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),));

    assert_eq!(
        t("ab?"),
        hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
    );
    assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
    assert_eq!(
        t("a|b?"),
        hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
    );
}
2030
// Checks how concatenations and alternations nest, both at the top level
// and inside (nested) capture groups. Look-around assertions serve as the
// leaf expressions.
#[test]
fn cat_alt() {
    let a = || hir_look(hir::Look::Start);
    let b = || hir_look(hir::Look::End);
    let c = || hir_look(hir::Look::WordUnicode);
    let d = || hir_look(hir::Look::WordUnicodeNegate);

    assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()])));
    assert_eq!(t("^|$"), hir_alt(vec![a(), b()]));
    assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()]));
    assert_eq!(
        t(r"^$|$\b|\b\B"),
        hir_alt(vec![
            hir_cat(vec![a(), b()]),
            hir_cat(vec![b(), c()]),
            hir_cat(vec![c(), d()]),
        ])
    );
    assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()])));
    assert_eq!(
        t(r"(^|$|\b)"),
        hir_capture(1, hir_alt(vec![a(), b(), c()]))
    );
    assert_eq!(
        t(r"(^$|$\b|\b\B)"),
        hir_capture(
            1,
            hir_alt(vec![
                hir_cat(vec![a(), b()]),
                hir_cat(vec![b(), c()]),
                hir_cat(vec![c(), d()]),
            ])
        )
    );
    assert_eq!(
        t(r"(^$|($\b|(\b\B)))"),
        hir_capture(
            1,
            hir_alt(vec![
                hir_cat(vec![a(), b()]),
                hir_capture(
                    2,
                    hir_alt(vec![
                        hir_cat(vec![b(), c()]),
                        hir_capture(3, hir_cat(vec![c(), d()])),
                    ])
                ),
            ])
        )
    );
}
2082
// Tests the HIR transformation of things like '[a-z]|[A-Z]' into
// '[A-Za-z]'. In other words, an alternation of just classes is always
// equivalent to a single class corresponding to the union of the branches
// in that class. (Unless some branches match invalid UTF-8 and others
// match non-ASCII Unicode.)
//
// The general category assertion is gated on the `unicode-gencat` feature.
#[test]
fn cat_class_flattened() {
    assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
    // Combining all of the letter properties should give us the one giant
    // letter property.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)
            \p{Lowercase_Letter}
            |\p{Uppercase_Letter}
            |\p{Titlecase_Letter}
            |\p{Modifier_Letter}
            |\p{Other_Letter}
        "),
        hir_uclass_query(ClassQuery::Binary("letter"))
    );
    // Byte classes that can truly match invalid UTF-8 cannot be combined
    // with Unicode classes.
    assert_eq!(
        t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
        hir_alt(vec![
            hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
            hir_bclass(&[(b'\x90', b'\xFF')]),
            hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
        ])
    );
    // Byte classes on their own can be combined, even if some are ASCII
    // and others are invalid UTF-8.
    assert_eq!(
        t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
        hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
    );
}
2121
2122 #[test]
2123 fn class_ascii() {
2124 assert_eq!(
2125 t("[[:alnum:]]"),
2126 hir_ascii_uclass(&ast::ClassAsciiKind::Alnum)
2127 );
2128 assert_eq!(
2129 t("[[:alpha:]]"),
2130 hir_ascii_uclass(&ast::ClassAsciiKind::Alpha)
2131 );
2132 assert_eq!(
2133 t("[[:ascii:]]"),
2134 hir_ascii_uclass(&ast::ClassAsciiKind::Ascii)
2135 );
2136 assert_eq!(
2137 t("[[:blank:]]"),
2138 hir_ascii_uclass(&ast::ClassAsciiKind::Blank)
2139 );
2140 assert_eq!(
2141 t("[[:cntrl:]]"),
2142 hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl)
2143 );
2144 assert_eq!(
2145 t("[[:digit:]]"),
2146 hir_ascii_uclass(&ast::ClassAsciiKind::Digit)
2147 );
2148 assert_eq!(
2149 t("[[:graph:]]"),
2150 hir_ascii_uclass(&ast::ClassAsciiKind::Graph)
2151 );
2152 assert_eq!(
2153 t("[[:lower:]]"),
2154 hir_ascii_uclass(&ast::ClassAsciiKind::Lower)
2155 );
2156 assert_eq!(
2157 t("[[:print:]]"),
2158 hir_ascii_uclass(&ast::ClassAsciiKind::Print)
2159 );
2160 assert_eq!(
2161 t("[[:punct:]]"),
2162 hir_ascii_uclass(&ast::ClassAsciiKind::Punct)
2163 );
2164 assert_eq!(
2165 t("[[:space:]]"),
2166 hir_ascii_uclass(&ast::ClassAsciiKind::Space)
2167 );
2168 assert_eq!(
2169 t("[[:upper:]]"),
2170 hir_ascii_uclass(&ast::ClassAsciiKind::Upper)
2171 );
2172 assert_eq!(
2173 t("[[:word:]]"),
2174 hir_ascii_uclass(&ast::ClassAsciiKind::Word)
2175 );
2176 assert_eq!(
2177 t("[[:xdigit:]]"),
2178 hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit)
2179 );
2180
2181 assert_eq!(
2182 t("[[:^lower:]]"),
2183 hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower))
2184 );
2185 #[cfg(feature = "unicode-case")]
2186 assert_eq!(
2187 t("(?i)[[:lower:]]"),
2188 hir_uclass(&[
2189 ('A', 'Z'),
2190 ('a', 'z'),
2191 ('\u{17F}', '\u{17F}'),
2192 ('\u{212A}', '\u{212A}'),
2193 ])
2194 );
2195
2196 assert_eq!(
2197 t("(?-u)[[:lower:]]"),
2198 hir_ascii_bclass(&ast::ClassAsciiKind::Lower)
2199 );
2200 assert_eq!(
2201 t("(?i-u)[[:lower:]]"),
2202 hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower))
2203 );
2204
2205 assert_eq!(
2206 t_err("(?-u)[[:^lower:]]"),
2207 TestError {
2208 kind: hir::ErrorKind::InvalidUtf8,
2209 span: Span::new(
2210 Position::new(6, 1, 7),
2211 Position::new(16, 1, 17)
2212 ),
2213 }
2214 );
2215 assert_eq!(
2216 t_err("(?i-u)[[:^lower:]]"),
2217 TestError {
2218 kind: hir::ErrorKind::InvalidUtf8,
2219 span: Span::new(
2220 Position::new(7, 1, 8),
2221 Position::new(17, 1, 18)
2222 ),
2223 }
2224 );
2225 }
2226
2227 #[test]
2228 fn class_ascii_multiple() {
2229 // See: https://github.com/rust-lang/regex/issues/680
2230 assert_eq!(
2231 t("[[:alnum:][:^ascii:]]"),
2232 hir_union(
2233 hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
2234 hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
2235 ),
2236 );
2237 assert_eq!(
2238 t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
2239 hir_union(
2240 hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
2241 hir_bclass(&[(0x80, 0xFF)]),
2242 ),
2243 );
2244 }
2245
2246 #[test]
2247 #[cfg(feature = "unicode-perl")]
2248 fn class_perl_unicode() {
2249 // Unicode
2250 assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
2251 assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
2252 assert_eq!(t(r"\w"), hir_uclass_perl_word());
2253 #[cfg(feature = "unicode-case")]
2254 assert_eq!(
2255 t(r"(?i)\d"),
2256 hir_uclass_query(ClassQuery::Binary("digit"))
2257 );
2258 #[cfg(feature = "unicode-case")]
2259 assert_eq!(
2260 t(r"(?i)\s"),
2261 hir_uclass_query(ClassQuery::Binary("space"))
2262 );
2263 #[cfg(feature = "unicode-case")]
2264 assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
2265
2266 // Unicode, negated
2267 assert_eq!(
2268 t(r"\D"),
2269 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2270 );
2271 assert_eq!(
2272 t(r"\S"),
2273 hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2274 );
2275 assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
2276 #[cfg(feature = "unicode-case")]
2277 assert_eq!(
2278 t(r"(?i)\D"),
2279 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2280 );
2281 #[cfg(feature = "unicode-case")]
2282 assert_eq!(
2283 t(r"(?i)\S"),
2284 hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2285 );
2286 #[cfg(feature = "unicode-case")]
2287 assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
2288 }
2289
2290 #[test]
2291 fn class_perl_ascii() {
2292 // ASCII only
2293 assert_eq!(
2294 t(r"(?-u)\d"),
2295 hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2296 );
2297 assert_eq!(
2298 t(r"(?-u)\s"),
2299 hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2300 );
2301 assert_eq!(
2302 t(r"(?-u)\w"),
2303 hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2304 );
2305 assert_eq!(
2306 t(r"(?i-u)\d"),
2307 hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2308 );
2309 assert_eq!(
2310 t(r"(?i-u)\s"),
2311 hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2312 );
2313 assert_eq!(
2314 t(r"(?i-u)\w"),
2315 hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2316 );
2317
2318 // ASCII only, negated
2319 assert_eq!(
2320 t_bytes(r"(?-u)\D"),
2321 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2322 );
2323 assert_eq!(
2324 t_bytes(r"(?-u)\S"),
2325 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2326 );
2327 assert_eq!(
2328 t_bytes(r"(?-u)\W"),
2329 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2330 );
2331 assert_eq!(
2332 t_bytes(r"(?i-u)\D"),
2333 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2334 );
2335 assert_eq!(
2336 t_bytes(r"(?i-u)\S"),
2337 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2338 );
2339 assert_eq!(
2340 t_bytes(r"(?i-u)\W"),
2341 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2342 );
2343
2344 // ASCII only, negated, with UTF-8 mode enabled.
2345 // In this case, negating any Perl class results in an error because
2346 // all such classes can match invalid UTF-8.
2347 assert_eq!(
2348 t_err(r"(?-u)\D"),
2349 TestError {
2350 kind: hir::ErrorKind::InvalidUtf8,
2351 span: Span::new(
2352 Position::new(5, 1, 6),
2353 Position::new(7, 1, 8),
2354 ),
2355 },
2356 );
2357 assert_eq!(
2358 t_err(r"(?-u)\S"),
2359 TestError {
2360 kind: hir::ErrorKind::InvalidUtf8,
2361 span: Span::new(
2362 Position::new(5, 1, 6),
2363 Position::new(7, 1, 8),
2364 ),
2365 },
2366 );
2367 assert_eq!(
2368 t_err(r"(?-u)\W"),
2369 TestError {
2370 kind: hir::ErrorKind::InvalidUtf8,
2371 span: Span::new(
2372 Position::new(5, 1, 6),
2373 Position::new(7, 1, 8),
2374 ),
2375 },
2376 );
2377 assert_eq!(
2378 t_err(r"(?i-u)\D"),
2379 TestError {
2380 kind: hir::ErrorKind::InvalidUtf8,
2381 span: Span::new(
2382 Position::new(6, 1, 7),
2383 Position::new(8, 1, 9),
2384 ),
2385 },
2386 );
2387 assert_eq!(
2388 t_err(r"(?i-u)\S"),
2389 TestError {
2390 kind: hir::ErrorKind::InvalidUtf8,
2391 span: Span::new(
2392 Position::new(6, 1, 7),
2393 Position::new(8, 1, 9),
2394 ),
2395 },
2396 );
2397 assert_eq!(
2398 t_err(r"(?i-u)\W"),
2399 TestError {
2400 kind: hir::ErrorKind::InvalidUtf8,
2401 span: Span::new(
2402 Position::new(6, 1, 7),
2403 Position::new(8, 1, 9),
2404 ),
2405 },
2406 );
2407 }
2408
2409 #[test]
2410 #[cfg(not(feature = "unicode-perl"))]
2411 fn class_perl_word_disabled() {
2412 assert_eq!(
2413 t_err(r"\w"),
2414 TestError {
2415 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2416 span: Span::new(
2417 Position::new(0, 1, 1),
2418 Position::new(2, 1, 3)
2419 ),
2420 }
2421 );
2422 }
2423
2424 #[test]
2425 #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
2426 fn class_perl_space_disabled() {
2427 assert_eq!(
2428 t_err(r"\s"),
2429 TestError {
2430 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2431 span: Span::new(
2432 Position::new(0, 1, 1),
2433 Position::new(2, 1, 3)
2434 ),
2435 }
2436 );
2437 }
2438
2439 #[test]
2440 #[cfg(all(
2441 not(feature = "unicode-perl"),
2442 not(feature = "unicode-gencat")
2443 ))]
2444 fn class_perl_digit_disabled() {
2445 assert_eq!(
2446 t_err(r"\d"),
2447 TestError {
2448 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2449 span: Span::new(
2450 Position::new(0, 1, 1),
2451 Position::new(2, 1, 3)
2452 ),
2453 }
2454 );
2455 }
2456
2457 #[test]
2458 #[cfg(feature = "unicode-gencat")]
2459 fn class_unicode_gencat() {
2460 assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
2461 assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
2462 assert_eq!(
2463 t(r"\p{Separator}"),
2464 hir_uclass_query(ClassQuery::Binary("Z"))
2465 );
2466 assert_eq!(
2467 t(r"\p{se PaRa ToR}"),
2468 hir_uclass_query(ClassQuery::Binary("Z"))
2469 );
2470 assert_eq!(
2471 t(r"\p{gc:Separator}"),
2472 hir_uclass_query(ClassQuery::Binary("Z"))
2473 );
2474 assert_eq!(
2475 t(r"\p{gc=Separator}"),
2476 hir_uclass_query(ClassQuery::Binary("Z"))
2477 );
2478 assert_eq!(
2479 t(r"\p{Other}"),
2480 hir_uclass_query(ClassQuery::Binary("Other"))
2481 );
2482 assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
2483
2484 assert_eq!(
2485 t(r"\PZ"),
2486 hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2487 );
2488 assert_eq!(
2489 t(r"\P{separator}"),
2490 hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2491 );
2492 assert_eq!(
2493 t(r"\P{gc!=separator}"),
2494 hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2495 );
2496
2497 assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
2498 assert_eq!(
2499 t(r"\p{assigned}"),
2500 hir_uclass_query(ClassQuery::Binary("Assigned"))
2501 );
2502 assert_eq!(
2503 t(r"\p{ascii}"),
2504 hir_uclass_query(ClassQuery::Binary("ASCII"))
2505 );
2506 assert_eq!(
2507 t(r"\p{gc:any}"),
2508 hir_uclass_query(ClassQuery::Binary("Any"))
2509 );
2510 assert_eq!(
2511 t(r"\p{gc:assigned}"),
2512 hir_uclass_query(ClassQuery::Binary("Assigned"))
2513 );
2514 assert_eq!(
2515 t(r"\p{gc:ascii}"),
2516 hir_uclass_query(ClassQuery::Binary("ASCII"))
2517 );
2518
2519 assert_eq!(
2520 t_err(r"(?-u)\pZ"),
2521 TestError {
2522 kind: hir::ErrorKind::UnicodeNotAllowed,
2523 span: Span::new(
2524 Position::new(5, 1, 6),
2525 Position::new(8, 1, 9)
2526 ),
2527 }
2528 );
2529 assert_eq!(
2530 t_err(r"(?-u)\p{Separator}"),
2531 TestError {
2532 kind: hir::ErrorKind::UnicodeNotAllowed,
2533 span: Span::new(
2534 Position::new(5, 1, 6),
2535 Position::new(18, 1, 19)
2536 ),
2537 }
2538 );
2539 assert_eq!(
2540 t_err(r"\pE"),
2541 TestError {
2542 kind: hir::ErrorKind::UnicodePropertyNotFound,
2543 span: Span::new(
2544 Position::new(0, 1, 1),
2545 Position::new(3, 1, 4)
2546 ),
2547 }
2548 );
2549 assert_eq!(
2550 t_err(r"\p{Foo}"),
2551 TestError {
2552 kind: hir::ErrorKind::UnicodePropertyNotFound,
2553 span: Span::new(
2554 Position::new(0, 1, 1),
2555 Position::new(7, 1, 8)
2556 ),
2557 }
2558 );
2559 assert_eq!(
2560 t_err(r"\p{gc:Foo}"),
2561 TestError {
2562 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2563 span: Span::new(
2564 Position::new(0, 1, 1),
2565 Position::new(10, 1, 11)
2566 ),
2567 }
2568 );
2569 }
2570
2571 #[test]
2572 #[cfg(not(feature = "unicode-gencat"))]
2573 fn class_unicode_gencat_disabled() {
2574 assert_eq!(
2575 t_err(r"\p{Separator}"),
2576 TestError {
2577 kind: hir::ErrorKind::UnicodePropertyNotFound,
2578 span: Span::new(
2579 Position::new(0, 1, 1),
2580 Position::new(13, 1, 14)
2581 ),
2582 }
2583 );
2584
2585 assert_eq!(
2586 t_err(r"\p{Any}"),
2587 TestError {
2588 kind: hir::ErrorKind::UnicodePropertyNotFound,
2589 span: Span::new(
2590 Position::new(0, 1, 1),
2591 Position::new(7, 1, 8)
2592 ),
2593 }
2594 );
2595 }
2596
2597 #[test]
2598 #[cfg(feature = "unicode-script")]
2599 fn class_unicode_script() {
2600 assert_eq!(
2601 t(r"\p{Greek}"),
2602 hir_uclass_query(ClassQuery::Binary("Greek"))
2603 );
2604 #[cfg(feature = "unicode-case")]
2605 assert_eq!(
2606 t(r"(?i)\p{Greek}"),
2607 hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
2608 );
2609 #[cfg(feature = "unicode-case")]
2610 assert_eq!(
2611 t(r"(?i)\P{Greek}"),
2612 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2613 "Greek"
2614 ))))
2615 );
2616
2617 assert_eq!(
2618 t_err(r"\p{sc:Foo}"),
2619 TestError {
2620 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2621 span: Span::new(
2622 Position::new(0, 1, 1),
2623 Position::new(10, 1, 11)
2624 ),
2625 }
2626 );
2627 assert_eq!(
2628 t_err(r"\p{scx:Foo}"),
2629 TestError {
2630 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2631 span: Span::new(
2632 Position::new(0, 1, 1),
2633 Position::new(11, 1, 12)
2634 ),
2635 }
2636 );
2637 }
2638
2639 #[test]
2640 #[cfg(not(feature = "unicode-script"))]
2641 fn class_unicode_script_disabled() {
2642 assert_eq!(
2643 t_err(r"\p{Greek}"),
2644 TestError {
2645 kind: hir::ErrorKind::UnicodePropertyNotFound,
2646 span: Span::new(
2647 Position::new(0, 1, 1),
2648 Position::new(9, 1, 10)
2649 ),
2650 }
2651 );
2652
2653 assert_eq!(
2654 t_err(r"\p{scx:Greek}"),
2655 TestError {
2656 kind: hir::ErrorKind::UnicodePropertyNotFound,
2657 span: Span::new(
2658 Position::new(0, 1, 1),
2659 Position::new(13, 1, 14)
2660 ),
2661 }
2662 );
2663 }
2664
2665 #[test]
2666 #[cfg(feature = "unicode-age")]
2667 fn class_unicode_age() {
2668 assert_eq!(
2669 t_err(r"\p{age:Foo}"),
2670 TestError {
2671 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2672 span: Span::new(
2673 Position::new(0, 1, 1),
2674 Position::new(11, 1, 12)
2675 ),
2676 }
2677 );
2678 }
2679
2680 #[test]
2681 #[cfg(feature = "unicode-gencat")]
2682 fn class_unicode_any_empty() {
2683 assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
2684 }
2685
2686 #[test]
2687 #[cfg(not(feature = "unicode-age"))]
2688 fn class_unicode_age_disabled() {
2689 assert_eq!(
2690 t_err(r"\p{age:3.0}"),
2691 TestError {
2692 kind: hir::ErrorKind::UnicodePropertyNotFound,
2693 span: Span::new(
2694 Position::new(0, 1, 1),
2695 Position::new(11, 1, 12)
2696 ),
2697 }
2698 );
2699 }
2700
2701 #[test]
2702 fn class_bracketed() {
2703 assert_eq!(t("[a]"), hir_lit("a"));
2704 assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
2705 assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
2706 assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
2707 assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
2708 assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
2709 assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
2710 assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
2711 assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
2712 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2713 assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
2714 #[cfg(feature = "unicode-gencat")]
2715 assert_eq!(
2716 t(r"[\pZ]"),
2717 hir_uclass_query(ClassQuery::Binary("separator"))
2718 );
2719 #[cfg(feature = "unicode-gencat")]
2720 assert_eq!(
2721 t(r"[\p{separator}]"),
2722 hir_uclass_query(ClassQuery::Binary("separator"))
2723 );
2724 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2725 assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
2726 #[cfg(feature = "unicode-gencat")]
2727 assert_eq!(
2728 t(r"[^\PZ]"),
2729 hir_uclass_query(ClassQuery::Binary("separator"))
2730 );
2731 #[cfg(feature = "unicode-gencat")]
2732 assert_eq!(
2733 t(r"[^\P{separator}]"),
2734 hir_uclass_query(ClassQuery::Binary("separator"))
2735 );
2736 #[cfg(all(
2737 feature = "unicode-case",
2738 any(feature = "unicode-perl", feature = "unicode-gencat")
2739 ))]
2740 assert_eq!(
2741 t(r"(?i)[^\D]"),
2742 hir_uclass_query(ClassQuery::Binary("digit"))
2743 );
2744 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2745 assert_eq!(
2746 t(r"(?i)[^\P{greek}]"),
2747 hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
2748 );
2749
2750 assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
2751 assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
2752 assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
2753
2754 #[cfg(feature = "unicode-case")]
2755 assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
2756 #[cfg(feature = "unicode-case")]
2757 assert_eq!(
2758 t("(?i)[k]"),
2759 hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
2760 );
2761 #[cfg(feature = "unicode-case")]
2762 assert_eq!(
2763 t("(?i)[β]"),
2764 hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
2765 );
2766 assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
2767
2768 assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
2769 assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
2770 assert_eq!(
2771 t_bytes("(?-u)[^a]"),
2772 class_negate(bclass(&[(b'a', b'a')]))
2773 );
2774 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2775 assert_eq!(
2776 t(r"[^\d]"),
2777 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2778 );
2779 #[cfg(feature = "unicode-gencat")]
2780 assert_eq!(
2781 t(r"[^\pZ]"),
2782 hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
2783 );
2784 #[cfg(feature = "unicode-gencat")]
2785 assert_eq!(
2786 t(r"[^\p{separator}]"),
2787 hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
2788 );
2789 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2790 assert_eq!(
2791 t(r"(?i)[^\p{greek}]"),
2792 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2793 "greek"
2794 ))))
2795 );
2796 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2797 assert_eq!(
2798 t(r"(?i)[\P{greek}]"),
2799 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2800 "greek"
2801 ))))
2802 );
2803
2804 // Test some weird cases.
2805 assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
2806
2807 assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
2808 assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
2809 assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
2810 assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
2811 assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
2812
2813 assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
2814 assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
2815 assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
2816 assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
2817 assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
2818
2819 assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
2820 assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
2821 assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
2822 assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
2823 assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
2824
2825 assert_eq!(
2826 t_err("(?-u)[^a]"),
2827 TestError {
2828 kind: hir::ErrorKind::InvalidUtf8,
2829 span: Span::new(
2830 Position::new(5, 1, 6),
2831 Position::new(9, 1, 10)
2832 ),
2833 }
2834 );
2835 #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
2836 assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
2837 #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
2838 assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
2839 }
2840
2841 #[test]
2842 fn class_bracketed_union() {
2843 assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
2844 #[cfg(feature = "unicode-gencat")]
2845 assert_eq!(
2846 t(r"[a\pZb]"),
2847 hir_union(
2848 hir_uclass(&[('a', 'b')]),
2849 hir_uclass_query(ClassQuery::Binary("separator"))
2850 )
2851 );
2852 #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
2853 assert_eq!(
2854 t(r"[\pZ\p{Greek}]"),
2855 hir_union(
2856 hir_uclass_query(ClassQuery::Binary("greek")),
2857 hir_uclass_query(ClassQuery::Binary("separator"))
2858 )
2859 );
2860 #[cfg(all(
2861 feature = "unicode-age",
2862 feature = "unicode-gencat",
2863 feature = "unicode-script"
2864 ))]
2865 assert_eq!(
2866 t(r"[\p{age:3.0}\pZ\p{Greek}]"),
2867 hir_union(
2868 hir_uclass_query(ClassQuery::ByValue {
2869 property_name: "age",
2870 property_value: "3.0",
2871 }),
2872 hir_union(
2873 hir_uclass_query(ClassQuery::Binary("greek")),
2874 hir_uclass_query(ClassQuery::Binary("separator"))
2875 )
2876 )
2877 );
2878 #[cfg(all(
2879 feature = "unicode-age",
2880 feature = "unicode-gencat",
2881 feature = "unicode-script"
2882 ))]
2883 assert_eq!(
2884 t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
2885 hir_union(
2886 hir_uclass_query(ClassQuery::ByValue {
2887 property_name: "age",
2888 property_value: "3.0",
2889 }),
2890 hir_union(
2891 hir_uclass_query(ClassQuery::Binary("cyrillic")),
2892 hir_union(
2893 hir_uclass_query(ClassQuery::Binary("greek")),
2894 hir_uclass_query(ClassQuery::Binary("separator"))
2895 )
2896 )
2897 )
2898 );
2899
2900 #[cfg(all(
2901 feature = "unicode-age",
2902 feature = "unicode-case",
2903 feature = "unicode-gencat",
2904 feature = "unicode-script"
2905 ))]
2906 assert_eq!(
2907 t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
2908 hir_case_fold(hir_union(
2909 hir_uclass_query(ClassQuery::ByValue {
2910 property_name: "age",
2911 property_value: "3.0",
2912 }),
2913 hir_union(
2914 hir_uclass_query(ClassQuery::Binary("greek")),
2915 hir_uclass_query(ClassQuery::Binary("separator"))
2916 )
2917 ))
2918 );
2919 #[cfg(all(
2920 feature = "unicode-age",
2921 feature = "unicode-gencat",
2922 feature = "unicode-script"
2923 ))]
2924 assert_eq!(
2925 t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
2926 hir_negate(hir_union(
2927 hir_uclass_query(ClassQuery::ByValue {
2928 property_name: "age",
2929 property_value: "3.0",
2930 }),
2931 hir_union(
2932 hir_uclass_query(ClassQuery::Binary("greek")),
2933 hir_uclass_query(ClassQuery::Binary("separator"))
2934 )
2935 ))
2936 );
2937 #[cfg(all(
2938 feature = "unicode-age",
2939 feature = "unicode-case",
2940 feature = "unicode-gencat",
2941 feature = "unicode-script"
2942 ))]
2943 assert_eq!(
2944 t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
2945 hir_negate(hir_case_fold(hir_union(
2946 hir_uclass_query(ClassQuery::ByValue {
2947 property_name: "age",
2948 property_value: "3.0",
2949 }),
2950 hir_union(
2951 hir_uclass_query(ClassQuery::Binary("greek")),
2952 hir_uclass_query(ClassQuery::Binary("separator"))
2953 )
2954 )))
2955 );
2956 }
2957
2958 #[test]
2959 fn class_bracketed_nested() {
2960 assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
2961 assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
2962 assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));
2963
2964 assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
2965 assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
2966
2967 #[cfg(feature = "unicode-case")]
2968 assert_eq!(
2969 t(r"(?i)[a[^c]]"),
2970 hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2971 );
2972 #[cfg(feature = "unicode-case")]
2973 assert_eq!(
2974 t(r"(?i)[a-b[^c]]"),
2975 hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2976 );
2977
2978 #[cfg(feature = "unicode-case")]
2979 assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
2980 #[cfg(feature = "unicode-case")]
2981 assert_eq!(
2982 t(r"(?i)[^a-b[^c]]"),
2983 hir_uclass(&[('C', 'C'), ('c', 'c')])
2984 );
2985
2986 assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
2987 #[cfg(feature = "unicode-case")]
2988 assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
2989 }
2990
2991 #[test]
2992 fn class_bracketed_intersect() {
2993 assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
2994 assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2995 assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2996 assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
2997 assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
2998 assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
2999 assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
3000 assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
3001 assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
3002
3003 assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
3004 assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
3005 assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
3006 assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
3007 assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
3008 assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
3009
3010 #[cfg(feature = "unicode-case")]
3011 assert_eq!(
3012 t("(?i)[abc&&b-c]"),
3013 hir_case_fold(hir_uclass(&[('b', 'c')]))
3014 );
3015 #[cfg(feature = "unicode-case")]
3016 assert_eq!(
3017 t("(?i)[abc&&[b-c]]"),
3018 hir_case_fold(hir_uclass(&[('b', 'c')]))
3019 );
3020 #[cfg(feature = "unicode-case")]
3021 assert_eq!(
3022 t("(?i)[[abc]&&[b-c]]"),
3023 hir_case_fold(hir_uclass(&[('b', 'c')]))
3024 );
3025 #[cfg(feature = "unicode-case")]
3026 assert_eq!(
3027 t("(?i)[a-z&&b-y&&c-x]"),
3028 hir_case_fold(hir_uclass(&[('c', 'x')]))
3029 );
3030 #[cfg(feature = "unicode-case")]
3031 assert_eq!(
3032 t("(?i)[c-da-b&&a-d]"),
3033 hir_case_fold(hir_uclass(&[('a', 'd')]))
3034 );
3035 #[cfg(feature = "unicode-case")]
3036 assert_eq!(
3037 t("(?i)[a-d&&c-da-b]"),
3038 hir_case_fold(hir_uclass(&[('a', 'd')]))
3039 );
3040
3041 assert_eq!(
3042 t("(?i-u)[abc&&b-c]"),
3043 hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3044 );
3045 assert_eq!(
3046 t("(?i-u)[abc&&[b-c]]"),
3047 hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3048 );
3049 assert_eq!(
3050 t("(?i-u)[[abc]&&[b-c]]"),
3051 hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3052 );
3053 assert_eq!(
3054 t("(?i-u)[a-z&&b-y&&c-x]"),
3055 hir_case_fold(hir_bclass(&[(b'c', b'x')]))
3056 );
3057 assert_eq!(
3058 t("(?i-u)[c-da-b&&a-d]"),
3059 hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3060 );
3061 assert_eq!(
3062 t("(?i-u)[a-d&&c-da-b]"),
3063 hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3064 );
3065
3066 // In `[a^]`, `^` does not need to be escaped, so it makes sense that
3067 // `^` is also allowed to be unescaped after `&&`.
3068 assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
3069 // `]` needs to be escaped after `&&` since it's not at start of class.
3070 assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
3071 assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
3072 assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
3073 assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
3074 // Test precedence.
3075 assert_eq!(
3076 t(r"[a-w&&[^c-g]z]"),
3077 hir_uclass(&[('a', 'b'), ('h', 'w')])
3078 );
3079 }
3080
3081 #[test]
3082 fn class_bracketed_intersect_negate() {
3083 #[cfg(feature = "unicode-perl")]
3084 assert_eq!(
3085 t(r"[^\w&&\d]"),
3086 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
3087 );
3088 assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
3089 #[cfg(feature = "unicode-perl")]
3090 assert_eq!(
3091 t(r"[^[\w&&\d]]"),
3092 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
3093 );
3094 #[cfg(feature = "unicode-perl")]
3095 assert_eq!(
3096 t(r"[^[^\w&&\d]]"),
3097 hir_uclass_query(ClassQuery::Binary("digit"))
3098 );
3099 #[cfg(feature = "unicode-perl")]
3100 assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));
3101
3102 #[cfg(feature = "unicode-perl")]
3103 assert_eq!(
3104 t_bytes(r"(?-u)[^\w&&\d]"),
3105 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
3106 );
3107 assert_eq!(
3108 t_bytes(r"(?-u)[^[a-z&&a-c]]"),
3109 hir_negate(hir_bclass(&[(b'a', b'c')]))
3110 );
3111 assert_eq!(
3112 t_bytes(r"(?-u)[^[\w&&\d]]"),
3113 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
3114 );
3115 assert_eq!(
3116 t_bytes(r"(?-u)[^[^\w&&\d]]"),
3117 hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
3118 );
3119 assert_eq!(
3120 t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
3121 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
3122 );
3123 }
3124
3125 #[test]
3126 fn class_bracketed_difference() {
3127 #[cfg(feature = "unicode-gencat")]
3128 assert_eq!(
3129 t(r"[\pL--[:ascii:]]"),
3130 hir_difference(
3131 hir_uclass_query(ClassQuery::Binary("letter")),
3132 hir_uclass(&[('\0', '\x7F')])
3133 )
3134 );
3135
3136 assert_eq!(
3137 t(r"(?-u)[[:alpha:]--[:lower:]]"),
3138 hir_bclass(&[(b'A', b'Z')])
3139 );
3140 }
3141
3142 #[test]
3143 fn class_bracketed_symmetric_difference() {
3144 #[cfg(feature = "unicode-script")]
3145 assert_eq!(
3146 t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
3147 hir_uclass(&[
3148 ('\u{0342}', '\u{0342}'),
3149 ('\u{0345}', '\u{0345}'),
3150 ('\u{1DC0}', '\u{1DC1}'),
3151 ])
3152 );
3153 assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
3154
3155 assert_eq!(
3156 t(r"(?-u)[a-g~~c-j]"),
3157 hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
3158 );
3159 }
3160
3161 #[test]
3162 fn ignore_whitespace() {
3163 assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
3164 assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
3165 assert_eq!(
3166 t(r"(?x)\x # comment
3167{ # comment
3168 53 # comment
3169} #comment"),
3170 hir_lit("S")
3171 );
3172
3173 assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
3174 assert_eq!(
3175 t(r"(?x)\x # comment
3176 53 # comment"),
3177 hir_lit("S")
3178 );
3179 assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));
3180
3181 #[cfg(feature = "unicode-gencat")]
3182 assert_eq!(
3183 t(r"(?x)\p # comment
3184{ # comment
3185 Separator # comment
3186} # comment"),
3187 hir_uclass_query(ClassQuery::Binary("separator"))
3188 );
3189
3190 assert_eq!(
3191 t(r"(?x)a # comment
3192{ # comment
3193 5 # comment
3194 , # comment
3195 10 # comment
3196} # comment"),
3197 hir_range(true, 5, Some(10), hir_lit("a"))
3198 );
3199
3200 assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
3201 }
3202
3203 #[test]
3204 fn analysis_is_utf8() {
3205 // Positive examples.
3206 assert!(props_bytes(r"a").is_utf8());
3207 assert!(props_bytes(r"ab").is_utf8());
3208 assert!(props_bytes(r"(?-u)a").is_utf8());
3209 assert!(props_bytes(r"(?-u)ab").is_utf8());
3210 assert!(props_bytes(r"\xFF").is_utf8());
3211 assert!(props_bytes(r"\xFF\xFF").is_utf8());
3212 assert!(props_bytes(r"[^a]").is_utf8());
3213 assert!(props_bytes(r"[^a][^a]").is_utf8());
3214 assert!(props_bytes(r"\b").is_utf8());
3215 assert!(props_bytes(r"\B").is_utf8());
3216 assert!(props_bytes(r"(?-u)\b").is_utf8());
3217 assert!(props_bytes(r"(?-u)\B").is_utf8());
3218
3219 // Negative examples.
3220 assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
3221 assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
3222 assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
3223 assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
3224 }
3225
3226 #[test]
3227 fn analysis_captures_len() {
3228 assert_eq!(0, props(r"a").explicit_captures_len());
3229 assert_eq!(0, props(r"(?:a)").explicit_captures_len());
3230 assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len());
3231 assert_eq!(0, props(r"(?i-u)a").explicit_captures_len());
3232 assert_eq!(1, props(r"(a)").explicit_captures_len());
3233 assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len());
3234 assert_eq!(1, props(r"()").explicit_captures_len());
3235 assert_eq!(1, props(r"()a").explicit_captures_len());
3236 assert_eq!(1, props(r"(a)+").explicit_captures_len());
3237 assert_eq!(2, props(r"(a)(b)").explicit_captures_len());
3238 assert_eq!(2, props(r"(a)|(b)").explicit_captures_len());
3239 assert_eq!(2, props(r"((a))").explicit_captures_len());
3240 assert_eq!(1, props(r"([a&&b])").explicit_captures_len());
3241 }
3242
#[test]
fn analysis_static_captures_len() {
    // Each row is `(expected, pattern)`, where `expected` is the value that
    // `static_explicit_captures_len` must report for `pattern`. The rows
    // below show `None` for patterns whose branches (or optional parts)
    // contain differing numbers of capture groups.
    let cases = [
        (Some(0), r""),
        (Some(0), r"foo|bar"),
        (None, r"(foo)|bar"),
        (None, r"foo|(bar)"),
        (Some(1), r"(foo|bar)"),
        (Some(1), r"(a|b|c|d|e|f)"),
        (Some(1), r"(a)|(b)|(c)|(d)|(e)|(f)"),
        (Some(2), r"(a)(b)|(c)(d)|(e)(f)"),
        (Some(6), r"(a)(b)(c)(d)(e)(f)"),
        (Some(3), r"(a)(b)(extra)|(a)(b)()"),
        (Some(3), r"(a)(b)((?:extra)?)"),
        (None, r"(a)(b)(extra)?"),
        (Some(1), r"(foo)|(bar)"),
        (Some(2), r"(foo)(bar)"),
        (Some(2), r"(foo)+(bar)"),
        (None, r"(foo)*(bar)"),
        (Some(0), r"(foo)?{0}"),
        (None, r"(foo)?{1}"),
        (Some(1), r"(foo){1}"),
        (Some(1), r"(foo){1,}"),
        (Some(1), r"(foo){1,}?"),
        (None, r"(foo){1,}??"),
        (None, r"(foo){0,}"),
        (Some(1), r"(foo)(?:bar)"),
        (Some(2), r"(foo(?:bar)+)(?:baz(boo))"),
        (Some(2), r"(?P<bar>foo)(?:bar)(bal|loon)"),
        (
            Some(2),
            r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#,
        ),
    ];
    for &(expected, pattern) in cases.iter() {
        assert_eq!(expected, props(pattern).static_explicit_captures_len());
    }
}
3277
#[test]
fn analysis_is_all_assertions() {
    // Positive examples: patterns built only from look-around assertions
    // (possibly grouped, repeated or alternated). Each must report a
    // non-empty look set and a minimum length of zero.
    let only_assertions = [
        r"\b",
        r"\B",
        r"^",
        r"$",
        r"\A",
        r"\z",
        r"$^\z\A\b\B",
        r"$|^|\z|\A|\b|\B",
        r"^$|$^",
        r"((\b)+())*^",
    ];
    for &pattern in only_assertions.iter() {
        let p = props(pattern);
        assert!(!p.look_set().is_empty());
        assert_eq!(p.minimum_len(), Some(0));
    }

    // Negative example: an assertion followed by a literal still has a
    // non-empty look set, but its minimum length is no longer zero.
    let anchored_literal = props(r"^a");
    assert!(!anchored_literal.look_set().is_empty());
    assert_eq!(anchored_literal.minimum_len(), Some(1));
}
3326
#[test]
fn analysis_look_set_prefix_any() {
    // The pattern may begin with either an ASCII word boundary or `_`, so
    // the "any prefix" look set is expected to include `Look::WordAscii`.
    let pattern = r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))";
    let prefix_any = props(pattern).look_set_prefix_any();
    assert!(prefix_any.contains(Look::WordAscii));
}
3332
#[test]
fn analysis_is_anchored() {
    // `is_start` holds when `Look::Start` is in the pattern's prefix look
    // set; `is_end` holds when `Look::End` is in its suffix look set.
    let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
    let is_end = |p| props(p).look_set_suffix().contains(Look::End);

    // Positive examples.
    assert!(is_start(r"^"));
    assert!(is_end(r"$"));

    assert!(is_start(r"^^"));
    assert!(is_end(r"$$"));

    assert!(is_start(r"^$"));
    assert!(is_end(r"^$"));

    assert!(is_start(r"^foo"));
    assert!(is_end(r"foo$"));

    assert!(is_start(r"^foo|^bar"));
    assert!(is_end(r"foo$|bar$"));

    assert!(is_start(r"^(foo|bar)"));
    assert!(is_end(r"(foo|bar)$"));

    assert!(is_start(r"^+"));
    assert!(is_end(r"$+"));
    assert!(is_start(r"^++"));
    assert!(is_end(r"$++"));
    assert!(is_start(r"(^)+"));
    assert!(is_end(r"($)+"));

    // Both assertions are zero-width, so `$^` must count as anchored at
    // the start *and* at the end. (The end half used to be a duplicate of
    // the start check and was therefore never exercised.)
    assert!(is_start(r"$^"));
    assert!(is_end(r"$^"));
    assert!(is_start(r"$^|^$"));
    assert!(is_end(r"$^|^$"));

    assert!(is_start(r"\b^"));
    assert!(is_end(r"$\b"));
    assert!(is_start(r"^(?m:^)"));
    assert!(is_end(r"(?m:$)$"));
    assert!(is_start(r"(?m:^)^"));
    assert!(is_end(r"$(?m:$)"));

    // Negative examples.
    assert!(!is_start(r"(?m)^"));
    assert!(!is_end(r"(?m)$"));
    assert!(!is_start(r"(?m:^$)|$^"));
    assert!(!is_end(r"(?m:^$)|$^"));
    assert!(!is_start(r"$^|(?m:^$)"));
    assert!(!is_end(r"$^|(?m:^$)"));

    assert!(!is_start(r"a^"));
    assert!(!is_start(r"$a"));

    assert!(!is_end(r"a^"));
    assert!(!is_end(r"$a"));

    assert!(!is_start(r"^foo|bar"));
    assert!(!is_end(r"foo|bar$"));

    // A repetition that may match zero times does not guarantee the anchor.
    assert!(!is_start(r"^*"));
    assert!(!is_end(r"$*"));
    assert!(!is_start(r"^*+"));
    assert!(!is_end(r"$*+"));
    assert!(!is_start(r"^+*"));
    assert!(!is_end(r"$+*"));
    assert!(!is_start(r"(^)*"));
    assert!(!is_end(r"($)*"));
}
3402
#[test]
fn analysis_is_any_anchored() {
    // Reports whether `look` occurs anywhere in the pattern's look set.
    let has = |pattern: &str, look: Look| {
        props(pattern).look_set().contains(look)
    };

    // Positive examples: the non-multi-line anchors show up as `Start`
    // and `End` respectively.
    let present = [
        (r"^", Look::Start),
        (r"$", Look::End),
        (r"\A", Look::Start),
        (r"\z", Look::End),
    ];
    for &(pattern, look) in present.iter() {
        assert!(has(pattern, look));
    }

    // Negative examples: multi-line anchors are not `Start`/`End`, and an
    // end anchor is not a start anchor (and vice versa).
    let absent = [
        (r"(?m)^", Look::Start),
        (r"(?m)$", Look::End),
        (r"$", Look::Start),
        (r"^", Look::End),
    ];
    for &(pattern, look) in absent.iter() {
        assert!(!has(pattern, look));
    }
}
3420
#[test]
fn analysis_can_empty() {
    // Minimum match length of a pattern translated via `props_bytes`.
    let min_len = |pattern: &str| props_bytes(pattern).minimum_len();

    // Positive examples: every pattern here must have a minimum length of
    // exactly zero. The list is split in two only so that the
    // feature-gated case keeps its original position in the sequence.
    let empty_before_gated = [
        r"",
        r"()",
        r"()*",
        r"()+",
        r"()?",
        r"a*",
        r"a?",
        r"a{0}",
        r"a{0,}",
        r"a{0,1}",
        r"a{0,10}",
    ];
    for &pattern in empty_before_gated.iter() {
        assert_eq!(Some(0), min_len(pattern));
    }
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(Some(0), min_len(r"\pL*"));
    let empty_after_gated = [
        r"a*|b",
        r"b|a*",
        r"a|",
        r"|a",
        r"a||b",
        r"a*a?(abcd)*",
        r"^",
        r"$",
        r"(?m)^",
        r"(?m)$",
        r"\A",
        r"\z",
        r"\B",
        r"(?-u)\B",
        r"\b",
        r"(?-u)\b",
    ];
    for &pattern in empty_after_gated.iter() {
        assert_eq!(Some(0), min_len(pattern));
    }

    // Negative examples: the minimum length must be anything other than
    // `Some(0)`.
    let non_empty_before_gated = [
        r"a+",
        r"a{1}",
        r"a{1,}",
        r"a{1,2}",
        r"a{1,10}",
        r"b|a",
        r"a*a+(abcd)*",
    ];
    for &pattern in non_empty_before_gated.iter() {
        assert_ne!(Some(0), min_len(pattern));
    }
    #[cfg(feature = "unicode-gencat")]
    assert_ne!(Some(0), min_len(r"\P{any}"));
    let non_empty_after_gated = [r"[a--a]", r"[a&&b]"];
    for &pattern in non_empty_after_gated.iter() {
        assert_ne!(Some(0), min_len(pattern));
    }
}
3471
#[test]
fn analysis_is_literal() {
    // Positive examples: plain literals, including ones wrapped in
    // non-capturing groups or written as a single-character class.
    let literals = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"(?:a)",
        r"foo(?:a)",
        r"(?:a)foo",
        r"[a]",
    ];
    for &pattern in literals.iter() {
        assert!(props(pattern).is_literal());
    }

    // Negative examples: the empty pattern, assertions, alternations,
    // capture groups, repetitions and multi-character classes.
    let non_literals = [
        r"",
        r"^",
        r"a|b",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
    ];
    for &pattern in non_literals.iter() {
        assert!(!props(pattern).is_literal());
    }
}
3494
#[test]
fn analysis_is_alternation_literal() {
    // Positive examples: a single literal, or an alternation whose
    // branches are all multi-character literals.
    let accepted = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"foo|bar",
        r"foo|bar|baz",
        r"[a]",
        r"(?:ab)|cd",
        r"ab|(?:cd)",
    ];
    for &pattern in accepted.iter() {
        assert!(props(pattern).is_alternation_literal());
    }

    // Negative examples. These include alternations of single characters
    // such as `a|b`, which are expected NOT to qualify.
    let rejected = [
        r"",
        r"^",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
        r"[ab]|b",
        r"a|[ab]",
        r"(a)|b",
        r"a|(b)",
        r"a|b",
        r"a|b|c",
        r"[a]|b",
        r"a|[b]",
        r"(?:a)|b",
        r"a|(?:b)",
        r"(?:z|xx)@|xx",
    ];
    for &pattern in rejected.iter() {
        assert!(!props(pattern).is_alternation_literal());
    }
}
3528
// This tests that the smart Hir::repetition constructor performs some
// basic simplifications.
#[test]
fn smart_repetition() {
    // Repeating zero times collapses to the empty expression.
    let zero_times = t(r"a{0}");
    assert_eq!(zero_times, Hir::empty());

    // Repeating exactly once collapses to the repeated expression itself.
    let exactly_once = t(r"a{1}");
    assert_eq!(exactly_once, hir_lit("a"));

    // A look-around assertion repeated many times collapses to one look.
    let repeated_look = t(r"\B{32111}");
    assert_eq!(repeated_look, hir_look(hir::Look::WordUnicodeNegate));
}
3537
// This tests that the smart Hir::concat constructor simplifies the given
// exprs in the way we expect.
#[test]
fn smart_concat() {
    // Empty concatenations become the empty expression.
    for &pattern in ["", "(?:)"].iter() {
        assert_eq!(t(pattern), Hir::empty());
    }

    // Adjacent literals, even across non-capturing groups, are merged into
    // one literal. Each row is `(pattern, merged literal)`.
    let merged = [
        ("abc", "abc"),
        ("(?:foo)(?:bar)", "foobar"),
        ("quux(?:foo)(?:bar)baz", "quuxfoobarbaz"),
    ];
    for &(pattern, literal) in merged.iter() {
        assert_eq!(t(pattern), hir_lit(literal));
    }

    // Nested groups are flattened, with literals merged on either side of
    // the anchor. Both patterns must produce the same three-part concat.
    let nested = ["foo(?:bar^baz)quux", "foo(?:ba(?:r^b)az)quux"];
    for &pattern in nested.iter() {
        assert_eq!(
            t(pattern),
            hir_cat(vec![
                hir_lit("foobar"),
                hir_look(hir::Look::Start),
                hir_lit("bazquux"),
            ])
        );
    }
}
3564
// This tests that the smart Hir::alternation constructor simplifies the
// given exprs in the way we expect.
#[test]
fn smart_alternation() {
    // Builds an alternation whose branches are the given literals.
    let alt_of_lits = |words: &[&str]| {
        hir_alt(words.iter().map(|&word| hir_lit(word)).collect())
    };
    // The class `[A-Z]`, used as the shared prefix in the cases below.
    let upper = || hir_uclass(&[('A', 'Z')]);

    // Non-capturing groups around branches are dropped.
    assert_eq!(t("(?:foo)|(?:bar)"), alt_of_lits(&["foo", "bar"]));

    // Nested alternations are flattened into the parent alternation.
    assert_eq!(
        t("quux|(?:abc|def|xyz)|baz"),
        alt_of_lits(&["quux", "abc", "def", "xyz", "baz"])
    );
    assert_eq!(
        t("quux|(?:abc|(?:def|mno)|xyz)|baz"),
        alt_of_lits(&["quux", "abc", "def", "mno", "xyz", "baz"])
    );

    // An alternation of single characters becomes a character class.
    assert_eq!(
        t("a|b|c|d|e|f|x|y|z"),
        hir_uclass(&[('a', 'f'), ('x', 'z')])
    );

    // Tests that we lift common prefixes out of an alternation.
    assert_eq!(
        t("[A-Z]foo|[A-Z]quux"),
        hir_cat(vec![upper(), alt_of_lits(&["foo", "quux"])])
    );
    assert_eq!(
        t("[A-Z][A-Z]|[A-Z]quux"),
        hir_cat(vec![upper(), hir_alt(vec![upper(), hir_lit("quux")])])
    );
    assert_eq!(
        t("[A-Z][A-Z]|[A-Z][A-Z]quux"),
        hir_cat(vec![
            upper(),
            upper(),
            hir_alt(vec![Hir::empty(), hir_lit("quux")]),
        ])
    );
    // Only the class is lifted here; the literal branches stay intact.
    assert_eq!(
        t("[A-Z]foo|[A-Z]foobar"),
        hir_cat(vec![upper(), alt_of_lits(&["foo", "foobar"])])
    );
}
3629
#[test]
fn regression_alt_empty_concat() {
    use crate::ast::{self, Ast};

    // Hand-build an alternation whose only branch is an empty
    // concatenation, then check that it translates to the empty expression.
    let span = Span::splat(Position::new(0, 0, 0));
    let empty_concat = Ast::concat(ast::Concat { span, asts: vec![] });
    let ast = Ast::alternation(ast::Alternation {
        span,
        asts: vec![empty_concat],
    });

    let translated = Translator::new().translate("", &ast);
    assert_eq!(Ok(Hir::empty()), translated);
}
3643
#[test]
fn regression_empty_alt() {
    use crate::ast::{self, Ast};

    // Hand-build a concatenation wrapping an alternation with no branches
    // at all, then check that it translates to `Hir::fail()`.
    let span = Span::splat(Position::new(0, 0, 0));
    let empty_alt = Ast::alternation(ast::Alternation { span, asts: vec![] });
    let ast = Ast::concat(ast::Concat { span, asts: vec![empty_alt] });

    let translated = Translator::new().translate("", &ast);
    assert_eq!(Ok(Hir::fail()), translated);
}
3660
#[test]
fn regression_singleton_alt() {
    use crate::{
        ast::{self, Ast},
        hir::Dot,
    };

    // Hand-build a concatenation wrapping an alternation with a single
    // `.` branch, then check that it translates to just the dot itself.
    let span = Span::splat(Position::new(0, 0, 0));
    let single_branch = Ast::alternation(ast::Alternation {
        span,
        asts: vec![Ast::dot(span)],
    });
    let ast = Ast::concat(ast::Concat { span, asts: vec![single_branch] });

    let translated = Translator::new().translate("", &ast);
    assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), translated);
}
3680
// Regression test for a fuzzer-found input.
//
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168
#[test]
fn regression_fuzz_match() {
    let pat = "[(\u{6} \0-\u{afdf5}] \0 ";

    // Parse with the same options the fuzzer used.
    let mut parser =
        ParserBuilder::new().octal(false).ignore_whitespace(true).build();
    let ast = parser.parse(pat).unwrap();

    // Translate with the same options the fuzzer used.
    let mut translator = TranslatorBuilder::new()
        .utf8(true)
        .case_insensitive(false)
        .multi_line(false)
        .dot_matches_new_line(false)
        .swap_greed(true)
        .unicode(true)
        .build();
    let hir = translator.translate(pat, &ast).unwrap();

    // The expected result is a single class range followed by a NUL
    // literal.
    let expected = Hir::concat(vec![
        hir_uclass(&[('\0', '\u{afdf5}')]),
        hir_lit("\0"),
    ]);
    assert_eq!(hir, expected);
}
3709
// Regression test for a fuzzer-found input: translating this pattern only
// has to complete without panicking; the resulting HIR is not inspected.
//
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155
#[cfg(feature = "unicode")]
#[test]
fn regression_fuzz_difference1() {
    let pattern = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*";
    // shouldn't panic
    drop(t(pattern));
}
3717
// Regression test for a fuzzer-found input: translating this pattern only
// has to complete without panicking; the resulting HIR is not inspected.
//
// See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153
#[test]
fn regression_fuzz_char_decrement1() {
    let pattern = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0]<D\0\0\0\0\0\0\u{1}]\0\0\0\0]\0\0-*\0]\0\0 ";
    // shouldn't panic
    drop(t(pattern));
}
3724}
3725