1/*!
2Defines a translator that converts an `Ast` to an `Hir`.
3*/
4
5use core::cell::{Cell, RefCell};
6
7use alloc::{boxed::Box, string::ToString, vec, vec::Vec};
8
9use crate::{
10 ast::{self, Ast, Span, Visitor},
11 either::Either,
12 hir::{self, Error, ErrorKind, Hir, HirKind},
13 unicode::{self, ClassQuery},
14};
15
/// Result type used throughout this module: the error is always the HIR
/// translation [`Error`] imported from `crate::hir`.
type Result<T> = core::result::Result<T, Error>;
17
/// A builder for constructing an AST->HIR translator.
///
/// The builder records a UTF-8 mode setting and a set of default flags. Both
/// are copied into every translator produced by `build`.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    /// Whether built translators must reject patterns that could match
    /// invalid UTF-8. Set via `utf8`.
    utf8: bool,
    /// The default flag settings handed to built translators.
    flags: Flags,
}
24
25impl Default for TranslatorBuilder {
26 fn default() -> TranslatorBuilder {
27 TranslatorBuilder::new()
28 }
29}
30
31impl TranslatorBuilder {
32 /// Create a new translator builder with a default c onfiguration.
33 pub fn new() -> TranslatorBuilder {
34 TranslatorBuilder { utf8: true, flags: Flags::default() }
35 }
36
37 /// Build a translator using the current configuration.
38 pub fn build(&self) -> Translator {
39 Translator {
40 stack: RefCell::new(vec![]),
41 flags: Cell::new(self.flags),
42 utf8: self.utf8,
43 }
44 }
45
46 /// When disabled, translation will permit the construction of a regular
47 /// expression that may match invalid UTF-8.
48 ///
49 /// When enabled (the default), the translator is guaranteed to produce an
50 /// expression that, for non-empty matches, will only ever produce spans
51 /// that are entirely valid UTF-8 (otherwise, the translator will return an
52 /// error).
53 ///
54 /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
55 /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
56 /// syntax) will be allowed even though they can produce matches that split
57 /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
58 /// matches, and it is expected that the regex engine itself must handle
59 /// these cases if necessary (perhaps by suppressing any zero-width matches
60 /// that split a codepoint).
61 pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder {
62 self.utf8 = yes;
63 self
64 }
65
66 /// Enable or disable the case insensitive flag (`i`) by default.
67 pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
68 self.flags.case_insensitive = if yes { Some(true) } else { None };
69 self
70 }
71
72 /// Enable or disable the multi-line matching flag (`m`) by default.
73 pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
74 self.flags.multi_line = if yes { Some(true) } else { None };
75 self
76 }
77
78 /// Enable or disable the "dot matches any character" flag (`s`) by
79 /// default.
80 pub fn dot_matches_new_line(
81 &mut self,
82 yes: bool,
83 ) -> &mut TranslatorBuilder {
84 self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
85 self
86 }
87
88 /// Enable or disable the CRLF mode flag (`R`) by default.
89 pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder {
90 self.flags.crlf = if yes { Some(true) } else { None };
91 self
92 }
93
94 /// Enable or disable the "swap greed" flag (`U`) by default.
95 pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
96 self.flags.swap_greed = if yes { Some(true) } else { None };
97 self
98 }
99
100 /// Enable or disable the Unicode flag (`u`) by default.
101 pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
102 self.flags.unicode = if yes { None } else { Some(false) };
103 self
104 }
105}
106
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a translator can translate
/// many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`].
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings.
    flags: Cell<Flags>,
    /// Whether we're allowed to produce HIR that can match arbitrary bytes.
    utf8: bool,
}
124
125impl Translator {
126 /// Create a new translator using the default configuration.
127 pub fn new() -> Translator {
128 TranslatorBuilder::new().build()
129 }
130
131 /// Translate the given abstract syntax tree (AST) into a high level
132 /// intermediate representation (HIR).
133 ///
134 /// If there was a problem doing the translation, then an HIR-specific
135 /// error is returned.
136 ///
137 /// The original pattern string used to produce the `Ast` *must* also be
138 /// provided. The translator does not use the pattern string during any
139 /// correct translation, but is used for error reporting.
140 pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
141 ast::visit(ast, visitor:TranslatorI::new(self, pattern))
142 }
143}
144
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A literal that is being constructed, character by character, from the
    /// AST. We need this because the AST gives each individual character its
    /// own node. So as we see characters, we peek at the top-most HirFrame.
    /// If it's a literal, then we add to it. Otherwise, we push a new literal.
    /// When it comes time to pop it, we convert it to an Hir via Hir::literal.
    Literal(Vec<u8>),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `utf8` is enabled (the default), then a byte character is only
    /// permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed whenever a repetition is observed. After visiting every
    /// sub-expression in the repetition, the translator's stack is expected to
    /// have this sentinel at the top.
    ///
    /// This sentinel only exists to stop other things (like flattening
    /// literals) from reaching across repetition operators.
    Repetition,
    /// This is pushed on to the stack upon first seeing any kind of capture,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group. If the group doesn't set any flags, then this is simply
        /// equivalent to whatever flags were set when the group was opened.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Flags,
    },
    /// This is pushed whenever a concatenation is observed. After visiting
    /// every sub-expression in the concatenation, the translator's stack is
    /// popped until it sees a Concat frame.
    Concat,
    /// This is pushed whenever an alternation is observed. After visiting
    /// every sub-expression in the alternation, the translator's stack is
    /// popped until it sees an Alternation frame.
    Alternation,
    /// This is pushed immediately before each sub-expression in an
    /// alternation. This separates the branches of an alternation on the
    /// stack and prevents literal flattening from reaching across alternation
    /// branches.
    ///
    /// When an alternation is post-visited, one of these markers is popped
    /// after each branch expression, until the 'Alternation' frame itself is
    /// reached.
    AlternationBranch,
}
217
218impl HirFrame {
219 /// Assert that the current stack frame is an Hir expression and return it.
220 fn unwrap_expr(self) -> Hir {
221 match self {
222 HirFrame::Expr(expr) => expr,
223 HirFrame::Literal(lit) => Hir::literal(lit),
224 _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self),
225 }
226 }
227
228 /// Assert that the current stack frame is a Unicode class expression and
229 /// return it.
230 fn unwrap_class_unicode(self) -> hir::ClassUnicode {
231 match self {
232 HirFrame::ClassUnicode(cls) => cls,
233 _ => panic!(
234 "tried to unwrap Unicode class \
235 from HirFrame, got: {:?}",
236 self
237 ),
238 }
239 }
240
241 /// Assert that the current stack frame is a byte class expression and
242 /// return it.
243 fn unwrap_class_bytes(self) -> hir::ClassBytes {
244 match self {
245 HirFrame::ClassBytes(cls) => cls,
246 _ => panic!(
247 "tried to unwrap byte class \
248 from HirFrame, got: {:?}",
249 self
250 ),
251 }
252 }
253
254 /// Assert that the current stack frame is a repetition sentinel. If it
255 /// isn't, then panic.
256 fn unwrap_repetition(self) {
257 match self {
258 HirFrame::Repetition => {}
259 _ => {
260 panic!(
261 "tried to unwrap repetition from HirFrame, got: {:?}",
262 self
263 )
264 }
265 }
266 }
267
268 /// Assert that the current stack frame is a group indicator and return
269 /// its corresponding flags (the flags that were active at the time the
270 /// group was entered).
271 fn unwrap_group(self) -> Flags {
272 match self {
273 HirFrame::Group { old_flags } => old_flags,
274 _ => {
275 panic!("tried to unwrap group from HirFrame, got: {:?}", self)
276 }
277 }
278 }
279
280 /// Assert that the current stack frame is an alternation pipe sentinel. If
281 /// it isn't, then panic.
282 fn unwrap_alternation_pipe(self) {
283 match self {
284 HirFrame::AlternationBranch => {}
285 _ => {
286 panic!(
287 "tried to unwrap alt pipe from HirFrame, got: {:?}",
288 self
289 )
290 }
291 }
292 }
293}
294
295impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
296 type Output = Hir;
297 type Err = Error;
298
299 fn finish(self) -> Result<Hir> {
300 // ... otherwise, we should have exactly one HIR on the stack.
301 assert_eq!(self.trans().stack.borrow().len(), 1);
302 Ok(self.pop().unwrap().unwrap_expr())
303 }
304
305 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
306 match *ast {
307 Ast::Class(ast::Class::Bracketed(_)) => {
308 if self.flags().unicode() {
309 let cls = hir::ClassUnicode::empty();
310 self.push(HirFrame::ClassUnicode(cls));
311 } else {
312 let cls = hir::ClassBytes::empty();
313 self.push(HirFrame::ClassBytes(cls));
314 }
315 }
316 Ast::Repetition(_) => self.push(HirFrame::Repetition),
317 Ast::Group(ref x) => {
318 let old_flags = x
319 .flags()
320 .map(|ast| self.set_flags(ast))
321 .unwrap_or_else(|| self.flags());
322 self.push(HirFrame::Group { old_flags });
323 }
324 Ast::Concat(ref x) if x.asts.is_empty() => {}
325 Ast::Concat(_) => {
326 self.push(HirFrame::Concat);
327 }
328 Ast::Alternation(ref x) if x.asts.is_empty() => {}
329 Ast::Alternation(_) => {
330 self.push(HirFrame::Alternation);
331 self.push(HirFrame::AlternationBranch);
332 }
333 _ => {}
334 }
335 Ok(())
336 }
337
/// Translate an AST node after all of its children have been visited.
///
/// Leaf nodes push their HIR (or extend the literal on top of the stack).
/// Compound nodes pop the frames produced by their children, together
/// with the marker pushed in `visit_pre`, and push the combined
/// expression in their place.
fn visit_post(&mut self, ast: &Ast) -> Result<()> {
    match *ast {
        Ast::Empty(_) => {
            self.push(HirFrame::Expr(Hir::empty()));
        }
        Ast::Flags(ref x) => {
            self.set_flags(&x.flags);
            // Flags in the AST are generally considered directives and
            // not actual sub-expressions. However, they can be used in
            // the concrete syntax like `((?i))`, and we need some kind of
            // indication of an expression there, and Empty is the correct
            // choice.
            //
            // There can also be things like `(?i)+`, but we rule those out
            // in the parser. In the future, we might allow them for
            // consistency sake.
            self.push(HirFrame::Expr(Hir::empty()));
        }
        Ast::Literal(ref x) => {
            match self.ast_literal_to_scalar(x)? {
                // A raw (non-ASCII) byte is appended to the literal as is.
                Either::Right(byte) => self.push_byte(byte),
                Either::Left(ch) => {
                    // A multi-byte codepoint cannot be expressed when
                    // Unicode mode is disabled.
                    if !self.flags().unicode() && ch.len_utf8() > 1 {
                        return Err(self
                            .error(x.span, ErrorKind::UnicodeNotAllowed));
                    }
                    // When case folding applies, the char becomes a class
                    // expression; otherwise it extends the literal.
                    match self.case_fold_char(x.span, ch)? {
                        None => self.push_char(ch),
                        Some(expr) => self.push(HirFrame::Expr(expr)),
                    }
                }
            }
        }
        Ast::Dot(span) => {
            self.push(HirFrame::Expr(self.hir_dot(span)?));
        }
        Ast::Assertion(ref x) => {
            self.push(HirFrame::Expr(self.hir_assertion(x)?));
        }
        Ast::Class(ast::Class::Perl(ref x)) => {
            if self.flags().unicode() {
                let cls = self.hir_perl_unicode_class(x)?;
                let hcls = hir::Class::Unicode(cls);
                self.push(HirFrame::Expr(Hir::class(hcls)));
            } else {
                let cls = self.hir_perl_byte_class(x)?;
                let hcls = hir::Class::Bytes(cls);
                self.push(HirFrame::Expr(Hir::class(hcls)));
            }
        }
        Ast::Class(ast::Class::Unicode(ref x)) => {
            let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
            self.push(HirFrame::Expr(Hir::class(cls)));
        }
        Ast::Class(ast::Class::Bracketed(ref ast)) => {
            // The accumulator pushed in `visit_pre` now holds every item
            // of the class; finish it and turn it into an expression.
            if self.flags().unicode() {
                let mut cls = self.pop().unwrap().unwrap_class_unicode();
                self.unicode_fold_and_negate(
                    &ast.span,
                    ast.negated,
                    &mut cls,
                )?;
                let expr = Hir::class(hir::Class::Unicode(cls));
                self.push(HirFrame::Expr(expr));
            } else {
                let mut cls = self.pop().unwrap().unwrap_class_bytes();
                self.bytes_fold_and_negate(
                    &ast.span,
                    ast.negated,
                    &mut cls,
                )?;
                let expr = Hir::class(hir::Class::Bytes(cls));
                self.push(HirFrame::Expr(expr));
            }
        }
        Ast::Repetition(ref x) => {
            // The sub-expression sits directly above the sentinel.
            let expr = self.pop().unwrap().unwrap_expr();
            self.pop().unwrap().unwrap_repetition();
            self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
        }
        Ast::Group(ref x) => {
            let expr = self.pop().unwrap().unwrap_expr();
            // Leaving the group restores the flags that were active when
            // it was entered.
            let old_flags = self.pop().unwrap().unwrap_group();
            self.trans().flags.set(old_flags);
            self.push(HirFrame::Expr(self.hir_capture(x, expr)));
        }
        Ast::Concat(_) => {
            // Pop until the Concat marker, dropping empty expressions.
            // Popping yields the children last-to-first, hence the
            // reversal.
            let mut exprs = vec![];
            while let Some(expr) = self.pop_concat_expr() {
                if !matches!(*expr.kind(), HirKind::Empty) {
                    exprs.push(expr);
                }
            }
            exprs.reverse();
            self.push(HirFrame::Expr(Hir::concat(exprs)));
        }
        Ast::Alternation(_) => {
            // Each branch expression is preceded on the stack by its
            // AlternationBranch marker. Popping yields the branches
            // last-to-first, hence the reversal.
            let mut exprs = vec![];
            while let Some(expr) = self.pop_alt_expr() {
                self.pop().unwrap().unwrap_alternation_pipe();
                exprs.push(expr);
            }
            exprs.reverse();
            self.push(HirFrame::Expr(Hir::alternation(exprs)));
        }
    }
    Ok(())
}
447
/// Push an `AlternationBranch` marker between two branches of an
/// alternation, so the frames of the next branch are kept apart from
/// those of the previous one.
fn visit_alternation_in(&mut self) -> Result<()> {
    self.push(HirFrame::AlternationBranch);
    Ok(())
}
452
453 fn visit_class_set_item_pre(
454 &mut self,
455 ast: &ast::ClassSetItem,
456 ) -> Result<()> {
457 match *ast {
458 ast::ClassSetItem::Bracketed(_) => {
459 if self.flags().unicode() {
460 let cls = hir::ClassUnicode::empty();
461 self.push(HirFrame::ClassUnicode(cls));
462 } else {
463 let cls = hir::ClassBytes::empty();
464 self.push(HirFrame::ClassBytes(cls));
465 }
466 }
467 // We needn't handle the Union case here since the visitor will
468 // do it for us.
469 _ => {}
470 }
471 Ok(())
472 }
473
/// Fold a visited class set item into the class accumulator on top of the
/// stack.
///
/// Each arm pops the current accumulator, adds the item's ranges to it,
/// and pushes it back. Whether the accumulator is a Unicode or a byte
/// class follows the active Unicode flag.
fn visit_class_set_item_post(
    &mut self,
    ast: &ast::ClassSetItem,
) -> Result<()> {
    match *ast {
        ast::ClassSetItem::Empty(_) => {}
        ast::ClassSetItem::Literal(ref x) => {
            if self.flags().unicode() {
                let mut cls = self.pop().unwrap().unwrap_class_unicode();
                cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
                self.push(HirFrame::ClassUnicode(cls));
            } else {
                let mut cls = self.pop().unwrap().unwrap_class_bytes();
                let byte = self.class_literal_byte(x)?;
                cls.push(hir::ClassBytesRange::new(byte, byte));
                self.push(HirFrame::ClassBytes(cls));
            }
        }
        ast::ClassSetItem::Range(ref x) => {
            if self.flags().unicode() {
                let mut cls = self.pop().unwrap().unwrap_class_unicode();
                cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
                self.push(HirFrame::ClassUnicode(cls));
            } else {
                let mut cls = self.pop().unwrap().unwrap_class_bytes();
                let start = self.class_literal_byte(&x.start)?;
                let end = self.class_literal_byte(&x.end)?;
                cls.push(hir::ClassBytesRange::new(start, end));
                self.push(HirFrame::ClassBytes(cls));
            }
        }
        ast::ClassSetItem::Ascii(ref x) => {
            if self.flags().unicode() {
                let xcls = self.hir_ascii_unicode_class(x)?;
                let mut cls = self.pop().unwrap().unwrap_class_unicode();
                cls.union(&xcls);
                self.push(HirFrame::ClassUnicode(cls));
            } else {
                let xcls = self.hir_ascii_byte_class(x)?;
                let mut cls = self.pop().unwrap().unwrap_class_bytes();
                cls.union(&xcls);
                self.push(HirFrame::ClassBytes(cls));
            }
        }
        ast::ClassSetItem::Unicode(ref x) => {
            // `hir_unicode_class` returns an error when Unicode mode is
            // disabled, so only the Unicode accumulator is handled here.
            let xcls = self.hir_unicode_class(x)?;
            let mut cls = self.pop().unwrap().unwrap_class_unicode();
            cls.union(&xcls);
            self.push(HirFrame::ClassUnicode(cls));
        }
        ast::ClassSetItem::Perl(ref x) => {
            if self.flags().unicode() {
                let xcls = self.hir_perl_unicode_class(x)?;
                let mut cls = self.pop().unwrap().unwrap_class_unicode();
                cls.union(&xcls);
                self.push(HirFrame::ClassUnicode(cls));
            } else {
                let xcls = self.hir_perl_byte_class(x)?;
                let mut cls = self.pop().unwrap().unwrap_class_bytes();
                cls.union(&xcls);
                self.push(HirFrame::ClassBytes(cls));
            }
        }
        ast::ClassSetItem::Bracketed(ref ast) => {
            // The top frame (`cls1`) is the nested class pushed by
            // `visit_class_set_item_pre`; the frame beneath it (`cls2`)
            // is the enclosing accumulator that absorbs it.
            if self.flags().unicode() {
                let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
                self.unicode_fold_and_negate(
                    &ast.span,
                    ast.negated,
                    &mut cls1,
                )?;

                let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
                cls2.union(&cls1);
                self.push(HirFrame::ClassUnicode(cls2));
            } else {
                let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
                self.bytes_fold_and_negate(
                    &ast.span,
                    ast.negated,
                    &mut cls1,
                )?;

                let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
                cls2.union(&cls1);
                self.push(HirFrame::ClassBytes(cls2));
            }
        }
        // This is handled automatically by the visitor.
        ast::ClassSetItem::Union(_) => {}
    }
    Ok(())
}
567
568 fn visit_class_set_binary_op_pre(
569 &mut self,
570 _op: &ast::ClassSetBinaryOp,
571 ) -> Result<()> {
572 if self.flags().unicode() {
573 let cls = hir::ClassUnicode::empty();
574 self.push(HirFrame::ClassUnicode(cls));
575 } else {
576 let cls = hir::ClassBytes::empty();
577 self.push(HirFrame::ClassBytes(cls));
578 }
579 Ok(())
580 }
581
582 fn visit_class_set_binary_op_in(
583 &mut self,
584 _op: &ast::ClassSetBinaryOp,
585 ) -> Result<()> {
586 if self.flags().unicode() {
587 let cls = hir::ClassUnicode::empty();
588 self.push(HirFrame::ClassUnicode(cls));
589 } else {
590 let cls = hir::ClassBytes::empty();
591 self.push(HirFrame::ClassBytes(cls));
592 }
593 Ok(())
594 }
595
596 fn visit_class_set_binary_op_post(
597 &mut self,
598 op: &ast::ClassSetBinaryOp,
599 ) -> Result<()> {
600 use crate::ast::ClassSetBinaryOpKind::*;
601
602 if self.flags().unicode() {
603 let mut rhs = self.pop().unwrap().unwrap_class_unicode();
604 let mut lhs = self.pop().unwrap().unwrap_class_unicode();
605 let mut cls = self.pop().unwrap().unwrap_class_unicode();
606 if self.flags().case_insensitive() {
607 rhs.try_case_fold_simple().map_err(|_| {
608 self.error(
609 op.rhs.span().clone(),
610 ErrorKind::UnicodeCaseUnavailable,
611 )
612 })?;
613 lhs.try_case_fold_simple().map_err(|_| {
614 self.error(
615 op.lhs.span().clone(),
616 ErrorKind::UnicodeCaseUnavailable,
617 )
618 })?;
619 }
620 match op.kind {
621 Intersection => lhs.intersect(&rhs),
622 Difference => lhs.difference(&rhs),
623 SymmetricDifference => lhs.symmetric_difference(&rhs),
624 }
625 cls.union(&lhs);
626 self.push(HirFrame::ClassUnicode(cls));
627 } else {
628 let mut rhs = self.pop().unwrap().unwrap_class_bytes();
629 let mut lhs = self.pop().unwrap().unwrap_class_bytes();
630 let mut cls = self.pop().unwrap().unwrap_class_bytes();
631 if self.flags().case_insensitive() {
632 rhs.case_fold_simple();
633 lhs.case_fold_simple();
634 }
635 match op.kind {
636 Intersection => lhs.intersect(&rhs),
637 Difference => lhs.difference(&rhs),
638 SymmetricDifference => lhs.symmetric_difference(&rhs),
639 }
640 cls.union(&lhs);
641 self.push(HirFrame::ClassBytes(cls));
642 }
643 Ok(())
644 }
645}
646
/// The internal implementation of a translator.
///
/// This type is responsible for carrying around the original pattern string,
/// which is not tied to the internal state of a translator.
///
/// A TranslatorI exists for the time it takes to translate a single Ast.
#[derive(Clone, Debug)]
struct TranslatorI<'t, 'p> {
    /// The translator whose stack and flags are used during the traversal.
    trans: &'t Translator,
    /// The original pattern string, copied into any error that is produced.
    pattern: &'p str,
}
658
659impl<'t, 'p> TranslatorI<'t, 'p> {
660 /// Build a new internal translator.
661 fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
662 TranslatorI { trans, pattern }
663 }
664
665 /// Return a reference to the underlying translator.
666 fn trans(&self) -> &Translator {
667 &self.trans
668 }
669
670 /// Push the given frame on to the call stack.
671 fn push(&self, frame: HirFrame) {
672 self.trans().stack.borrow_mut().push(frame);
673 }
674
675 /// Push the given literal char on to the call stack.
676 ///
677 /// If the top-most element of the stack is a literal, then the char
678 /// is appended to the end of that literal. Otherwise, a new literal
679 /// containing just the given char is pushed to the top of the stack.
680 fn push_char(&self, ch: char) {
681 let mut buf = [0; 4];
682 let bytes = ch.encode_utf8(&mut buf).as_bytes();
683 let mut stack = self.trans().stack.borrow_mut();
684 if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
685 literal.extend_from_slice(bytes);
686 } else {
687 stack.push(HirFrame::Literal(bytes.to_vec()));
688 }
689 }
690
691 /// Push the given literal byte on to the call stack.
692 ///
693 /// If the top-most element of the stack is a literal, then the byte
694 /// is appended to the end of that literal. Otherwise, a new literal
695 /// containing just the given byte is pushed to the top of the stack.
696 fn push_byte(&self, byte: u8) {
697 let mut stack = self.trans().stack.borrow_mut();
698 if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() {
699 literal.push(byte);
700 } else {
701 stack.push(HirFrame::Literal(vec![byte]));
702 }
703 }
704
705 /// Pop the top of the call stack. If the call stack is empty, return None.
706 fn pop(&self) -> Option<HirFrame> {
707 self.trans().stack.borrow_mut().pop()
708 }
709
710 /// Pop an HIR expression from the top of the stack for a concatenation.
711 ///
712 /// This returns None if the stack is empty or when a concat frame is seen.
713 /// Otherwise, it panics if it could not find an HIR expression.
714 fn pop_concat_expr(&self) -> Option<Hir> {
715 let frame = self.pop()?;
716 match frame {
717 HirFrame::Concat => None,
718 HirFrame::Expr(expr) => Some(expr),
719 HirFrame::Literal(lit) => Some(Hir::literal(lit)),
720 HirFrame::ClassUnicode(_) => {
721 unreachable!("expected expr or concat, got Unicode class")
722 }
723 HirFrame::ClassBytes(_) => {
724 unreachable!("expected expr or concat, got byte class")
725 }
726 HirFrame::Repetition => {
727 unreachable!("expected expr or concat, got repetition")
728 }
729 HirFrame::Group { .. } => {
730 unreachable!("expected expr or concat, got group")
731 }
732 HirFrame::Alternation => {
733 unreachable!("expected expr or concat, got alt marker")
734 }
735 HirFrame::AlternationBranch => {
736 unreachable!("expected expr or concat, got alt branch marker")
737 }
738 }
739 }
740
741 /// Pop an HIR expression from the top of the stack for an alternation.
742 ///
743 /// This returns None if the stack is empty or when an alternation frame is
744 /// seen. Otherwise, it panics if it could not find an HIR expression.
745 fn pop_alt_expr(&self) -> Option<Hir> {
746 let frame = self.pop()?;
747 match frame {
748 HirFrame::Alternation => None,
749 HirFrame::Expr(expr) => Some(expr),
750 HirFrame::Literal(lit) => Some(Hir::literal(lit)),
751 HirFrame::ClassUnicode(_) => {
752 unreachable!("expected expr or alt, got Unicode class")
753 }
754 HirFrame::ClassBytes(_) => {
755 unreachable!("expected expr or alt, got byte class")
756 }
757 HirFrame::Repetition => {
758 unreachable!("expected expr or alt, got repetition")
759 }
760 HirFrame::Group { .. } => {
761 unreachable!("expected expr or alt, got group")
762 }
763 HirFrame::Concat => {
764 unreachable!("expected expr or alt, got concat marker")
765 }
766 HirFrame::AlternationBranch => {
767 unreachable!("expected expr or alt, got alt branch marker")
768 }
769 }
770 }
771
772 /// Create a new error with the given span and error type.
773 fn error(&self, span: Span, kind: ErrorKind) -> Error {
774 Error { kind, pattern: self.pattern.to_string(), span }
775 }
776
777 /// Return a copy of the active flags.
778 fn flags(&self) -> Flags {
779 self.trans().flags.get()
780 }
781
782 /// Set the flags of this translator from the flags set in the given AST.
783 /// Then, return the old flags.
784 fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
785 let old_flags = self.flags();
786 let mut new_flags = Flags::from_ast(ast_flags);
787 new_flags.merge(&old_flags);
788 self.trans().flags.set(new_flags);
789 old_flags
790 }
791
792 /// Convert an Ast literal to its scalar representation.
793 ///
794 /// When Unicode mode is enabled, then this always succeeds and returns a
795 /// `char` (Unicode scalar value).
796 ///
797 /// When Unicode mode is disabled, then a `char` will still be returned
798 /// whenever possible. A byte is returned only when invalid UTF-8 is
799 /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte
800 /// will result in an error when invalid UTF-8 is not allowed.
801 fn ast_literal_to_scalar(
802 &self,
803 lit: &ast::Literal,
804 ) -> Result<Either<char, u8>> {
805 if self.flags().unicode() {
806 return Ok(Either::Left(lit.c));
807 }
808 let byte = match lit.byte() {
809 None => return Ok(Either::Left(lit.c)),
810 Some(byte) => byte,
811 };
812 if byte <= 0x7F {
813 return Ok(Either::Left(char::try_from(byte).unwrap()));
814 }
815 if self.trans().utf8 {
816 return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
817 }
818 Ok(Either::Right(byte))
819 }
820
821 fn case_fold_char(&self, span: Span, c: char) -> Result<Option<Hir>> {
822 if !self.flags().case_insensitive() {
823 return Ok(None);
824 }
825 if self.flags().unicode() {
826 // If case folding won't do anything, then don't bother trying.
827 let map = unicode::SimpleCaseFolder::new()
828 .map(|f| f.overlaps(c, c))
829 .map_err(|_| {
830 self.error(span, ErrorKind::UnicodeCaseUnavailable)
831 })?;
832 if !map {
833 return Ok(None);
834 }
835 let mut cls =
836 hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new(
837 c, c,
838 )]);
839 cls.try_case_fold_simple().map_err(|_| {
840 self.error(span, ErrorKind::UnicodeCaseUnavailable)
841 })?;
842 Ok(Some(Hir::class(hir::Class::Unicode(cls))))
843 } else {
844 if c.len_utf8() > 1 {
845 return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
846 }
847 // If case folding won't do anything, then don't bother trying.
848 match c {
849 'A'..='Z' | 'a'..='z' => {}
850 _ => return Ok(None),
851 }
852 let mut cls =
853 hir::ClassBytes::new(vec![hir::ClassBytesRange::new(
854 // OK because 'c.len_utf8() == 1' which in turn implies
855 // that 'c' is ASCII.
856 u8::try_from(c).unwrap(),
857 u8::try_from(c).unwrap(),
858 )]);
859 cls.case_fold_simple();
860 Ok(Some(Hir::class(hir::Class::Bytes(cls))))
861 }
862 }
863
864 fn hir_dot(&self, span: Span) -> Result<Hir> {
865 if !self.flags().unicode() && self.trans().utf8 {
866 return Err(self.error(span, ErrorKind::InvalidUtf8));
867 }
868 Ok(Hir::dot(self.flags().dot()))
869 }
870
871 fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
872 let unicode = self.flags().unicode();
873 let multi_line = self.flags().multi_line();
874 let crlf = self.flags().crlf();
875 Ok(match asst.kind {
876 ast::AssertionKind::StartLine => Hir::look(if multi_line {
877 if crlf {
878 hir::Look::StartCRLF
879 } else {
880 hir::Look::StartLF
881 }
882 } else {
883 hir::Look::Start
884 }),
885 ast::AssertionKind::EndLine => Hir::look(if multi_line {
886 if crlf {
887 hir::Look::EndCRLF
888 } else {
889 hir::Look::EndLF
890 }
891 } else {
892 hir::Look::End
893 }),
894 ast::AssertionKind::StartText => Hir::look(hir::Look::Start),
895 ast::AssertionKind::EndText => Hir::look(hir::Look::End),
896 ast::AssertionKind::WordBoundary => Hir::look(if unicode {
897 hir::Look::WordUnicode
898 } else {
899 hir::Look::WordAscii
900 }),
901 ast::AssertionKind::NotWordBoundary => Hir::look(if unicode {
902 hir::Look::WordUnicodeNegate
903 } else {
904 hir::Look::WordAsciiNegate
905 }),
906 })
907 }
908
909 fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir {
910 let (index, name) = match group.kind {
911 ast::GroupKind::CaptureIndex(index) => (index, None),
912 ast::GroupKind::CaptureName { ref name, .. } => {
913 (name.index, Some(name.name.clone().into_boxed_str()))
914 }
915 // The HIR doesn't need to use non-capturing groups, since the way
916 // in which the data type is defined handles this automatically.
917 ast::GroupKind::NonCapturing(_) => return expr,
918 };
919 Hir::capture(hir::Capture { index, name, sub: Box::new(expr) })
920 }
921
922 fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
923 let (min, max) = match rep.op.kind {
924 ast::RepetitionKind::ZeroOrOne => (0, Some(1)),
925 ast::RepetitionKind::ZeroOrMore => (0, None),
926 ast::RepetitionKind::OneOrMore => (1, None),
927 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
928 (m, Some(m))
929 }
930 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
931 (m, None)
932 }
933 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(
934 m,
935 n,
936 )) => (m, Some(n)),
937 };
938 let greedy =
939 if self.flags().swap_greed() { !rep.greedy } else { rep.greedy };
940 Hir::repetition(hir::Repetition {
941 min,
942 max,
943 greedy,
944 sub: Box::new(expr),
945 })
946 }
947
948 fn hir_unicode_class(
949 &self,
950 ast_class: &ast::ClassUnicode,
951 ) -> Result<hir::ClassUnicode> {
952 use crate::ast::ClassUnicodeKind::*;
953
954 if !self.flags().unicode() {
955 return Err(
956 self.error(ast_class.span, ErrorKind::UnicodeNotAllowed)
957 );
958 }
959 let query = match ast_class.kind {
960 OneLetter(name) => ClassQuery::OneLetter(name),
961 Named(ref name) => ClassQuery::Binary(name),
962 NamedValue { ref name, ref value, .. } => ClassQuery::ByValue {
963 property_name: name,
964 property_value: value,
965 },
966 };
967 let mut result = self.convert_unicode_class_error(
968 &ast_class.span,
969 unicode::class(query),
970 );
971 if let Ok(ref mut class) = result {
972 self.unicode_fold_and_negate(
973 &ast_class.span,
974 ast_class.negated,
975 class,
976 )?;
977 }
978 result
979 }
980
981 fn hir_ascii_unicode_class(
982 &self,
983 ast: &ast::ClassAscii,
984 ) -> Result<hir::ClassUnicode> {
985 let mut cls = hir::ClassUnicode::new(
986 ascii_class_as_chars(&ast.kind)
987 .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
988 );
989 self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
990 Ok(cls)
991 }
992
993 fn hir_ascii_byte_class(
994 &self,
995 ast: &ast::ClassAscii,
996 ) -> Result<hir::ClassBytes> {
997 let mut cls = hir::ClassBytes::new(
998 ascii_class(&ast.kind)
999 .map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1000 );
1001 self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?;
1002 Ok(cls)
1003 }
1004
1005 fn hir_perl_unicode_class(
1006 &self,
1007 ast_class: &ast::ClassPerl,
1008 ) -> Result<hir::ClassUnicode> {
1009 use crate::ast::ClassPerlKind::*;
1010
1011 assert!(self.flags().unicode());
1012 let result = match ast_class.kind {
1013 Digit => unicode::perl_digit(),
1014 Space => unicode::perl_space(),
1015 Word => unicode::perl_word(),
1016 };
1017 let mut class =
1018 self.convert_unicode_class_error(&ast_class.span, result)?;
1019 // We needn't apply case folding here because the Perl Unicode classes
1020 // are already closed under Unicode simple case folding.
1021 if ast_class.negated {
1022 class.negate();
1023 }
1024 Ok(class)
1025 }
1026
1027 fn hir_perl_byte_class(
1028 &self,
1029 ast_class: &ast::ClassPerl,
1030 ) -> Result<hir::ClassBytes> {
1031 use crate::ast::ClassPerlKind::*;
1032
1033 assert!(!self.flags().unicode());
1034 let mut class = match ast_class.kind {
1035 Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
1036 Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
1037 Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
1038 };
1039 // We needn't apply case folding here because the Perl ASCII classes
1040 // are already closed (under ASCII case folding).
1041 if ast_class.negated {
1042 class.negate();
1043 }
1044 // Negating a Perl byte class is likely to cause it to match invalid
1045 // UTF-8. That's only OK if the translator is configured to allow such
1046 // things.
1047 if self.trans().utf8 && !class.is_ascii() {
1048 return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8));
1049 }
1050 Ok(class)
1051 }
1052
1053 /// Converts the given Unicode specific error to an HIR translation error.
1054 ///
1055 /// The span given should approximate the position at which an error would
1056 /// occur.
1057 fn convert_unicode_class_error(
1058 &self,
1059 span: &Span,
1060 result: core::result::Result<hir::ClassUnicode, unicode::Error>,
1061 ) -> Result<hir::ClassUnicode> {
1062 result.map_err(|err| {
1063 let sp = span.clone();
1064 match err {
1065 unicode::Error::PropertyNotFound => {
1066 self.error(sp, ErrorKind::UnicodePropertyNotFound)
1067 }
1068 unicode::Error::PropertyValueNotFound => {
1069 self.error(sp, ErrorKind::UnicodePropertyValueNotFound)
1070 }
1071 unicode::Error::PerlClassNotFound => {
1072 self.error(sp, ErrorKind::UnicodePerlClassNotFound)
1073 }
1074 }
1075 })
1076 }
1077
1078 fn unicode_fold_and_negate(
1079 &self,
1080 span: &Span,
1081 negated: bool,
1082 class: &mut hir::ClassUnicode,
1083 ) -> Result<()> {
1084 // Note that we must apply case folding before negation!
1085 // Consider `(?i)[^x]`. If we applied negation first, then
1086 // the result would be the character class that matched any
1087 // Unicode scalar value.
1088 if self.flags().case_insensitive() {
1089 class.try_case_fold_simple().map_err(|_| {
1090 self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable)
1091 })?;
1092 }
1093 if negated {
1094 class.negate();
1095 }
1096 Ok(())
1097 }
1098
1099 fn bytes_fold_and_negate(
1100 &self,
1101 span: &Span,
1102 negated: bool,
1103 class: &mut hir::ClassBytes,
1104 ) -> Result<()> {
1105 // Note that we must apply case folding before negation!
1106 // Consider `(?i)[^x]`. If we applied negation first, then
1107 // the result would be the character class that matched any
1108 // Unicode scalar value.
1109 if self.flags().case_insensitive() {
1110 class.case_fold_simple();
1111 }
1112 if negated {
1113 class.negate();
1114 }
1115 if self.trans().utf8 && !class.is_ascii() {
1116 return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
1117 }
1118 Ok(())
1119 }
1120
1121 /// Return a scalar byte value suitable for use as a literal in a byte
1122 /// character class.
1123 fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
1124 match self.ast_literal_to_scalar(ast)? {
1125 Either::Right(byte) => Ok(byte),
1126 Either::Left(ch) => {
1127 let cp = u32::from(ch);
1128 if cp <= 0x7F {
1129 Ok(u8::try_from(cp).unwrap())
1130 } else {
1131 // We can't feasibly support Unicode in
1132 // byte oriented classes. Byte classes don't
1133 // do Unicode case folding.
1134 Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
1135 }
1136 }
1137 }
1138 }
1139}
1140
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent, present but disabled or
/// present but enabled.
///
/// These states are encoded as `None`, `Some(false)` and `Some(true)`
/// respectively. The accessor methods on this type resolve an absent flag
/// to its default.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    /// The case insensitive flag. Treated as disabled when absent.
    case_insensitive: Option<bool>,
    /// The multi-line flag. Treated as disabled when absent.
    multi_line: Option<bool>,
    /// The "dot matches new line" flag. Treated as disabled when absent.
    dot_matches_new_line: Option<bool>,
    /// The swap-greed flag. Treated as disabled when absent.
    swap_greed: Option<bool>,
    /// The Unicode flag. Unlike the others, treated as enabled when absent.
    unicode: Option<bool>,
    /// The CRLF flag. Treated as disabled when absent.
    crlf: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
1157
1158impl Flags {
1159 fn from_ast(ast: &ast::Flags) -> Flags {
1160 let mut flags = Flags::default();
1161 let mut enable = true;
1162 for item in &ast.items {
1163 match item.kind {
1164 ast::FlagsItemKind::Negation => {
1165 enable = false;
1166 }
1167 ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
1168 flags.case_insensitive = Some(enable);
1169 }
1170 ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
1171 flags.multi_line = Some(enable);
1172 }
1173 ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
1174 flags.dot_matches_new_line = Some(enable);
1175 }
1176 ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
1177 flags.swap_greed = Some(enable);
1178 }
1179 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
1180 flags.unicode = Some(enable);
1181 }
1182 ast::FlagsItemKind::Flag(ast::Flag::CRLF) => {
1183 flags.crlf = Some(enable);
1184 }
1185 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
1186 }
1187 }
1188 flags
1189 }
1190
1191 fn merge(&mut self, previous: &Flags) {
1192 if self.case_insensitive.is_none() {
1193 self.case_insensitive = previous.case_insensitive;
1194 }
1195 if self.multi_line.is_none() {
1196 self.multi_line = previous.multi_line;
1197 }
1198 if self.dot_matches_new_line.is_none() {
1199 self.dot_matches_new_line = previous.dot_matches_new_line;
1200 }
1201 if self.swap_greed.is_none() {
1202 self.swap_greed = previous.swap_greed;
1203 }
1204 if self.unicode.is_none() {
1205 self.unicode = previous.unicode;
1206 }
1207 if self.crlf.is_none() {
1208 self.crlf = previous.crlf;
1209 }
1210 }
1211
1212 fn dot(&self) -> hir::Dot {
1213 if self.dot_matches_new_line() {
1214 if self.unicode() {
1215 hir::Dot::AnyChar
1216 } else {
1217 hir::Dot::AnyByte
1218 }
1219 } else {
1220 if self.unicode() {
1221 if self.crlf() {
1222 hir::Dot::AnyCharExceptCRLF
1223 } else {
1224 hir::Dot::AnyCharExceptLF
1225 }
1226 } else {
1227 if self.crlf() {
1228 hir::Dot::AnyByteExceptCRLF
1229 } else {
1230 hir::Dot::AnyByteExceptLF
1231 }
1232 }
1233 }
1234 }
1235
1236 fn case_insensitive(&self) -> bool {
1237 self.case_insensitive.unwrap_or(false)
1238 }
1239
1240 fn multi_line(&self) -> bool {
1241 self.multi_line.unwrap_or(false)
1242 }
1243
1244 fn dot_matches_new_line(&self) -> bool {
1245 self.dot_matches_new_line.unwrap_or(false)
1246 }
1247
1248 fn swap_greed(&self) -> bool {
1249 self.swap_greed.unwrap_or(false)
1250 }
1251
1252 fn unicode(&self) -> bool {
1253 self.unicode.unwrap_or(true)
1254 }
1255
1256 fn crlf(&self) -> bool {
1257 self.crlf.unwrap_or(false)
1258 }
1259}
1260
1261fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1262 let ranges: Vec<_> = ascii_classimpl Iterator(kind)
1263 .map(|(s: u8, e: u8)| hir::ClassBytesRange::new(start:s, end:e))
1264 .collect();
1265 hir::ClassBytes::new(ranges)
1266}
1267
1268fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator<Item = (u8, u8)> {
1269 use crate::ast::ClassAsciiKind::*;
1270
1271 let slice: &'static [(u8, u8)] = match *kind {
1272 Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
1273 Alpha => &[(b'A', b'Z'), (b'a', b'z')],
1274 Ascii => &[(b'\x00', b'\x7F')],
1275 Blank => &[(b'\t', b'\t'), (b' ', b' ')],
1276 Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
1277 Digit => &[(b'0', b'9')],
1278 Graph => &[(b'!', b'~')],
1279 Lower => &[(b'a', b'z')],
1280 Print => &[(b' ', b'~')],
1281 Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
1282 Space => &[
1283 (b'\t', b'\t'),
1284 (b'\n', b'\n'),
1285 (b'\x0B', b'\x0B'),
1286 (b'\x0C', b'\x0C'),
1287 (b'\r', b'\r'),
1288 (b' ', b' '),
1289 ],
1290 Upper => &[(b'A', b'Z')],
1291 Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
1292 Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
1293 };
1294 slice.iter().copied()
1295}
1296
1297fn ascii_class_as_chars(
1298 kind: &ast::ClassAsciiKind,
1299) -> impl Iterator<Item = (char, char)> {
1300 ascii_class(kind).map(|(s: u8, e: u8)| (char::from(s), char::from(e)))
1301}
1302
1303#[cfg(test)]
1304mod tests {
1305 use crate::{
1306 ast::{self, parse::ParserBuilder, Ast, Position, Span},
1307 hir::{self, Hir, HirKind, Look, Properties},
1308 unicode::{self, ClassQuery},
1309 };
1310
1311 use super::*;
1312
1313 // We create these errors to compare with real hir::Errors in the tests.
1314 // We define equality between TestError and hir::Error to disregard the
1315 // pattern string in hir::Error, which is annoying to provide in tests.
1316 #[derive(Clone, Debug)]
1317 struct TestError {
1318 span: Span,
1319 kind: hir::ErrorKind,
1320 }
1321
1322 impl PartialEq<hir::Error> for TestError {
1323 fn eq(&self, other: &hir::Error) -> bool {
1324 self.span == other.span && self.kind == other.kind
1325 }
1326 }
1327
1328 impl PartialEq<TestError> for hir::Error {
1329 fn eq(&self, other: &TestError) -> bool {
1330 self.span == other.span && self.kind == other.kind
1331 }
1332 }
1333
1334 fn parse(pattern: &str) -> Ast {
1335 ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
1336 }
1337
1338 fn t(pattern: &str) -> Hir {
1339 TranslatorBuilder::new()
1340 .utf8(true)
1341 .build()
1342 .translate(pattern, &parse(pattern))
1343 .unwrap()
1344 }
1345
1346 fn t_err(pattern: &str) -> hir::Error {
1347 TranslatorBuilder::new()
1348 .utf8(true)
1349 .build()
1350 .translate(pattern, &parse(pattern))
1351 .unwrap_err()
1352 }
1353
1354 fn t_bytes(pattern: &str) -> Hir {
1355 TranslatorBuilder::new()
1356 .utf8(false)
1357 .build()
1358 .translate(pattern, &parse(pattern))
1359 .unwrap()
1360 }
1361
1362 fn props(pattern: &str) -> Properties {
1363 t(pattern).properties().clone()
1364 }
1365
1366 fn props_bytes(pattern: &str) -> Properties {
1367 t_bytes(pattern).properties().clone()
1368 }
1369
1370 fn hir_lit(s: &str) -> Hir {
1371 hir_blit(s.as_bytes())
1372 }
1373
/// Returns the HIR literal for the bytes `s`, as built by `Hir::literal`.
fn hir_blit(s: &[u8]) -> Hir {
    Hir::literal(s)
}
1377
1378 fn hir_capture(index: u32, expr: Hir) -> Hir {
1379 Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) })
1380 }
1381
1382 fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir {
1383 Hir::capture(hir::Capture {
1384 index,
1385 name: Some(name.into()),
1386 sub: Box::new(expr),
1387 })
1388 }
1389
1390 fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1391 Hir::repetition(hir::Repetition {
1392 min: 0,
1393 max: Some(1),
1394 greedy,
1395 sub: Box::new(expr),
1396 })
1397 }
1398
1399 fn hir_star(greedy: bool, expr: Hir) -> Hir {
1400 Hir::repetition(hir::Repetition {
1401 min: 0,
1402 max: None,
1403 greedy,
1404 sub: Box::new(expr),
1405 })
1406 }
1407
1408 fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1409 Hir::repetition(hir::Repetition {
1410 min: 1,
1411 max: None,
1412 greedy,
1413 sub: Box::new(expr),
1414 })
1415 }
1416
1417 fn hir_range(greedy: bool, min: u32, max: Option<u32>, expr: Hir) -> Hir {
1418 Hir::repetition(hir::Repetition {
1419 min,
1420 max,
1421 greedy,
1422 sub: Box::new(expr),
1423 })
1424 }
1425
/// Returns the alternation of the given expressions, as built by
/// `Hir::alternation`.
fn hir_alt(alts: Vec<Hir>) -> Hir {
    Hir::alternation(alts)
}
1429
/// Returns the concatenation of the given expressions, as built by
/// `Hir::concat`.
fn hir_cat(exprs: Vec<Hir>) -> Hir {
    Hir::concat(exprs)
}
1433
1434 #[allow(dead_code)]
1435 fn hir_uclass_query(query: ClassQuery<'_>) -> Hir {
1436 Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1437 }
1438
1439 #[allow(dead_code)]
1440 fn hir_uclass_perl_word() -> Hir {
1441 Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap()))
1442 }
1443
1444 fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir {
1445 Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(
1446 ascii_class_as_chars(kind)
1447 .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)),
1448 )))
1449 }
1450
1451 fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir {
1452 Hir::class(hir::Class::Bytes(hir::ClassBytes::new(
1453 ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)),
1454 )))
1455 }
1456
1457 fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1458 Hir::class(uclass(ranges))
1459 }
1460
1461 fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1462 Hir::class(bclass(ranges))
1463 }
1464
1465 fn hir_case_fold(expr: Hir) -> Hir {
1466 match expr.into_kind() {
1467 HirKind::Class(mut cls) => {
1468 cls.case_fold_simple();
1469 Hir::class(cls)
1470 }
1471 _ => panic!("cannot case fold non-class Hir expr"),
1472 }
1473 }
1474
1475 fn hir_negate(expr: Hir) -> Hir {
1476 match expr.into_kind() {
1477 HirKind::Class(mut cls) => {
1478 cls.negate();
1479 Hir::class(cls)
1480 }
1481 _ => panic!("cannot negate non-class Hir expr"),
1482 }
1483 }
1484
1485 fn uclass(ranges: &[(char, char)]) -> hir::Class {
1486 let ranges: Vec<hir::ClassUnicodeRange> = ranges
1487 .iter()
1488 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1489 .collect();
1490 hir::Class::Unicode(hir::ClassUnicode::new(ranges))
1491 }
1492
1493 fn bclass(ranges: &[(u8, u8)]) -> hir::Class {
1494 let ranges: Vec<hir::ClassBytesRange> = ranges
1495 .iter()
1496 .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1497 .collect();
1498 hir::Class::Bytes(hir::ClassBytes::new(ranges))
1499 }
1500
/// Applies simple case folding to `cls` and wraps the result as HIR.
///
/// Only compiled when the `unicode-case` feature is enabled.
#[cfg(feature = "unicode-case")]
fn class_case_fold(mut cls: hir::Class) -> Hir {
    cls.case_fold_simple();
    Hir::class(cls)
}
1506
/// Negates `cls` and wraps the result as HIR.
fn class_negate(mut cls: hir::Class) -> Hir {
    cls.negate();
    Hir::class(cls)
}
1511
1512 #[allow(dead_code)]
1513 fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1514 use crate::hir::Class::{Bytes, Unicode};
1515
1516 match (expr1.into_kind(), expr2.into_kind()) {
1517 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1518 c1.union(&c2);
1519 Hir::class(hir::Class::Unicode(c1))
1520 }
1521 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1522 c1.union(&c2);
1523 Hir::class(hir::Class::Bytes(c1))
1524 }
1525 _ => panic!("cannot union non-class Hir exprs"),
1526 }
1527 }
1528
1529 #[allow(dead_code)]
1530 fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1531 use crate::hir::Class::{Bytes, Unicode};
1532
1533 match (expr1.into_kind(), expr2.into_kind()) {
1534 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => {
1535 c1.difference(&c2);
1536 Hir::class(hir::Class::Unicode(c1))
1537 }
1538 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => {
1539 c1.difference(&c2);
1540 Hir::class(hir::Class::Bytes(c1))
1541 }
1542 _ => panic!("cannot difference non-class Hir exprs"),
1543 }
1544 }
1545
/// Returns the HIR for the given look-around assertion, as built by
/// `Hir::look`.
fn hir_look(look: hir::Look) -> Hir {
    Hir::look(look)
}
1549
// Checks the translation of patterns containing empty sub-expressions.
#[test]
fn empty() {
    // Shorthand for the HIR of an empty expression.
    let e = Hir::empty;

    assert_eq!(t(""), e());
    assert_eq!(t("(?i)"), e());
    assert_eq!(t("()"), hir_capture(1, e()));
    assert_eq!(t("(?:)"), e());
    assert_eq!(t("(?P<wat>)"), hir_capture_name(1, "wat", e()));
    assert_eq!(t("|"), hir_alt(vec![e(), e()]));
    assert_eq!(
        t("()|()"),
        hir_alt(vec![hir_capture(1, e()), hir_capture(2, e())])
    );
    assert_eq!(t("(|b)"), hir_capture(1, hir_alt(vec![e(), hir_lit("b")])));
    assert_eq!(t("(a|)"), hir_capture(1, hir_alt(vec![hir_lit("a"), e()])));
    assert_eq!(
        t("(a||c)"),
        hir_capture(1, hir_alt(vec![hir_lit("a"), e(), hir_lit("c")]))
    );
    assert_eq!(t("(||)"), hir_capture(1, hir_alt(vec![e(), e(), e()])));
}
1588
// Checks the translation of literals, both with and without the
// translator's `utf8` option.
#[test]
fn literal() {
    let a = hir_lit("a");

    assert_eq!(t("a"), a);
    assert_eq!(t("(?-u)a"), a);
    assert_eq!(t("☃"), hir_lit("☃"));
    assert_eq!(t("abcd"), hir_lit("abcd"));

    assert_eq!(t_bytes("(?-u)a"), a);
    // `\x61` is first written as a Rust string escape and then, in the raw
    // string, as a regex escape.
    assert_eq!(t_bytes("(?-u)\x61"), a);
    assert_eq!(t_bytes(r"(?-u)\x61"), a);
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    let unicode_not_allowed = TestError {
        kind: hir::ErrorKind::UnicodeNotAllowed,
        span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 7)),
    };
    assert_eq!(t_err("(?-u)☃"), unicode_not_allowed);

    let invalid_utf8 = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)),
    };
    assert_eq!(t_err(r"(?-u)\xFF"), invalid_utf8);
}
1622
// Checks the translation of literals under the case insensitive flag.
#[test]
fn literal_case_insensitive() {
    // Expected class for a case insensitive `a` in Unicode mode.
    #[cfg(feature = "unicode-case")]
    let a_chars = || hir_uclass(&[('A', 'A'), ('a', 'a')]);
    // Expected class for a case insensitive `a` in byte mode.
    let a_bytes = || hir_bclass(&[(b'A', b'A'), (b'a', b'a')]);

    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)a"), a_chars());
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i:a)"), a_chars());
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("a(?i)a(?-i)a"),
        hir_cat(vec![hir_lit("a"), a_chars(), hir_lit("a")])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)ab@c"),
        hir_cat(vec![
            a_chars(),
            hir_uclass(&[('B', 'B'), ('b', 'b')]),
            hir_lit("@"),
            hir_uclass(&[('C', 'C'), ('c', 'c')]),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)β"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')])
    );

    assert_eq!(t("(?i-u)a"), a_bytes());
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![hir_lit("a"), a_bytes(), hir_lit("a")])
    );
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            a_bytes(),
            hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
            hir_lit("@"),
            hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
        ])
    );

    assert_eq!(t_bytes("(?i-u)a"), a_bytes());
    assert_eq!(t_bytes("(?i-u)\x61"), a_bytes());
    assert_eq!(t_bytes(r"(?i-u)\x61"), a_bytes());
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    let unicode_not_allowed = TestError {
        kind: hir::ErrorKind::UnicodeNotAllowed,
        span: Span::new(Position::new(6, 1, 7), Position::new(8, 1, 8)),
    };
    assert_eq!(t_err("(?i-u)β"), unicode_not_allowed);
}
1699
// Checks the translation of `.` under the `s`, `R` and `u` flags.
#[test]
fn dot() {
    let any_char = || hir_uclass(&[('\0', '\u{10FFFF}')]);
    let any_byte = || hir_bclass(&[(b'\0', b'\xFF')]);
    // The error expected for a `.` found at the given byte offset of a
    // single-line ASCII pattern (so the column is always the offset + 1).
    let invalid_utf8_at = |offset: usize| TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(
            Position::new(offset, 1, offset + 1),
            Position::new(offset + 1, 1, offset + 2),
        ),
    };

    assert_eq!(
        t("."),
        hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')])
    );
    assert_eq!(
        t("(?R)."),
        hir_uclass(&[
            ('\0', '\t'),
            ('\x0B', '\x0C'),
            ('\x0E', '\u{10FFFF}'),
        ])
    );
    assert_eq!(t("(?s)."), any_char());
    assert_eq!(t("(?Rs)."), any_char());
    assert_eq!(
        t_bytes("(?-u)."),
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')])
    );
    assert_eq!(
        t_bytes("(?R-u)."),
        hir_bclass(&[
            (b'\0', b'\t'),
            (b'\x0B', b'\x0C'),
            (b'\x0E', b'\xFF'),
        ])
    );
    assert_eq!(t_bytes("(?s-u)."), any_byte());
    assert_eq!(t_bytes("(?Rs-u)."), any_byte());

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    assert_eq!(t_err("(?-u)."), invalid_utf8_at(5));
    assert_eq!(t_err("(?R-u)."), invalid_utf8_at(6));
    assert_eq!(t_err("(?s-u)."), invalid_utf8_at(6));
    assert_eq!(t_err("(?Rs-u)."), invalid_utf8_at(7));
}
1773
// Checks that each anchor and word boundary pattern translates to the
// expected look-around assertion.
#[test]
fn assertions() {
    let cases = [
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        (r"\b", hir::Look::WordUnicode),
        (r"\B", hir::Look::WordUnicodeNegate),
        (r"(?-u)\b", hir::Look::WordAscii),
        (r"(?-u)\B", hir::Look::WordAsciiNegate),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look), "pattern: {:?}", pattern);
    }
}
1790
// Checks the translation of capturing, named and non-capturing groups.
#[test]
fn group() {
    let a = || hir_lit("a");
    let b = || hir_lit("b");
    let c = || hir_lit("c");
    let e = Hir::empty;

    assert_eq!(t("(a)"), hir_capture(1, a()));
    assert_eq!(
        t("(a)(b)"),
        hir_cat(vec![hir_capture(1, a()), hir_capture(2, b())])
    );
    assert_eq!(
        t("(a)|(b)"),
        hir_alt(vec![hir_capture(1, a()), hir_capture(2, b())])
    );
    assert_eq!(t("(?P<foo>)"), hir_capture_name(1, "foo", e()));
    assert_eq!(t("(?P<foo>a)"), hir_capture_name(1, "foo", a()));
    assert_eq!(
        t("(?P<foo>a)(?P<bar>b)"),
        hir_cat(vec![
            hir_capture_name(1, "foo", a()),
            hir_capture_name(2, "bar", b()),
        ])
    );
    assert_eq!(t("(?:)"), e());
    assert_eq!(t("(?:a)"), a());
    assert_eq!(t("(?:a)(b)"), hir_cat(vec![a(), hir_capture(1, b())]));
    assert_eq!(
        t("(a)(?:b)(c)"),
        hir_cat(vec![hir_capture(1, a()), b(), hir_capture(2, c())])
    );
    assert_eq!(
        t("(a)(?P<foo>b)(c)"),
        hir_cat(vec![
            hir_capture(1, a()),
            hir_capture_name(2, "foo", b()),
            hir_capture(3, c()),
        ])
    );
    assert_eq!(t("()"), hir_capture(1, e()));
    assert_eq!(t("((?i))"), hir_capture(1, e()));
    assert_eq!(t("((?x))"), hir_capture(1, e()));
    assert_eq!(t("(((?x)))"), hir_capture(1, hir_capture(2, e())));
}
1847
// Checks the translation of anchors under every combination of the `m` and
// `R` flags.
#[test]
fn line_anchors() {
    let cases = [
        // No flags.
        ("^", hir::Look::Start),
        ("$", hir::Look::End),
        (r"\A", hir::Look::Start),
        (r"\z", hir::Look::End),
        // Multi-line only.
        (r"(?m)\A", hir::Look::Start),
        (r"(?m)\z", hir::Look::End),
        ("(?m)^", hir::Look::StartLF),
        ("(?m)$", hir::Look::EndLF),
        // CRLF only.
        (r"(?R)\A", hir::Look::Start),
        (r"(?R)\z", hir::Look::End),
        ("(?R)^", hir::Look::Start),
        ("(?R)$", hir::Look::End),
        // CRLF and multi-line.
        (r"(?Rm)\A", hir::Look::Start),
        (r"(?Rm)\z", hir::Look::End),
        ("(?Rm)^", hir::Look::StartCRLF),
        ("(?Rm)$", hir::Look::EndCRLF),
    ];
    for (pattern, look) in cases {
        assert_eq!(t(pattern), hir_look(look), "pattern: {:?}", pattern);
    }
}
1870
// Checks how flags are scoped: within groups, across groups and when they
// are toggled in the middle of a concatenation.
#[test]
fn flags() {
    // Expected class for a case insensitive `a` in Unicode mode.
    #[cfg(feature = "unicode-case")]
    let a_chars = || hir_uclass(&[('A', 'A'), ('a', 'a')]);
    // Expected class for a case insensitive `a` in byte mode.
    let a_bytes = || hir_bclass(&[(b'A', b'A'), (b'a', b'a')]);
    let a = || hir_lit("a");

    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i:a)a"), hir_cat(vec![a_chars(), a()]));
    assert_eq!(t("(?i-u:a)β"), hir_cat(vec![a_bytes(), hir_lit("β")]));
    assert_eq!(t("(?:(?i-u)a)b"), hir_cat(vec![a_bytes(), hir_lit("b")]));
    assert_eq!(
        t("((?i-u)a)b"),
        hir_cat(vec![hir_capture(1, a_bytes()), hir_lit("b")])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)(?-i:a)a"), hir_cat(vec![a(), a_chars()]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![a_chars(), hir_look(hir::Look::StartLF)])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            a_chars(),
            hir_look(hir::Look::StartLF),
            a_chars(),
            hir_look(hir::Look::Start),
        ])
    );
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            hir_star(false, a()),
            hir_star(true, a()),
            hir_star(true, a()),
            hir_star(false, a()),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![hir_cat(vec![a(), a_chars()]), a()])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![hir_cat(vec![a_chars(), a()]), a_chars()])
    );
}
1958
// Checks that escaped meta characters translate to one plain literal.
#[test]
fn escape() {
    let pattern = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
    let unescaped = r"\.+*?()|[]{}^$#";
    assert_eq!(t(pattern), hir_lit(unescaped));
}
1966
// Checks the translation of every repetition operator, in both its greedy
// and non-greedy form, and how the operators bind to their operand.
#[test]
fn repetition() {
    let a = || hir_lit("a");
    let b = || hir_lit("b");

    assert_eq!(t("a?"), hir_quest(true, a()));
    assert_eq!(t("a*"), hir_star(true, a()));
    assert_eq!(t("a+"), hir_plus(true, a()));
    assert_eq!(t("a??"), hir_quest(false, a()));
    assert_eq!(t("a*?"), hir_star(false, a()));
    assert_eq!(t("a+?"), hir_plus(false, a()));

    assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), a()));
    assert_eq!(t("a{1,}"), hir_range(true, 1, None, a()));
    assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), a()));
    assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), a()));
    assert_eq!(t("a{1,}?"), hir_range(false, 1, None, a()));
    assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), a()));

    assert_eq!(t("ab?"), hir_cat(vec![a(), hir_quest(true, b())]));
    assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
    assert_eq!(t("a|b?"), hir_alt(vec![a(), hir_quest(true, b())]));
}
1993
// Checks how concatenations and alternations nest, using look-around
// assertions as the operands.
#[test]
fn cat_alt() {
    let start = || hir_look(hir::Look::Start);
    let end = || hir_look(hir::Look::End);
    let word = || hir_look(hir::Look::WordUnicode);
    let not_word = || hir_look(hir::Look::WordUnicodeNegate);
    // The concatenations expected for `^$`, `$\b` and `\b\B`.
    let start_end = || hir_cat(vec![start(), end()]);
    let end_word = || hir_cat(vec![end(), word()]);
    let word_not_word = || hir_cat(vec![word(), not_word()]);

    assert_eq!(t("(^$)"), hir_capture(1, start_end()));
    assert_eq!(t("^|$"), hir_alt(vec![start(), end()]));
    assert_eq!(t(r"^|$|\b"), hir_alt(vec![start(), end(), word()]));
    assert_eq!(
        t(r"^$|$\b|\b\B"),
        hir_alt(vec![start_end(), end_word(), word_not_word()])
    );
    assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![start(), end()])));
    assert_eq!(
        t(r"(^|$|\b)"),
        hir_capture(1, hir_alt(vec![start(), end(), word()]))
    );
    assert_eq!(
        t(r"(^$|$\b|\b\B)"),
        hir_capture(
            1,
            hir_alt(vec![start_end(), end_word(), word_not_word()])
        )
    );
    assert_eq!(
        t(r"(^$|($\b|(\b\B)))"),
        hir_capture(
            1,
            hir_alt(vec![
                start_end(),
                hir_capture(
                    2,
                    hir_alt(vec![
                        end_word(),
                        hir_capture(3, word_not_word()),
                    ])
                ),
            ])
        )
    );
}
2045
// Tests that an alternation consisting only of classes, such as
// '[a-z]|[A-Z]', is translated to the single class '[A-Za-z]', i.e., the
// union of its branches. The exception is an alternation that mixes byte
// classes matching invalid UTF-8 with non-ASCII Unicode classes, which
// stays an alternation.
#[test]
fn cat_class_flattened() {
    let ascii_letters = hir_uclass(&[('A', 'Z'), ('a', 'z')]);
    assert_eq!(t(r"[a-z]|[A-Z]"), ascii_letters);

    // Combining all of the letter properties should give us the one giant
    // letter property.
    #[cfg(feature = "unicode-gencat")]
    {
        let all_letter_kinds = r"(?x)
            \p{Lowercase_Letter}
            |\p{Uppercase_Letter}
            |\p{Titlecase_Letter}
            |\p{Modifier_Letter}
            |\p{Other_Letter}
        ";
        assert_eq!(
            t(all_letter_kinds),
            hir_uclass_query(ClassQuery::Binary("letter"))
        );
    }

    // Byte classes that can truly match invalid UTF-8 cannot be combined
    // with Unicode classes.
    let mixed = t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]");
    assert_eq!(
        mixed,
        hir_alt(vec![
            hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
            hir_bclass(&[(b'\x90', b'\xFF')]),
            hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
        ])
    );

    // Byte classes on their own can be combined, even if some are ASCII
    // and others are invalid UTF-8.
    let bytes_only = t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]");
    assert_eq!(
        bytes_only,
        hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
    );
}
2084
// Checks the translation of every ASCII class, plus negation, case folding
// and the errors reported when a byte class is not ASCII-only.
#[test]
fn class_ascii() {
    use crate::ast::ClassAsciiKind as Kind;

    let cases = [
        ("[[:alnum:]]", Kind::Alnum),
        ("[[:alpha:]]", Kind::Alpha),
        ("[[:ascii:]]", Kind::Ascii),
        ("[[:blank:]]", Kind::Blank),
        ("[[:cntrl:]]", Kind::Cntrl),
        ("[[:digit:]]", Kind::Digit),
        ("[[:graph:]]", Kind::Graph),
        ("[[:lower:]]", Kind::Lower),
        ("[[:print:]]", Kind::Print),
        ("[[:punct:]]", Kind::Punct),
        ("[[:space:]]", Kind::Space),
        ("[[:upper:]]", Kind::Upper),
        ("[[:word:]]", Kind::Word),
        ("[[:xdigit:]]", Kind::Xdigit),
    ];
    for (pattern, kind) in cases {
        assert_eq!(
            t(pattern),
            hir_ascii_uclass(&kind),
            "pattern: {:?}",
            pattern
        );
    }

    assert_eq!(
        t("[[:^lower:]]"),
        hir_negate(hir_ascii_uclass(&Kind::Lower))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[[:lower:]]"),
        hir_uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ])
    );

    assert_eq!(t("(?-u)[[:lower:]]"), hir_ascii_bclass(&Kind::Lower));
    assert_eq!(
        t("(?i-u)[[:lower:]]"),
        hir_case_fold(hir_ascii_bclass(&Kind::Lower))
    );

    let negated_bytes = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(6, 1, 7), Position::new(16, 1, 17)),
    };
    assert_eq!(t_err("(?-u)[[:^lower:]]"), negated_bytes);

    let negated_folded_bytes = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(7, 1, 8), Position::new(17, 1, 18)),
    };
    assert_eq!(t_err("(?i-u)[[:^lower:]]"), negated_folded_bytes);
}
2189
2190 #[test]
2191 fn class_ascii_multiple() {
2192 // See: https://github.com/rust-lang/regex/issues/680
2193 assert_eq!(
2194 t("[[:alnum:][:^ascii:]]"),
2195 hir_union(
2196 hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
2197 hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
2198 ),
2199 );
2200 assert_eq!(
2201 t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
2202 hir_union(
2203 hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
2204 hir_bclass(&[(0x80, 0xFF)]),
2205 ),
2206 );
2207 }
2208
2209 #[test]
2210 #[cfg(feature = "unicode-perl")]
2211 fn class_perl_unicode() {
2212 // Unicode
2213 assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
2214 assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
2215 assert_eq!(t(r"\w"), hir_uclass_perl_word());
2216 #[cfg(feature = "unicode-case")]
2217 assert_eq!(
2218 t(r"(?i)\d"),
2219 hir_uclass_query(ClassQuery::Binary("digit"))
2220 );
2221 #[cfg(feature = "unicode-case")]
2222 assert_eq!(
2223 t(r"(?i)\s"),
2224 hir_uclass_query(ClassQuery::Binary("space"))
2225 );
2226 #[cfg(feature = "unicode-case")]
2227 assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());
2228
2229 // Unicode, negated
2230 assert_eq!(
2231 t(r"\D"),
2232 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2233 );
2234 assert_eq!(
2235 t(r"\S"),
2236 hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2237 );
2238 assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
2239 #[cfg(feature = "unicode-case")]
2240 assert_eq!(
2241 t(r"(?i)\D"),
2242 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2243 );
2244 #[cfg(feature = "unicode-case")]
2245 assert_eq!(
2246 t(r"(?i)\S"),
2247 hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
2248 );
2249 #[cfg(feature = "unicode-case")]
2250 assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
2251 }
2252
2253 #[test]
2254 fn class_perl_ascii() {
2255 // ASCII only
2256 assert_eq!(
2257 t(r"(?-u)\d"),
2258 hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2259 );
2260 assert_eq!(
2261 t(r"(?-u)\s"),
2262 hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2263 );
2264 assert_eq!(
2265 t(r"(?-u)\w"),
2266 hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2267 );
2268 assert_eq!(
2269 t(r"(?i-u)\d"),
2270 hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
2271 );
2272 assert_eq!(
2273 t(r"(?i-u)\s"),
2274 hir_ascii_bclass(&ast::ClassAsciiKind::Space)
2275 );
2276 assert_eq!(
2277 t(r"(?i-u)\w"),
2278 hir_ascii_bclass(&ast::ClassAsciiKind::Word)
2279 );
2280
2281 // ASCII only, negated
2282 assert_eq!(
2283 t_bytes(r"(?-u)\D"),
2284 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2285 );
2286 assert_eq!(
2287 t_bytes(r"(?-u)\S"),
2288 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2289 );
2290 assert_eq!(
2291 t_bytes(r"(?-u)\W"),
2292 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2293 );
2294 assert_eq!(
2295 t_bytes(r"(?i-u)\D"),
2296 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
2297 );
2298 assert_eq!(
2299 t_bytes(r"(?i-u)\S"),
2300 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
2301 );
2302 assert_eq!(
2303 t_bytes(r"(?i-u)\W"),
2304 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
2305 );
2306
2307 // ASCII only, negated, with UTF-8 mode enabled.
2308 // In this case, negating any Perl class results in an error because
2309 // all such classes can match invalid UTF-8.
2310 assert_eq!(
2311 t_err(r"(?-u)\D"),
2312 TestError {
2313 kind: hir::ErrorKind::InvalidUtf8,
2314 span: Span::new(
2315 Position::new(5, 1, 6),
2316 Position::new(7, 1, 8),
2317 ),
2318 },
2319 );
2320 assert_eq!(
2321 t_err(r"(?-u)\S"),
2322 TestError {
2323 kind: hir::ErrorKind::InvalidUtf8,
2324 span: Span::new(
2325 Position::new(5, 1, 6),
2326 Position::new(7, 1, 8),
2327 ),
2328 },
2329 );
2330 assert_eq!(
2331 t_err(r"(?-u)\W"),
2332 TestError {
2333 kind: hir::ErrorKind::InvalidUtf8,
2334 span: Span::new(
2335 Position::new(5, 1, 6),
2336 Position::new(7, 1, 8),
2337 ),
2338 },
2339 );
2340 assert_eq!(
2341 t_err(r"(?i-u)\D"),
2342 TestError {
2343 kind: hir::ErrorKind::InvalidUtf8,
2344 span: Span::new(
2345 Position::new(6, 1, 7),
2346 Position::new(8, 1, 9),
2347 ),
2348 },
2349 );
2350 assert_eq!(
2351 t_err(r"(?i-u)\S"),
2352 TestError {
2353 kind: hir::ErrorKind::InvalidUtf8,
2354 span: Span::new(
2355 Position::new(6, 1, 7),
2356 Position::new(8, 1, 9),
2357 ),
2358 },
2359 );
2360 assert_eq!(
2361 t_err(r"(?i-u)\W"),
2362 TestError {
2363 kind: hir::ErrorKind::InvalidUtf8,
2364 span: Span::new(
2365 Position::new(6, 1, 7),
2366 Position::new(8, 1, 9),
2367 ),
2368 },
2369 );
2370 }
2371
2372 #[test]
2373 #[cfg(not(feature = "unicode-perl"))]
2374 fn class_perl_word_disabled() {
2375 assert_eq!(
2376 t_err(r"\w"),
2377 TestError {
2378 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2379 span: Span::new(
2380 Position::new(0, 1, 1),
2381 Position::new(2, 1, 3)
2382 ),
2383 }
2384 );
2385 }
2386
2387 #[test]
2388 #[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
2389 fn class_perl_space_disabled() {
2390 assert_eq!(
2391 t_err(r"\s"),
2392 TestError {
2393 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2394 span: Span::new(
2395 Position::new(0, 1, 1),
2396 Position::new(2, 1, 3)
2397 ),
2398 }
2399 );
2400 }
2401
2402 #[test]
2403 #[cfg(all(
2404 not(feature = "unicode-perl"),
2405 not(feature = "unicode-gencat")
2406 ))]
2407 fn class_perl_digit_disabled() {
2408 assert_eq!(
2409 t_err(r"\d"),
2410 TestError {
2411 kind: hir::ErrorKind::UnicodePerlClassNotFound,
2412 span: Span::new(
2413 Position::new(0, 1, 1),
2414 Position::new(2, 1, 3)
2415 ),
2416 }
2417 );
2418 }
2419
2420 #[test]
2421 #[cfg(feature = "unicode-gencat")]
2422 fn class_unicode_gencat() {
2423 assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
2424 assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
2425 assert_eq!(
2426 t(r"\p{Separator}"),
2427 hir_uclass_query(ClassQuery::Binary("Z"))
2428 );
2429 assert_eq!(
2430 t(r"\p{se PaRa ToR}"),
2431 hir_uclass_query(ClassQuery::Binary("Z"))
2432 );
2433 assert_eq!(
2434 t(r"\p{gc:Separator}"),
2435 hir_uclass_query(ClassQuery::Binary("Z"))
2436 );
2437 assert_eq!(
2438 t(r"\p{gc=Separator}"),
2439 hir_uclass_query(ClassQuery::Binary("Z"))
2440 );
2441 assert_eq!(
2442 t(r"\p{Other}"),
2443 hir_uclass_query(ClassQuery::Binary("Other"))
2444 );
2445 assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));
2446
2447 assert_eq!(
2448 t(r"\PZ"),
2449 hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2450 );
2451 assert_eq!(
2452 t(r"\P{separator}"),
2453 hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2454 );
2455 assert_eq!(
2456 t(r"\P{gc!=separator}"),
2457 hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
2458 );
2459
2460 assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
2461 assert_eq!(
2462 t(r"\p{assigned}"),
2463 hir_uclass_query(ClassQuery::Binary("Assigned"))
2464 );
2465 assert_eq!(
2466 t(r"\p{ascii}"),
2467 hir_uclass_query(ClassQuery::Binary("ASCII"))
2468 );
2469 assert_eq!(
2470 t(r"\p{gc:any}"),
2471 hir_uclass_query(ClassQuery::Binary("Any"))
2472 );
2473 assert_eq!(
2474 t(r"\p{gc:assigned}"),
2475 hir_uclass_query(ClassQuery::Binary("Assigned"))
2476 );
2477 assert_eq!(
2478 t(r"\p{gc:ascii}"),
2479 hir_uclass_query(ClassQuery::Binary("ASCII"))
2480 );
2481
2482 assert_eq!(
2483 t_err(r"(?-u)\pZ"),
2484 TestError {
2485 kind: hir::ErrorKind::UnicodeNotAllowed,
2486 span: Span::new(
2487 Position::new(5, 1, 6),
2488 Position::new(8, 1, 9)
2489 ),
2490 }
2491 );
2492 assert_eq!(
2493 t_err(r"(?-u)\p{Separator}"),
2494 TestError {
2495 kind: hir::ErrorKind::UnicodeNotAllowed,
2496 span: Span::new(
2497 Position::new(5, 1, 6),
2498 Position::new(18, 1, 19)
2499 ),
2500 }
2501 );
2502 assert_eq!(
2503 t_err(r"\pE"),
2504 TestError {
2505 kind: hir::ErrorKind::UnicodePropertyNotFound,
2506 span: Span::new(
2507 Position::new(0, 1, 1),
2508 Position::new(3, 1, 4)
2509 ),
2510 }
2511 );
2512 assert_eq!(
2513 t_err(r"\p{Foo}"),
2514 TestError {
2515 kind: hir::ErrorKind::UnicodePropertyNotFound,
2516 span: Span::new(
2517 Position::new(0, 1, 1),
2518 Position::new(7, 1, 8)
2519 ),
2520 }
2521 );
2522 assert_eq!(
2523 t_err(r"\p{gc:Foo}"),
2524 TestError {
2525 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2526 span: Span::new(
2527 Position::new(0, 1, 1),
2528 Position::new(10, 1, 11)
2529 ),
2530 }
2531 );
2532 }
2533
2534 #[test]
2535 #[cfg(not(feature = "unicode-gencat"))]
2536 fn class_unicode_gencat_disabled() {
2537 assert_eq!(
2538 t_err(r"\p{Separator}"),
2539 TestError {
2540 kind: hir::ErrorKind::UnicodePropertyNotFound,
2541 span: Span::new(
2542 Position::new(0, 1, 1),
2543 Position::new(13, 1, 14)
2544 ),
2545 }
2546 );
2547
2548 assert_eq!(
2549 t_err(r"\p{Any}"),
2550 TestError {
2551 kind: hir::ErrorKind::UnicodePropertyNotFound,
2552 span: Span::new(
2553 Position::new(0, 1, 1),
2554 Position::new(7, 1, 8)
2555 ),
2556 }
2557 );
2558 }
2559
2560 #[test]
2561 #[cfg(feature = "unicode-script")]
2562 fn class_unicode_script() {
2563 assert_eq!(
2564 t(r"\p{Greek}"),
2565 hir_uclass_query(ClassQuery::Binary("Greek"))
2566 );
2567 #[cfg(feature = "unicode-case")]
2568 assert_eq!(
2569 t(r"(?i)\p{Greek}"),
2570 hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
2571 );
2572 #[cfg(feature = "unicode-case")]
2573 assert_eq!(
2574 t(r"(?i)\P{Greek}"),
2575 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2576 "Greek"
2577 ))))
2578 );
2579
2580 assert_eq!(
2581 t_err(r"\p{sc:Foo}"),
2582 TestError {
2583 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2584 span: Span::new(
2585 Position::new(0, 1, 1),
2586 Position::new(10, 1, 11)
2587 ),
2588 }
2589 );
2590 assert_eq!(
2591 t_err(r"\p{scx:Foo}"),
2592 TestError {
2593 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2594 span: Span::new(
2595 Position::new(0, 1, 1),
2596 Position::new(11, 1, 12)
2597 ),
2598 }
2599 );
2600 }
2601
2602 #[test]
2603 #[cfg(not(feature = "unicode-script"))]
2604 fn class_unicode_script_disabled() {
2605 assert_eq!(
2606 t_err(r"\p{Greek}"),
2607 TestError {
2608 kind: hir::ErrorKind::UnicodePropertyNotFound,
2609 span: Span::new(
2610 Position::new(0, 1, 1),
2611 Position::new(9, 1, 10)
2612 ),
2613 }
2614 );
2615
2616 assert_eq!(
2617 t_err(r"\p{scx:Greek}"),
2618 TestError {
2619 kind: hir::ErrorKind::UnicodePropertyNotFound,
2620 span: Span::new(
2621 Position::new(0, 1, 1),
2622 Position::new(13, 1, 14)
2623 ),
2624 }
2625 );
2626 }
2627
2628 #[test]
2629 #[cfg(feature = "unicode-age")]
2630 fn class_unicode_age() {
2631 assert_eq!(
2632 t_err(r"\p{age:Foo}"),
2633 TestError {
2634 kind: hir::ErrorKind::UnicodePropertyValueNotFound,
2635 span: Span::new(
2636 Position::new(0, 1, 1),
2637 Position::new(11, 1, 12)
2638 ),
2639 }
2640 );
2641 }
2642
2643 #[test]
2644 #[cfg(feature = "unicode-gencat")]
2645 fn class_unicode_any_empty() {
2646 assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
2647 }
2648
2649 #[test]
2650 #[cfg(not(feature = "unicode-age"))]
2651 fn class_unicode_age_disabled() {
2652 assert_eq!(
2653 t_err(r"\p{age:3.0}"),
2654 TestError {
2655 kind: hir::ErrorKind::UnicodePropertyNotFound,
2656 span: Span::new(
2657 Position::new(0, 1, 1),
2658 Position::new(11, 1, 12)
2659 ),
2660 }
2661 );
2662 }
2663
2664 #[test]
2665 fn class_bracketed() {
2666 assert_eq!(t("[a]"), hir_lit("a"));
2667 assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
2668 assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
2669 assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
2670 assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
2671 assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
2672 assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
2673 assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
2674 assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
2675 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2676 assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
2677 #[cfg(feature = "unicode-gencat")]
2678 assert_eq!(
2679 t(r"[\pZ]"),
2680 hir_uclass_query(ClassQuery::Binary("separator"))
2681 );
2682 #[cfg(feature = "unicode-gencat")]
2683 assert_eq!(
2684 t(r"[\p{separator}]"),
2685 hir_uclass_query(ClassQuery::Binary("separator"))
2686 );
2687 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2688 assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
2689 #[cfg(feature = "unicode-gencat")]
2690 assert_eq!(
2691 t(r"[^\PZ]"),
2692 hir_uclass_query(ClassQuery::Binary("separator"))
2693 );
2694 #[cfg(feature = "unicode-gencat")]
2695 assert_eq!(
2696 t(r"[^\P{separator}]"),
2697 hir_uclass_query(ClassQuery::Binary("separator"))
2698 );
2699 #[cfg(all(
2700 feature = "unicode-case",
2701 any(feature = "unicode-perl", feature = "unicode-gencat")
2702 ))]
2703 assert_eq!(
2704 t(r"(?i)[^\D]"),
2705 hir_uclass_query(ClassQuery::Binary("digit"))
2706 );
2707 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2708 assert_eq!(
2709 t(r"(?i)[^\P{greek}]"),
2710 hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
2711 );
2712
2713 assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
2714 assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
2715 assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
2716
2717 #[cfg(feature = "unicode-case")]
2718 assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
2719 #[cfg(feature = "unicode-case")]
2720 assert_eq!(
2721 t("(?i)[k]"),
2722 hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
2723 );
2724 #[cfg(feature = "unicode-case")]
2725 assert_eq!(
2726 t("(?i)[β]"),
2727 hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
2728 );
2729 assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));
2730
2731 assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
2732 assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
2733 assert_eq!(
2734 t_bytes("(?-u)[^a]"),
2735 class_negate(bclass(&[(b'a', b'a')]))
2736 );
2737 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
2738 assert_eq!(
2739 t(r"[^\d]"),
2740 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
2741 );
2742 #[cfg(feature = "unicode-gencat")]
2743 assert_eq!(
2744 t(r"[^\pZ]"),
2745 hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
2746 );
2747 #[cfg(feature = "unicode-gencat")]
2748 assert_eq!(
2749 t(r"[^\p{separator}]"),
2750 hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
2751 );
2752 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2753 assert_eq!(
2754 t(r"(?i)[^\p{greek}]"),
2755 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2756 "greek"
2757 ))))
2758 );
2759 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
2760 assert_eq!(
2761 t(r"(?i)[\P{greek}]"),
2762 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
2763 "greek"
2764 ))))
2765 );
2766
2767 // Test some weird cases.
2768 assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
2769
2770 assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
2771 assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
2772 assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
2773 assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
2774 assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
2775
2776 assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
2777 assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
2778 assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
2779 assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
2780 assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
2781
2782 assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
2783 assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
2784 assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
2785 assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
2786 assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
2787
2788 assert_eq!(
2789 t_err("(?-u)[^a]"),
2790 TestError {
2791 kind: hir::ErrorKind::InvalidUtf8,
2792 span: Span::new(
2793 Position::new(5, 1, 6),
2794 Position::new(9, 1, 10)
2795 ),
2796 }
2797 );
2798 #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
2799 assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
2800 #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
2801 assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
2802 }
2803
2804 #[test]
2805 fn class_bracketed_union() {
2806 assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
2807 #[cfg(feature = "unicode-gencat")]
2808 assert_eq!(
2809 t(r"[a\pZb]"),
2810 hir_union(
2811 hir_uclass(&[('a', 'b')]),
2812 hir_uclass_query(ClassQuery::Binary("separator"))
2813 )
2814 );
2815 #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
2816 assert_eq!(
2817 t(r"[\pZ\p{Greek}]"),
2818 hir_union(
2819 hir_uclass_query(ClassQuery::Binary("greek")),
2820 hir_uclass_query(ClassQuery::Binary("separator"))
2821 )
2822 );
2823 #[cfg(all(
2824 feature = "unicode-age",
2825 feature = "unicode-gencat",
2826 feature = "unicode-script"
2827 ))]
2828 assert_eq!(
2829 t(r"[\p{age:3.0}\pZ\p{Greek}]"),
2830 hir_union(
2831 hir_uclass_query(ClassQuery::ByValue {
2832 property_name: "age",
2833 property_value: "3.0",
2834 }),
2835 hir_union(
2836 hir_uclass_query(ClassQuery::Binary("greek")),
2837 hir_uclass_query(ClassQuery::Binary("separator"))
2838 )
2839 )
2840 );
2841 #[cfg(all(
2842 feature = "unicode-age",
2843 feature = "unicode-gencat",
2844 feature = "unicode-script"
2845 ))]
2846 assert_eq!(
2847 t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
2848 hir_union(
2849 hir_uclass_query(ClassQuery::ByValue {
2850 property_name: "age",
2851 property_value: "3.0",
2852 }),
2853 hir_union(
2854 hir_uclass_query(ClassQuery::Binary("cyrillic")),
2855 hir_union(
2856 hir_uclass_query(ClassQuery::Binary("greek")),
2857 hir_uclass_query(ClassQuery::Binary("separator"))
2858 )
2859 )
2860 )
2861 );
2862
2863 #[cfg(all(
2864 feature = "unicode-age",
2865 feature = "unicode-case",
2866 feature = "unicode-gencat",
2867 feature = "unicode-script"
2868 ))]
2869 assert_eq!(
2870 t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
2871 hir_case_fold(hir_union(
2872 hir_uclass_query(ClassQuery::ByValue {
2873 property_name: "age",
2874 property_value: "3.0",
2875 }),
2876 hir_union(
2877 hir_uclass_query(ClassQuery::Binary("greek")),
2878 hir_uclass_query(ClassQuery::Binary("separator"))
2879 )
2880 ))
2881 );
2882 #[cfg(all(
2883 feature = "unicode-age",
2884 feature = "unicode-gencat",
2885 feature = "unicode-script"
2886 ))]
2887 assert_eq!(
2888 t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
2889 hir_negate(hir_union(
2890 hir_uclass_query(ClassQuery::ByValue {
2891 property_name: "age",
2892 property_value: "3.0",
2893 }),
2894 hir_union(
2895 hir_uclass_query(ClassQuery::Binary("greek")),
2896 hir_uclass_query(ClassQuery::Binary("separator"))
2897 )
2898 ))
2899 );
2900 #[cfg(all(
2901 feature = "unicode-age",
2902 feature = "unicode-case",
2903 feature = "unicode-gencat",
2904 feature = "unicode-script"
2905 ))]
2906 assert_eq!(
2907 t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
2908 hir_negate(hir_case_fold(hir_union(
2909 hir_uclass_query(ClassQuery::ByValue {
2910 property_name: "age",
2911 property_value: "3.0",
2912 }),
2913 hir_union(
2914 hir_uclass_query(ClassQuery::Binary("greek")),
2915 hir_uclass_query(ClassQuery::Binary("separator"))
2916 )
2917 )))
2918 );
2919 }
2920
2921 #[test]
2922 fn class_bracketed_nested() {
2923 assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
2924 assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
2925 assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));
2926
2927 assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
2928 assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));
2929
2930 #[cfg(feature = "unicode-case")]
2931 assert_eq!(
2932 t(r"(?i)[a[^c]]"),
2933 hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2934 );
2935 #[cfg(feature = "unicode-case")]
2936 assert_eq!(
2937 t(r"(?i)[a-b[^c]]"),
2938 hir_negate(class_case_fold(uclass(&[('c', 'c')])))
2939 );
2940
2941 #[cfg(feature = "unicode-case")]
2942 assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
2943 #[cfg(feature = "unicode-case")]
2944 assert_eq!(
2945 t(r"(?i)[^a-b[^c]]"),
2946 hir_uclass(&[('C', 'C'), ('c', 'c')])
2947 );
2948
2949 assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
2950 #[cfg(feature = "unicode-case")]
2951 assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
2952 }
2953
2954 #[test]
2955 fn class_bracketed_intersect() {
2956 assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
2957 assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2958 assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2959 assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
2960 assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
2961 assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
2962 assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
2963 assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
2964 assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
2965
2966 assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
2967 assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
2968 assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
2969 assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
2970 assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
2971 assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
2972
2973 #[cfg(feature = "unicode-case")]
2974 assert_eq!(
2975 t("(?i)[abc&&b-c]"),
2976 hir_case_fold(hir_uclass(&[('b', 'c')]))
2977 );
2978 #[cfg(feature = "unicode-case")]
2979 assert_eq!(
2980 t("(?i)[abc&&[b-c]]"),
2981 hir_case_fold(hir_uclass(&[('b', 'c')]))
2982 );
2983 #[cfg(feature = "unicode-case")]
2984 assert_eq!(
2985 t("(?i)[[abc]&&[b-c]]"),
2986 hir_case_fold(hir_uclass(&[('b', 'c')]))
2987 );
2988 #[cfg(feature = "unicode-case")]
2989 assert_eq!(
2990 t("(?i)[a-z&&b-y&&c-x]"),
2991 hir_case_fold(hir_uclass(&[('c', 'x')]))
2992 );
2993 #[cfg(feature = "unicode-case")]
2994 assert_eq!(
2995 t("(?i)[c-da-b&&a-d]"),
2996 hir_case_fold(hir_uclass(&[('a', 'd')]))
2997 );
2998 #[cfg(feature = "unicode-case")]
2999 assert_eq!(
3000 t("(?i)[a-d&&c-da-b]"),
3001 hir_case_fold(hir_uclass(&[('a', 'd')]))
3002 );
3003
3004 assert_eq!(
3005 t("(?i-u)[abc&&b-c]"),
3006 hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3007 );
3008 assert_eq!(
3009 t("(?i-u)[abc&&[b-c]]"),
3010 hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3011 );
3012 assert_eq!(
3013 t("(?i-u)[[abc]&&[b-c]]"),
3014 hir_case_fold(hir_bclass(&[(b'b', b'c')]))
3015 );
3016 assert_eq!(
3017 t("(?i-u)[a-z&&b-y&&c-x]"),
3018 hir_case_fold(hir_bclass(&[(b'c', b'x')]))
3019 );
3020 assert_eq!(
3021 t("(?i-u)[c-da-b&&a-d]"),
3022 hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3023 );
3024 assert_eq!(
3025 t("(?i-u)[a-d&&c-da-b]"),
3026 hir_case_fold(hir_bclass(&[(b'a', b'd')]))
3027 );
3028
3029 // In `[a^]`, `^` does not need to be escaped, so it makes sense that
3030 // `^` is also allowed to be unescaped after `&&`.
3031 assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
3032 // `]` needs to be escaped after `&&` since it's not at start of class.
3033 assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
3034 assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
3035 assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
3036 assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
3037 // Test precedence.
3038 assert_eq!(
3039 t(r"[a-w&&[^c-g]z]"),
3040 hir_uclass(&[('a', 'b'), ('h', 'w')])
3041 );
3042 }
3043
3044 #[test]
3045 fn class_bracketed_intersect_negate() {
3046 #[cfg(feature = "unicode-perl")]
3047 assert_eq!(
3048 t(r"[^\w&&\d]"),
3049 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
3050 );
3051 assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
3052 #[cfg(feature = "unicode-perl")]
3053 assert_eq!(
3054 t(r"[^[\w&&\d]]"),
3055 hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
3056 );
3057 #[cfg(feature = "unicode-perl")]
3058 assert_eq!(
3059 t(r"[^[^\w&&\d]]"),
3060 hir_uclass_query(ClassQuery::Binary("digit"))
3061 );
3062 #[cfg(feature = "unicode-perl")]
3063 assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));
3064
3065 #[cfg(feature = "unicode-perl")]
3066 assert_eq!(
3067 t_bytes(r"(?-u)[^\w&&\d]"),
3068 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
3069 );
3070 assert_eq!(
3071 t_bytes(r"(?-u)[^[a-z&&a-c]]"),
3072 hir_negate(hir_bclass(&[(b'a', b'c')]))
3073 );
3074 assert_eq!(
3075 t_bytes(r"(?-u)[^[\w&&\d]]"),
3076 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
3077 );
3078 assert_eq!(
3079 t_bytes(r"(?-u)[^[^\w&&\d]]"),
3080 hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
3081 );
3082 assert_eq!(
3083 t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
3084 hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
3085 );
3086 }
3087
3088 #[test]
3089 fn class_bracketed_difference() {
3090 #[cfg(feature = "unicode-gencat")]
3091 assert_eq!(
3092 t(r"[\pL--[:ascii:]]"),
3093 hir_difference(
3094 hir_uclass_query(ClassQuery::Binary("letter")),
3095 hir_uclass(&[('\0', '\x7F')])
3096 )
3097 );
3098
3099 assert_eq!(
3100 t(r"(?-u)[[:alpha:]--[:lower:]]"),
3101 hir_bclass(&[(b'A', b'Z')])
3102 );
3103 }
3104
3105 #[test]
3106 fn class_bracketed_symmetric_difference() {
3107 #[cfg(feature = "unicode-script")]
3108 assert_eq!(
3109 t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
3110 hir_uclass(&[
3111 ('\u{0342}', '\u{0342}'),
3112 ('\u{0345}', '\u{0345}'),
3113 ('\u{1DC0}', '\u{1DC1}'),
3114 ])
3115 );
3116 assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
3117
3118 assert_eq!(
3119 t(r"(?-u)[a-g~~c-j]"),
3120 hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
3121 );
3122 }
3123
    /// Exercises patterns under the `x` flag in which whitespace and `#`
    /// comments are interleaved with multi-token constructs (escapes,
    /// Unicode classes, counted repetitions). Each pattern is expected to
    /// translate as if the whitespace and comments were absent.
    #[test]
    fn ignore_whitespace() {
        // Octal escape followed by a digit separated by a space.
        assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
        // Braced hex escape with whitespace around and inside the braces.
        assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
        // Same escape spread over several lines with trailing comments.
        assert_eq!(
            t(r"(?x)\x # comment
{ # comment
    53 # comment
} #comment"),
            hir_lit("S")
        );

        // Two-digit hex escape with whitespace between its parts.
        assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
        assert_eq!(
            t(r"(?x)\x # comment
        53 # comment"),
            hir_lit("S")
        );
        assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

        // Unicode class name split across lines.
        #[cfg(feature = "unicode-gencat")]
        assert_eq!(
            t(r"(?x)\p # comment
{ # comment
    Separator # comment
} # comment"),
            hir_uclass_query(ClassQuery::Binary("separator"))
        );

        // Counted repetition `{5,10}` split across lines.
        assert_eq!(
            t(r"(?x)a # comment
{ # comment
    5 # comment
    , # comment
    10 # comment
} # comment"),
            hir_range(true, 5, Some(10), hir_lit("a"))
        );

        // An escaped space is kept as a literal; the comment is dropped.
        assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
    }
3165
3166 #[test]
3167 fn analysis_is_utf8() {
3168 // Positive examples.
3169 assert!(props_bytes(r"a").is_utf8());
3170 assert!(props_bytes(r"ab").is_utf8());
3171 assert!(props_bytes(r"(?-u)a").is_utf8());
3172 assert!(props_bytes(r"(?-u)ab").is_utf8());
3173 assert!(props_bytes(r"\xFF").is_utf8());
3174 assert!(props_bytes(r"\xFF\xFF").is_utf8());
3175 assert!(props_bytes(r"[^a]").is_utf8());
3176 assert!(props_bytes(r"[^a][^a]").is_utf8());
3177 assert!(props_bytes(r"\b").is_utf8());
3178 assert!(props_bytes(r"\B").is_utf8());
3179 assert!(props_bytes(r"(?-u)\b").is_utf8());
3180 assert!(props_bytes(r"(?-u)\B").is_utf8());
3181
3182 // Negative examples.
3183 assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
3184 assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
3185 assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
3186 assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
3187 }
3188
3189 #[test]
3190 fn analysis_captures_len() {
3191 assert_eq!(0, props(r"a").explicit_captures_len());
3192 assert_eq!(0, props(r"(?:a)").explicit_captures_len());
3193 assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len());
3194 assert_eq!(0, props(r"(?i-u)a").explicit_captures_len());
3195 assert_eq!(1, props(r"(a)").explicit_captures_len());
3196 assert_eq!(1, props(r"(?P<foo>a)").explicit_captures_len());
3197 assert_eq!(1, props(r"()").explicit_captures_len());
3198 assert_eq!(1, props(r"()a").explicit_captures_len());
3199 assert_eq!(1, props(r"(a)+").explicit_captures_len());
3200 assert_eq!(2, props(r"(a)(b)").explicit_captures_len());
3201 assert_eq!(2, props(r"(a)|(b)").explicit_captures_len());
3202 assert_eq!(2, props(r"((a))").explicit_captures_len());
3203 assert_eq!(1, props(r"([a&&b])").explicit_captures_len());
3204 }
3205
3206 #[test]
3207 fn analysis_static_captures_len() {
3208 let len = |pattern| props(pattern).static_explicit_captures_len();
3209 assert_eq!(Some(0), len(r""));
3210 assert_eq!(Some(0), len(r"foo|bar"));
3211 assert_eq!(None, len(r"(foo)|bar"));
3212 assert_eq!(None, len(r"foo|(bar)"));
3213 assert_eq!(Some(1), len(r"(foo|bar)"));
3214 assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
3215 assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
3216 assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
3217 assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
3218 assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
3219 assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
3220 assert_eq!(None, len(r"(a)(b)(extra)?"));
3221 assert_eq!(Some(1), len(r"(foo)|(bar)"));
3222 assert_eq!(Some(2), len(r"(foo)(bar)"));
3223 assert_eq!(Some(2), len(r"(foo)+(bar)"));
3224 assert_eq!(None, len(r"(foo)*(bar)"));
3225 assert_eq!(Some(0), len(r"(foo)?{0}"));
3226 assert_eq!(None, len(r"(foo)?{1}"));
3227 assert_eq!(Some(1), len(r"(foo){1}"));
3228 assert_eq!(Some(1), len(r"(foo){1,}"));
3229 assert_eq!(Some(1), len(r"(foo){1,}?"));
3230 assert_eq!(None, len(r"(foo){1,}??"));
3231 assert_eq!(None, len(r"(foo){0,}"));
3232 assert_eq!(Some(1), len(r"(foo)(?:bar)"));
3233 assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
3234 assert_eq!(Some(2), len(r"(?P<bar>foo)(?:bar)(bal|loon)"));
3235 assert_eq!(
3236 Some(2),
3237 len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
3238 );
3239 }
3240
// Checks the properties reported for patterns built solely from look-around
// assertions: the look set must be non-empty and the minimum length zero.
#[test]
fn analysis_is_all_assertions() {
    // Positive examples: assertions on their own, and combined through
    // concatenation, alternation, grouping and repetition.
    let assertion_only = [
        r"\b",
        r"\B",
        r"^",
        r"$",
        r"\A",
        r"\z",
        r"$^\z\A\b\B",
        r"$|^|\z|\A|\b|\B",
        r"^$|$^",
        r"((\b)+())*^",
    ];
    for pattern in assertion_only {
        let properties = props(pattern);
        assert!(!properties.look_set().is_empty(), "pattern: {}", pattern);
        assert_eq!(properties.minimum_len(), Some(0), "pattern: {}", pattern);
    }

    // Negative examples: the look set is still populated, but the trailing
    // literal raises the minimum length above zero.
    let properties = props(r"^a");
    assert!(!properties.look_set().is_empty());
    assert_eq!(properties.minimum_len(), Some(1));
}
3289
// Checks that an ASCII word boundary appearing in only one branch of the
// leading alternation is still reported by `look_set_prefix_any`.
#[test]
fn analysis_look_set_prefix_any() {
    let pattern = r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))";
    let properties = props(pattern);
    let prefix_looks = properties.look_set_prefix_any();
    assert!(prefix_looks.contains(Look::WordAscii));
}
3295
// Checks which patterns report `Look::Start` in their prefix look set and
// `Look::End` in their suffix look set.
//
// Every positive case is exercised for both ends. The `$^` case used to
// assert `is_start` twice, leaving the end-anchor half unchecked; it now
// asserts `is_end` as well.
#[test]
fn analysis_is_anchored() {
    let is_start =
        |pattern: &str| props(pattern).look_set_prefix().contains(Look::Start);
    let is_end =
        |pattern: &str| props(pattern).look_set_suffix().contains(Look::End);

    // Positive examples.
    let start_anchored = [
        r"^",
        r"^^",
        r"^$",
        r"^foo",
        r"^foo|^bar",
        r"^(foo|bar)",
        r"^+",
        r"^++",
        r"(^)+",
        r"$^",
        r"$^|^$",
        r"\b^",
        r"^(?m:^)",
        r"(?m:^)^",
    ];
    for pattern in start_anchored {
        assert!(is_start(pattern), "expected start anchor: {}", pattern);
    }

    let end_anchored = [
        r"$",
        r"$$",
        r"^$",
        r"foo$",
        r"foo$|bar$",
        r"(foo|bar)$",
        r"$+",
        r"$++",
        r"($)+",
        // Counterpart of `is_start(r"$^")`: `$^|^$` below is asserted to be
        // end-anchored, which requires this branch to be end-anchored too.
        r"$^",
        r"$^|^$",
        r"$\b",
        r"(?m:$)$",
        r"$(?m:$)",
    ];
    for pattern in end_anchored {
        assert!(is_end(pattern), "expected end anchor: {}", pattern);
    }

    // Negative examples.
    let not_start_anchored = [
        // Multi-line `^` is a different assertion than `Look::Start`.
        r"(?m)^",
        // One unanchored branch makes the whole alternation unanchored.
        r"(?m:^$)|$^",
        r"$^|(?m:^$)",
        r"a^",
        r"$a",
        r"^foo|bar",
        // A repetition that may match zero times does not anchor.
        r"^*",
        r"^*+",
        r"^+*",
        r"(^)*",
    ];
    for pattern in not_start_anchored {
        assert!(!is_start(pattern), "unexpected start anchor: {}", pattern);
    }

    let not_end_anchored = [
        r"(?m)$",
        r"(?m:^$)|$^",
        r"$^|(?m:^$)",
        r"a^",
        r"$a",
        r"foo|bar$",
        r"$*",
        r"$*+",
        r"$+*",
        r"($)*",
    ];
    for pattern in not_end_anchored {
        assert!(!is_end(pattern), "unexpected end anchor: {}", pattern);
    }
}
3365
// Checks which patterns report `Look::Start` / `Look::End` anywhere in
// their overall look set.
#[test]
fn analysis_is_any_anchored() {
    let has_start =
        |pattern: &str| props(pattern).look_set().contains(Look::Start);
    let has_end = |pattern: &str| props(pattern).look_set().contains(Look::End);

    // Positive examples.
    for pattern in [r"^", r"\A"] {
        assert!(has_start(pattern), "pattern: {}", pattern);
    }
    for pattern in [r"$", r"\z"] {
        assert!(has_end(pattern), "pattern: {}", pattern);
    }

    // Negative examples.
    for pattern in [r"(?m)^", r"$"] {
        assert!(!has_start(pattern), "pattern: {}", pattern);
    }
    for pattern in [r"(?m)$", r"^"] {
        assert!(!has_end(pattern), "pattern: {}", pattern);
    }
}
3383
// Checks, via `props_bytes`, which patterns report a minimum length of
// exactly zero and which do not.
#[test]
fn analysis_can_empty() {
    let min_len = |pattern: &str| props_bytes(pattern).minimum_len();

    // Positive examples.
    let may_be_empty = [
        r"",
        r"()",
        r"()*",
        r"()+",
        r"()?",
        r"a*",
        r"a?",
        r"a{0}",
        r"a{0,}",
        r"a{0,1}",
        r"a{0,10}",
        r"a*|b",
        r"b|a*",
        r"a|",
        r"|a",
        r"a||b",
        r"a*a?(abcd)*",
        r"^",
        r"$",
        r"(?m)^",
        r"(?m)$",
        r"\A",
        r"\z",
        r"\B",
        r"(?-u)\B",
        r"\b",
        r"(?-u)\b",
    ];
    for pattern in may_be_empty {
        assert_eq!(Some(0), min_len(pattern), "pattern: {}", pattern);
    }
    // Only checked when the `unicode-gencat` feature is enabled.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(Some(0), min_len(r"\pL*"));

    // Negative examples.
    let never_empty = [
        r"a+",
        r"a{1}",
        r"a{1,}",
        r"a{1,2}",
        r"a{1,10}",
        r"b|a",
        r"a*a+(abcd)*",
        r"[a--a]",
        r"[a&&b]",
    ];
    for pattern in never_empty {
        assert_ne!(Some(0), min_len(pattern), "pattern: {}", pattern);
    }
    // Only checked when the `unicode-gencat` feature is enabled.
    #[cfg(feature = "unicode-gencat")]
    assert_ne!(Some(0), min_len(r"\P{any}"));
}
3434
// Checks which patterns are reported as a literal by `is_literal`.
#[test]
fn analysis_is_literal() {
    // Positive examples.
    let literals = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"(?:a)",
        r"foo(?:a)",
        r"(?:a)foo",
        r"[a]",
    ];
    for pattern in literals {
        assert!(props(pattern).is_literal(), "pattern: {}", pattern);
    }

    // Negative examples.
    let non_literals = [
        r"",
        r"^",
        r"a|b",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
    ];
    for pattern in non_literals {
        assert!(!props(pattern).is_literal(), "pattern: {}", pattern);
    }
}
3457
// Checks which patterns are reported as a literal, or an alternation of
// literals, by `is_alternation_literal`.
#[test]
fn analysis_is_alternation_literal() {
    // Positive examples.
    let alternation_literals = [
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"foo|bar",
        r"foo|bar|baz",
        r"[a]",
        r"(?:ab)|cd",
        r"ab|(?:cd)",
    ];
    for pattern in alternation_literals {
        assert!(
            props(pattern).is_alternation_literal(),
            "pattern: {}",
            pattern
        );
    }

    // Negative examples.
    let others = [
        r"",
        r"^",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[ab]",
        r"[ab]|b",
        r"a|[ab]",
        r"(a)|b",
        r"a|(b)",
        r"a|b",
        r"a|b|c",
        r"[a]|b",
        r"a|[b]",
        r"(?:a)|b",
        r"a|(?:b)",
        r"(?:z|xx)@|xx",
    ];
    for pattern in others {
        assert!(
            !props(pattern).is_alternation_literal(),
            "pattern: {}",
            pattern
        );
    }
}
3491
// Exercises the simplifications performed by the smart Hir::concat
// constructor: empty concatenations collapse to the empty Hir and
// neighbouring literals are merged, even across non-capturing groups.
#[test]
fn smart_concat() {
    // Nothing to concatenate.
    for pattern in ["", "(?:)"] {
        assert_eq!(t(pattern), Hir::empty(), "pattern: {}", pattern);
    }

    // Adjacent literals fuse into a single literal.
    let fused = [
        ("abc", "abc"),
        ("(?:foo)(?:bar)", "foobar"),
        ("quux(?:foo)(?:bar)baz", "quuxfoobarbaz"),
    ];
    for (pattern, literal) in fused {
        assert_eq!(t(pattern), hir_lit(literal), "pattern: {}", pattern);
    }

    // A look-around in the middle splits the literal, but each side is
    // still merged regardless of how the groups are nested.
    let split_by_start = || {
        hir_cat(vec![
            hir_lit("foobar"),
            hir_look(hir::Look::Start),
            hir_lit("bazquux"),
        ])
    };
    for pattern in ["foo(?:bar^baz)quux", "foo(?:ba(?:r^b)az)quux"] {
        assert_eq!(t(pattern), split_by_start(), "pattern: {}", pattern);
    }
}
3518
// Exercises the simplifications performed by the smart Hir::alternation
// constructor: flattening of nested alternations, folding of
// single-character branches into a class, and lifting of common prefixes.
#[test]
fn smart_alternation() {
    let upper = || hir_uclass(&[('A', 'Z')]);

    // Non-capturing groups around branches disappear.
    let expected = hir_alt(vec![hir_lit("foo"), hir_lit("bar")]);
    assert_eq!(t("(?:foo)|(?:bar)"), expected);

    // Nested alternations are flattened into the parent, one level deep...
    let expected = hir_alt(vec![
        hir_lit("quux"),
        hir_lit("abc"),
        hir_lit("def"),
        hir_lit("xyz"),
        hir_lit("baz"),
    ]);
    assert_eq!(t("quux|(?:abc|def|xyz)|baz"), expected);

    // ...and two levels deep.
    let expected = hir_alt(vec![
        hir_lit("quux"),
        hir_lit("abc"),
        hir_lit("def"),
        hir_lit("mno"),
        hir_lit("xyz"),
        hir_lit("baz"),
    ]);
    assert_eq!(t("quux|(?:abc|(?:def|mno)|xyz)|baz"), expected);

    // Single-character branches become a character class.
    let expected = hir_uclass(&[('a', 'f'), ('x', 'z')]);
    assert_eq!(t("a|b|c|d|e|f|x|y|z"), expected);

    // Tests that we lift common prefixes out of an alternation.
    let expected =
        hir_cat(vec![upper(), hir_alt(vec![hir_lit("foo"), hir_lit("quux")])]);
    assert_eq!(t("[A-Z]foo|[A-Z]quux"), expected);

    let expected =
        hir_cat(vec![upper(), hir_alt(vec![upper(), hir_lit("quux")])]);
    assert_eq!(t("[A-Z][A-Z]|[A-Z]quux"), expected);

    let expected = hir_cat(vec![
        upper(),
        upper(),
        hir_alt(vec![Hir::empty(), hir_lit("quux")]),
    ]);
    assert_eq!(t("[A-Z][A-Z]|[A-Z][A-Z]quux"), expected);

    let expected = hir_cat(vec![
        upper(),
        hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]),
    ]);
    assert_eq!(t("[A-Z]foo|[A-Z]foobar"), expected);
}
3583}
3584