1/*!
2Defines a high-level intermediate representation for regular expressions.
3*/
4use std::char;
5use std::cmp;
6use std::error;
7use std::fmt;
8use std::result;
9use std::u8;
10
11use crate::ast::Span;
12use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
13use crate::unicode;
14
15pub use crate::hir::visitor::{visit, Visitor};
16pub use crate::unicode::CaseFoldError;
17
18mod interval;
19pub mod literal;
20pub mod print;
21pub mod translate;
22mod visitor;
23
24/// An error that can occur while translating an `Ast` to a `Hir`.
25#[derive(Clone, Debug, Eq, PartialEq)]
26pub struct Error {
27 /// The kind of error.
28 kind: ErrorKind,
29 /// The original pattern that the translator's Ast was parsed from. Every
30 /// span in an error is a valid range into this string.
31 pattern: String,
32 /// The span of this error, derived from the Ast given to the translator.
33 span: Span,
34}
35
36impl Error {
37 /// Return the type of this error.
38 pub fn kind(&self) -> &ErrorKind {
39 &self.kind
40 }
41
42 /// The original pattern string in which this error occurred.
43 ///
44 /// Every span reported by this error is reported in terms of this string.
45 pub fn pattern(&self) -> &str {
46 &self.pattern
47 }
48
49 /// Return the span at which this error occurred.
50 pub fn span(&self) -> &Span {
51 &self.span
52 }
53}
54
/// The category of an error raised while building an `Hir`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ErrorKind {
    /// A Unicode feature was used while Unicode support is disabled. For
    /// example, `(?-u:\pL)` triggers this error.
    UnicodeNotAllowed,
    /// The pattern being translated could match a byte sequence that is not
    /// UTF-8 while `allow_invalid_utf8` was disabled.
    InvalidUtf8,
    /// An unrecognized Unicode property name was used and could not be
    /// found.
    UnicodePropertyNotFound,
    /// An unrecognized Unicode property value was used and could not be
    /// found.
    UnicodePropertyValueNotFound,
    /// A Unicode-aware Perl character class (`\w`, `\s` or `\d`) could not
    /// be found. This can happen when the `unicode-perl` crate feature is
    /// not enabled.
    UnicodePerlClassNotFound,
    /// The regular expression required Unicode-aware case insensitivity,
    /// but the Unicode simple case mapping tables are not available.
    UnicodeCaseUnavailable,
    /// The translator attempted to construct a character class that is
    /// empty.
    ///
    /// Note that this restriction in the translator may be removed in the
    /// future.
    EmptyClassNotAllowed,
    /// Hints that destructuring should not be exhaustive.
    ///
    /// More variants may be added to this enum later, so this variant keeps
    /// clients from relying on exhaustive matching. (Without it, adding a
    /// variant could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
}

impl ErrorKind {
    // TODO: Remove this method entirely on the next breaking semver release.
    /// Returns the fixed, human-readable message for this kind of error.
    ///
    /// Reaching this method with the hidden `__Nonexhaustive` variant is
    /// treated as impossible and panics via `unreachable!()`.
    #[allow(deprecated)]
    fn description(&self) -> &str {
        match *self {
            ErrorKind::UnicodeNotAllowed => "Unicode not allowed here",
            ErrorKind::InvalidUtf8 => "pattern can match invalid UTF-8",
            ErrorKind::UnicodePropertyNotFound => "Unicode property not found",
            ErrorKind::UnicodePropertyValueNotFound => {
                "Unicode property value not found"
            }
            ErrorKind::UnicodePerlClassNotFound => {
                "Unicode-aware Perl class not found \
                 (make sure the unicode-perl feature is enabled)"
            }
            ErrorKind::UnicodeCaseUnavailable => {
                "Unicode-aware case insensitivity matching is not available \
                 (make sure the unicode-case feature is enabled)"
            }
            ErrorKind::EmptyClassNotAllowed => {
                "empty character classes are not allowed"
            }
            ErrorKind::__Nonexhaustive => unreachable!(),
        }
    }
}
116
117impl error::Error for Error {
118 // TODO: Remove this method entirely on the next breaking semver release.
119 #[allow(deprecated)]
120 fn description(&self) -> &str {
121 self.kind.description()
122 }
123}
124
125impl fmt::Display for Error {
126 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127 crate::error::Formatter::from(self).fmt(f)
128 }
129}
130
131impl fmt::Display for ErrorKind {
132 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133 // TODO: Remove this on the next breaking semver release.
134 #[allow(deprecated)]
135 f.write_str(self.description())
136 }
137}
138
139/// A high-level intermediate representation (HIR) for a regular expression.
140///
141/// The HIR of a regular expression represents an intermediate step between its
142/// abstract syntax (a structured description of the concrete syntax) and
143/// compiled byte codes. The purpose of HIR is to make regular expressions
144/// easier to analyze. In particular, the AST is much more complex than the
145/// HIR. For example, while an AST supports arbitrarily nested character
146/// classes, the HIR will flatten all nested classes into a single set. The HIR
147/// will also "compile away" every flag present in the concrete syntax. For
148/// example, users of HIR expressions never need to worry about case folding;
149/// it is handled automatically by the translator (e.g., by translating `(?i)A`
150/// to `[aA]`).
151///
152/// If the HIR was produced by a translator that disallows invalid UTF-8, then
153/// the HIR is guaranteed to match UTF-8 exclusively.
154///
155/// This type defines its own destructor that uses constant stack space and
156/// heap space proportional to the size of the HIR.
157///
158/// The specific type of an HIR expression can be accessed via its `kind`
159/// or `into_kind` methods. This extra level of indirection exists for two
160/// reasons:
161///
162/// 1. Construction of an HIR expression *must* use the constructor methods
163/// on this `Hir` type instead of building the `HirKind` values directly.
164/// This permits construction to enforce invariants like "concatenations
165/// always consist of two or more sub-expressions."
166/// 2. Every HIR expression contains attributes that are defined inductively,
167/// and can be computed cheaply during the construction process. For
168/// example, one such attribute is whether the expression must match at the
169/// beginning of the text.
170///
171/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
172/// expression pattern string, and uses constant stack space and heap space
173/// proportional to the size of the `Hir`.
174#[derive(Clone, Debug, Eq, PartialEq)]
175pub struct Hir {
176 /// The underlying HIR kind.
177 kind: HirKind,
178 /// Analysis info about this HIR, computed during construction.
179 info: HirInfo,
180}
181
182/// The kind of an arbitrary `Hir` expression.
183#[derive(Clone, Debug, Eq, PartialEq)]
184pub enum HirKind {
185 /// The empty regular expression, which matches everything, including the
186 /// empty string.
187 Empty,
188 /// A single literal character that matches exactly this character.
189 Literal(Literal),
190 /// A single character class that matches any of the characters in the
191 /// class. A class can either consist of Unicode scalar values as
192 /// characters, or it can use bytes.
193 Class(Class),
194 /// An anchor assertion. An anchor assertion match always has zero length.
195 Anchor(Anchor),
196 /// A word boundary assertion, which may or may not be Unicode aware. A
197 /// word boundary assertion match always has zero length.
198 WordBoundary(WordBoundary),
199 /// A repetition operation applied to a child expression.
200 Repetition(Repetition),
201 /// A possibly capturing group, which contains a child expression.
202 Group(Group),
203 /// A concatenation of expressions. A concatenation always has at least two
204 /// child expressions.
205 ///
206 /// A concatenation matches only if each of its child expression matches
207 /// one after the other.
208 Concat(Vec<Hir>),
209 /// An alternation of expressions. An alternation always has at least two
210 /// child expressions.
211 ///
212 /// An alternation matches only if at least one of its child expression
213 /// matches. If multiple expressions match, then the leftmost is preferred.
214 Alternation(Vec<Hir>),
215}
216
217impl Hir {
218 /// Returns a reference to the underlying HIR kind.
219 pub fn kind(&self) -> &HirKind {
220 &self.kind
221 }
222
223 /// Consumes ownership of this HIR expression and returns its underlying
224 /// `HirKind`.
225 pub fn into_kind(mut self) -> HirKind {
226 use std::mem;
227 mem::replace(&mut self.kind, HirKind::Empty)
228 }
229
230 /// Returns an empty HIR expression.
231 ///
232 /// An empty HIR expression always matches, including the empty string.
233 pub fn empty() -> Hir {
234 let mut info = HirInfo::new();
235 info.set_always_utf8(true);
236 info.set_all_assertions(true);
237 info.set_anchored_start(false);
238 info.set_anchored_end(false);
239 info.set_line_anchored_start(false);
240 info.set_line_anchored_end(false);
241 info.set_any_anchored_start(false);
242 info.set_any_anchored_end(false);
243 info.set_match_empty(true);
244 info.set_literal(false);
245 info.set_alternation_literal(false);
246 Hir { kind: HirKind::Empty, info }
247 }
248
249 /// Creates a literal HIR expression.
250 ///
251 /// If the given literal has a `Byte` variant with an ASCII byte, then this
252 /// method panics. This enforces the invariant that `Byte` variants are
253 /// only used to express matching of invalid UTF-8.
254 pub fn literal(lit: Literal) -> Hir {
255 if let Literal::Byte(b) = lit {
256 assert!(b > 0x7F);
257 }
258
259 let mut info = HirInfo::new();
260 info.set_always_utf8(lit.is_unicode());
261 info.set_all_assertions(false);
262 info.set_anchored_start(false);
263 info.set_anchored_end(false);
264 info.set_line_anchored_start(false);
265 info.set_line_anchored_end(false);
266 info.set_any_anchored_start(false);
267 info.set_any_anchored_end(false);
268 info.set_match_empty(false);
269 info.set_literal(true);
270 info.set_alternation_literal(true);
271 Hir { kind: HirKind::Literal(lit), info }
272 }
273
274 /// Creates a class HIR expression.
275 pub fn class(class: Class) -> Hir {
276 let mut info = HirInfo::new();
277 info.set_always_utf8(class.is_always_utf8());
278 info.set_all_assertions(false);
279 info.set_anchored_start(false);
280 info.set_anchored_end(false);
281 info.set_line_anchored_start(false);
282 info.set_line_anchored_end(false);
283 info.set_any_anchored_start(false);
284 info.set_any_anchored_end(false);
285 info.set_match_empty(false);
286 info.set_literal(false);
287 info.set_alternation_literal(false);
288 Hir { kind: HirKind::Class(class), info }
289 }
290
291 /// Creates an anchor assertion HIR expression.
292 pub fn anchor(anchor: Anchor) -> Hir {
293 let mut info = HirInfo::new();
294 info.set_always_utf8(true);
295 info.set_all_assertions(true);
296 info.set_anchored_start(false);
297 info.set_anchored_end(false);
298 info.set_line_anchored_start(false);
299 info.set_line_anchored_end(false);
300 info.set_any_anchored_start(false);
301 info.set_any_anchored_end(false);
302 info.set_match_empty(true);
303 info.set_literal(false);
304 info.set_alternation_literal(false);
305 if let Anchor::StartText = anchor {
306 info.set_anchored_start(true);
307 info.set_line_anchored_start(true);
308 info.set_any_anchored_start(true);
309 }
310 if let Anchor::EndText = anchor {
311 info.set_anchored_end(true);
312 info.set_line_anchored_end(true);
313 info.set_any_anchored_end(true);
314 }
315 if let Anchor::StartLine = anchor {
316 info.set_line_anchored_start(true);
317 }
318 if let Anchor::EndLine = anchor {
319 info.set_line_anchored_end(true);
320 }
321 Hir { kind: HirKind::Anchor(anchor), info }
322 }
323
324 /// Creates a word boundary assertion HIR expression.
325 pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
326 let mut info = HirInfo::new();
327 info.set_always_utf8(true);
328 info.set_all_assertions(true);
329 info.set_anchored_start(false);
330 info.set_anchored_end(false);
331 info.set_line_anchored_start(false);
332 info.set_line_anchored_end(false);
333 info.set_any_anchored_start(false);
334 info.set_any_anchored_end(false);
335 info.set_literal(false);
336 info.set_alternation_literal(false);
337 // A negated word boundary matches '', so that's fine. But \b does not
338 // match \b, so why do we say it can match the empty string? Well,
339 // because, if you search for \b against 'a', it will report [0, 0) and
340 // [1, 1) as matches, and both of those matches correspond to the empty
341 // string. Thus, only *certain* empty strings match \b, which similarly
342 // applies to \B.
343 info.set_match_empty(true);
344 // Negated ASCII word boundaries can match invalid UTF-8.
345 if let WordBoundary::AsciiNegate = word_boundary {
346 info.set_always_utf8(false);
347 }
348 Hir { kind: HirKind::WordBoundary(word_boundary), info }
349 }
350
351 /// Creates a repetition HIR expression.
352 pub fn repetition(rep: Repetition) -> Hir {
353 let mut info = HirInfo::new();
354 info.set_always_utf8(rep.hir.is_always_utf8());
355 info.set_all_assertions(rep.hir.is_all_assertions());
356 // If this operator can match the empty string, then it can never
357 // be anchored.
358 info.set_anchored_start(
359 !rep.is_match_empty() && rep.hir.is_anchored_start(),
360 );
361 info.set_anchored_end(
362 !rep.is_match_empty() && rep.hir.is_anchored_end(),
363 );
364 info.set_line_anchored_start(
365 !rep.is_match_empty() && rep.hir.is_anchored_start(),
366 );
367 info.set_line_anchored_end(
368 !rep.is_match_empty() && rep.hir.is_anchored_end(),
369 );
370 info.set_any_anchored_start(rep.hir.is_any_anchored_start());
371 info.set_any_anchored_end(rep.hir.is_any_anchored_end());
372 info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
373 info.set_literal(false);
374 info.set_alternation_literal(false);
375 Hir { kind: HirKind::Repetition(rep), info }
376 }
377
378 /// Creates a group HIR expression.
379 pub fn group(group: Group) -> Hir {
380 let mut info = HirInfo::new();
381 info.set_always_utf8(group.hir.is_always_utf8());
382 info.set_all_assertions(group.hir.is_all_assertions());
383 info.set_anchored_start(group.hir.is_anchored_start());
384 info.set_anchored_end(group.hir.is_anchored_end());
385 info.set_line_anchored_start(group.hir.is_line_anchored_start());
386 info.set_line_anchored_end(group.hir.is_line_anchored_end());
387 info.set_any_anchored_start(group.hir.is_any_anchored_start());
388 info.set_any_anchored_end(group.hir.is_any_anchored_end());
389 info.set_match_empty(group.hir.is_match_empty());
390 info.set_literal(false);
391 info.set_alternation_literal(false);
392 Hir { kind: HirKind::Group(group), info }
393 }
394
395 /// Returns the concatenation of the given expressions.
396 ///
397 /// This flattens the concatenation as appropriate.
398 pub fn concat(mut exprs: Vec<Hir>) -> Hir {
399 match exprs.len() {
400 0 => Hir::empty(),
401 1 => exprs.pop().unwrap(),
402 _ => {
403 let mut info = HirInfo::new();
404 info.set_always_utf8(true);
405 info.set_all_assertions(true);
406 info.set_any_anchored_start(false);
407 info.set_any_anchored_end(false);
408 info.set_match_empty(true);
409 info.set_literal(true);
410 info.set_alternation_literal(true);
411
412 // Some attributes require analyzing all sub-expressions.
413 for e in &exprs {
414 let x = info.is_always_utf8() && e.is_always_utf8();
415 info.set_always_utf8(x);
416
417 let x = info.is_all_assertions() && e.is_all_assertions();
418 info.set_all_assertions(x);
419
420 let x = info.is_any_anchored_start()
421 || e.is_any_anchored_start();
422 info.set_any_anchored_start(x);
423
424 let x =
425 info.is_any_anchored_end() || e.is_any_anchored_end();
426 info.set_any_anchored_end(x);
427
428 let x = info.is_match_empty() && e.is_match_empty();
429 info.set_match_empty(x);
430
431 let x = info.is_literal() && e.is_literal();
432 info.set_literal(x);
433
434 let x = info.is_alternation_literal()
435 && e.is_alternation_literal();
436 info.set_alternation_literal(x);
437 }
438 // Anchored attributes require something slightly more
439 // sophisticated. Normally, WLOG, to determine whether an
440 // expression is anchored to the start, we'd only need to check
441 // the first expression of a concatenation. However,
442 // expressions like `$\b^` are still anchored to the start,
443 // but the first expression in the concatenation *isn't*
444 // anchored to the start. So the "first" expression to look at
445 // is actually one that is either not an assertion or is
446 // specifically the StartText assertion.
447 info.set_anchored_start(
448 exprs
449 .iter()
450 .take_while(|e| {
451 e.is_anchored_start() || e.is_all_assertions()
452 })
453 .any(|e| e.is_anchored_start()),
454 );
455 // Similarly for the end anchor, but in reverse.
456 info.set_anchored_end(
457 exprs
458 .iter()
459 .rev()
460 .take_while(|e| {
461 e.is_anchored_end() || e.is_all_assertions()
462 })
463 .any(|e| e.is_anchored_end()),
464 );
465 // Repeat the process for line anchors.
466 info.set_line_anchored_start(
467 exprs
468 .iter()
469 .take_while(|e| {
470 e.is_line_anchored_start() || e.is_all_assertions()
471 })
472 .any(|e| e.is_line_anchored_start()),
473 );
474 info.set_line_anchored_end(
475 exprs
476 .iter()
477 .rev()
478 .take_while(|e| {
479 e.is_line_anchored_end() || e.is_all_assertions()
480 })
481 .any(|e| e.is_line_anchored_end()),
482 );
483 Hir { kind: HirKind::Concat(exprs), info }
484 }
485 }
486 }
487
488 /// Returns the alternation of the given expressions.
489 ///
490 /// This flattens the alternation as appropriate.
491 pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
492 match exprs.len() {
493 0 => Hir::empty(),
494 1 => exprs.pop().unwrap(),
495 _ => {
496 let mut info = HirInfo::new();
497 info.set_always_utf8(true);
498 info.set_all_assertions(true);
499 info.set_anchored_start(true);
500 info.set_anchored_end(true);
501 info.set_line_anchored_start(true);
502 info.set_line_anchored_end(true);
503 info.set_any_anchored_start(false);
504 info.set_any_anchored_end(false);
505 info.set_match_empty(false);
506 info.set_literal(false);
507 info.set_alternation_literal(true);
508
509 // Some attributes require analyzing all sub-expressions.
510 for e in &exprs {
511 let x = info.is_always_utf8() && e.is_always_utf8();
512 info.set_always_utf8(x);
513
514 let x = info.is_all_assertions() && e.is_all_assertions();
515 info.set_all_assertions(x);
516
517 let x = info.is_anchored_start() && e.is_anchored_start();
518 info.set_anchored_start(x);
519
520 let x = info.is_anchored_end() && e.is_anchored_end();
521 info.set_anchored_end(x);
522
523 let x = info.is_line_anchored_start()
524 && e.is_line_anchored_start();
525 info.set_line_anchored_start(x);
526
527 let x = info.is_line_anchored_end()
528 && e.is_line_anchored_end();
529 info.set_line_anchored_end(x);
530
531 let x = info.is_any_anchored_start()
532 || e.is_any_anchored_start();
533 info.set_any_anchored_start(x);
534
535 let x =
536 info.is_any_anchored_end() || e.is_any_anchored_end();
537 info.set_any_anchored_end(x);
538
539 let x = info.is_match_empty() || e.is_match_empty();
540 info.set_match_empty(x);
541
542 let x = info.is_alternation_literal() && e.is_literal();
543 info.set_alternation_literal(x);
544 }
545 Hir { kind: HirKind::Alternation(exprs), info }
546 }
547 }
548 }
549
550 /// Build an HIR expression for `.`.
551 ///
552 /// A `.` expression matches any character except for `\n`. To build an
553 /// expression that matches any character, including `\n`, use the `any`
554 /// method.
555 ///
556 /// If `bytes` is `true`, then this assumes characters are limited to a
557 /// single byte.
558 pub fn dot(bytes: bool) -> Hir {
559 if bytes {
560 let mut cls = ClassBytes::empty();
561 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
562 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
563 Hir::class(Class::Bytes(cls))
564 } else {
565 let mut cls = ClassUnicode::empty();
566 cls.push(ClassUnicodeRange::new('\0', '\x09'));
567 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
568 Hir::class(Class::Unicode(cls))
569 }
570 }
571
572 /// Build an HIR expression for `(?s).`.
573 ///
574 /// A `(?s).` expression matches any character, including `\n`. To build an
575 /// expression that matches any character except for `\n`, then use the
576 /// `dot` method.
577 ///
578 /// If `bytes` is `true`, then this assumes characters are limited to a
579 /// single byte.
580 pub fn any(bytes: bool) -> Hir {
581 if bytes {
582 let mut cls = ClassBytes::empty();
583 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
584 Hir::class(Class::Bytes(cls))
585 } else {
586 let mut cls = ClassUnicode::empty();
587 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
588 Hir::class(Class::Unicode(cls))
589 }
590 }
591
592 /// Return true if and only if this HIR will always match valid UTF-8.
593 ///
594 /// When this returns false, then it is possible for this HIR expression
595 /// to match invalid UTF-8.
596 pub fn is_always_utf8(&self) -> bool {
597 self.info.is_always_utf8()
598 }
599
600 /// Returns true if and only if this entire HIR expression is made up of
601 /// zero-width assertions.
602 ///
603 /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
604 /// not `^a`.
605 pub fn is_all_assertions(&self) -> bool {
606 self.info.is_all_assertions()
607 }
608
609 /// Return true if and only if this HIR is required to match from the
610 /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
611 /// `^foo|^bar` but not `^foo|bar`.
612 pub fn is_anchored_start(&self) -> bool {
613 self.info.is_anchored_start()
614 }
615
616 /// Return true if and only if this HIR is required to match at the end
617 /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
618 /// `foo$|bar$` but not `foo$|bar`.
619 pub fn is_anchored_end(&self) -> bool {
620 self.info.is_anchored_end()
621 }
622
623 /// Return true if and only if this HIR is required to match from the
624 /// beginning of text or the beginning of a line. This includes expressions
625 /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
626 /// but not `^foo|bar` or `(?m)^foo|bar`.
627 ///
628 /// Note that if `is_anchored_start` is `true`, then
629 /// `is_line_anchored_start` will also be `true`. The reverse implication
630 /// is not true. For example, `(?m)^foo` is line anchored, but not
631 /// `is_anchored_start`.
632 pub fn is_line_anchored_start(&self) -> bool {
633 self.info.is_line_anchored_start()
634 }
635
636 /// Return true if and only if this HIR is required to match at the
637 /// end of text or the end of a line. This includes expressions like
638 /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
639 /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
640 ///
641 /// Note that if `is_anchored_end` is `true`, then
642 /// `is_line_anchored_end` will also be `true`. The reverse implication
643 /// is not true. For example, `(?m)foo$` is line anchored, but not
644 /// `is_anchored_end`.
645 pub fn is_line_anchored_end(&self) -> bool {
646 self.info.is_line_anchored_end()
647 }
648
649 /// Return true if and only if this HIR contains any sub-expression that
650 /// is required to match at the beginning of text. Specifically, this
651 /// returns true if the `^` symbol (when multiline mode is disabled) or the
652 /// `\A` escape appear anywhere in the regex.
653 pub fn is_any_anchored_start(&self) -> bool {
654 self.info.is_any_anchored_start()
655 }
656
657 /// Return true if and only if this HIR contains any sub-expression that is
658 /// required to match at the end of text. Specifically, this returns true
659 /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
660 /// appear anywhere in the regex.
661 pub fn is_any_anchored_end(&self) -> bool {
662 self.info.is_any_anchored_end()
663 }
664
665 /// Return true if and only if the empty string is part of the language
666 /// matched by this regular expression.
667 ///
668 /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\b`
669 /// and `\B`, but not `a` or `a+`.
670 pub fn is_match_empty(&self) -> bool {
671 self.info.is_match_empty()
672 }
673
674 /// Return true if and only if this HIR is a simple literal. This is only
675 /// true when this HIR expression is either itself a `Literal` or a
676 /// concatenation of only `Literal`s.
677 ///
678 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
679 /// `` are not (even though that contain sub-expressions that are literals).
680 pub fn is_literal(&self) -> bool {
681 self.info.is_literal()
682 }
683
684 /// Return true if and only if this HIR is either a simple literal or an
685 /// alternation of simple literals. This is only
686 /// true when this HIR expression is either itself a `Literal` or a
687 /// concatenation of only `Literal`s or an alternation of only `Literal`s.
688 ///
689 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
690 /// literals, but `f+`, `(foo)`, `foo()`, ``
691 /// are not (even though that contain sub-expressions that are literals).
692 pub fn is_alternation_literal(&self) -> bool {
693 self.info.is_alternation_literal()
694 }
695}
696
697impl HirKind {
698 /// Return true if and only if this HIR is the empty regular expression.
699 ///
700 /// Note that this is not defined inductively. That is, it only tests if
701 /// this kind is the `Empty` variant. To get the inductive definition,
702 /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
703 pub fn is_empty(&self) -> bool {
704 match *self {
705 HirKind::Empty => true,
706 _ => false,
707 }
708 }
709
710 /// Returns true if and only if this kind has any (including possibly
711 /// empty) subexpressions.
712 pub fn has_subexprs(&self) -> bool {
713 match *self {
714 HirKind::Empty
715 | HirKind::Literal(_)
716 | HirKind::Class(_)
717 | HirKind::Anchor(_)
718 | HirKind::WordBoundary(_) => false,
719 HirKind::Group(_)
720 | HirKind::Repetition(_)
721 | HirKind::Concat(_)
722 | HirKind::Alternation(_) => true,
723 }
724 }
725}
726
727/// Print a display representation of this Hir.
728///
729/// The result of this is a valid regular expression pattern string.
730///
731/// This implementation uses constant stack space and heap space proportional
732/// to the size of the `Hir`.
733impl fmt::Display for Hir {
734 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
735 use crate::hir::print::Printer;
736 Printer::new().print(self, f)
737 }
738}
739
/// The high-level intermediate representation of a literal.
///
/// A literal is a single character, and a character is either a Unicode
/// scalar value or an arbitrary byte. Unicode characters are preferred
/// whenever possible; a `Byte` variant is only ever produced when it could
/// match invalid UTF-8.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Literal {
    /// One character, given as a Unicode scalar value.
    Unicode(char),
    /// One character, given as an arbitrary byte.
    Byte(u8),
}

impl Literal {
    /// Reports whether this literal corresponds to a Unicode scalar value.
    ///
    /// That holds for every `Unicode` literal, and for a `Byte` literal
    /// whose byte is in the ASCII range (`0x7F` or below).
    pub fn is_unicode(&self) -> bool {
        match *self {
            Literal::Unicode(_) => true,
            Literal::Byte(byte) => byte <= 0x7F,
        }
    }
}
765
766/// The high-level intermediate representation of a character class.
767///
768/// A character class corresponds to a set of characters. A character is either
769/// defined by a Unicode scalar value or a byte. Unicode characters are used
770/// by default, while bytes are used when Unicode mode (via the `u` flag) is
771/// disabled.
772///
773/// A character class, regardless of its character type, is represented by a
774/// sequence of non-overlapping non-adjacent ranges of characters.
775///
776/// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
777/// be produced even when it exclusively matches valid UTF-8. This is because
778/// a `Bytes` variant represents an intention by the author of the regular
779/// expression to disable Unicode mode, which in turn impacts the semantics of
780/// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
781/// match the same set of strings.
782#[derive(Clone, Debug, Eq, PartialEq)]
783pub enum Class {
784 /// A set of characters represented by Unicode scalar values.
785 Unicode(ClassUnicode),
786 /// A set of characters represented by arbitrary bytes (one byte per
787 /// character).
788 Bytes(ClassBytes),
789}
790
791impl Class {
792 /// Apply Unicode simple case folding to this character class, in place.
793 /// The character class will be expanded to include all simple case folded
794 /// character variants.
795 ///
796 /// If this is a byte oriented character class, then this will be limited
797 /// to the ASCII ranges `A-Z` and `a-z`.
798 pub fn case_fold_simple(&mut self) {
799 match *self {
800 Class::Unicode(ref mut x) => x.case_fold_simple(),
801 Class::Bytes(ref mut x) => x.case_fold_simple(),
802 }
803 }
804
805 /// Negate this character class in place.
806 ///
807 /// After completion, this character class will contain precisely the
808 /// characters that weren't previously in the class.
809 pub fn negate(&mut self) {
810 match *self {
811 Class::Unicode(ref mut x) => x.negate(),
812 Class::Bytes(ref mut x) => x.negate(),
813 }
814 }
815
816 /// Returns true if and only if this character class will only ever match
817 /// valid UTF-8.
818 ///
819 /// A character class can match invalid UTF-8 only when the following
820 /// conditions are met:
821 ///
822 /// 1. The translator was configured to permit generating an expression
823 /// that can match invalid UTF-8. (By default, this is disabled.)
824 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
825 /// syntax or in the parser builder. By default, Unicode mode is
826 /// enabled.
827 pub fn is_always_utf8(&self) -> bool {
828 match *self {
829 Class::Unicode(_) => true,
830 Class::Bytes(ref x) => x.is_all_ascii(),
831 }
832 }
833}
834
835/// A set of characters represented by Unicode scalar values.
836#[derive(Clone, Debug, Eq, PartialEq)]
837pub struct ClassUnicode {
838 set: IntervalSet<ClassUnicodeRange>,
839}
840
841impl ClassUnicode {
842 /// Create a new class from a sequence of ranges.
843 ///
844 /// The given ranges do not need to be in any specific order, and ranges
845 /// may overlap.
846 pub fn new<I>(ranges: I) -> ClassUnicode
847 where
848 I: IntoIterator<Item = ClassUnicodeRange>,
849 {
850 ClassUnicode { set: IntervalSet::new(ranges) }
851 }
852
853 /// Create a new class with no ranges.
854 pub fn empty() -> ClassUnicode {
855 ClassUnicode::new(vec![])
856 }
857
858 /// Add a new range to this set.
859 pub fn push(&mut self, range: ClassUnicodeRange) {
860 self.set.push(range);
861 }
862
863 /// Return an iterator over all ranges in this class.
864 ///
865 /// The iterator yields ranges in ascending order.
866 pub fn iter(&self) -> ClassUnicodeIter<'_> {
867 ClassUnicodeIter(self.set.iter())
868 }
869
870 /// Return the underlying ranges as a slice.
871 pub fn ranges(&self) -> &[ClassUnicodeRange] {
872 self.set.intervals()
873 }
874
875 /// Expand this character class such that it contains all case folded
876 /// characters, according to Unicode's "simple" mapping. For example, if
877 /// this class consists of the range `a-z`, then applying case folding will
878 /// result in the class containing both the ranges `a-z` and `A-Z`.
879 ///
880 /// # Panics
881 ///
882 /// This routine panics when the case mapping data necessary for this
883 /// routine to complete is unavailable. This occurs when the `unicode-case`
884 /// feature is not enabled.
885 ///
886 /// Callers should prefer using `try_case_fold_simple` instead, which will
887 /// return an error instead of panicking.
888 pub fn case_fold_simple(&mut self) {
889 self.set
890 .case_fold_simple()
891 .expect("unicode-case feature must be enabled");
892 }
893
894 /// Expand this character class such that it contains all case folded
895 /// characters, according to Unicode's "simple" mapping. For example, if
896 /// this class consists of the range `a-z`, then applying case folding will
897 /// result in the class containing both the ranges `a-z` and `A-Z`.
898 ///
899 /// # Error
900 ///
901 /// This routine returns an error when the case mapping data necessary
902 /// for this routine to complete is unavailable. This occurs when the
903 /// `unicode-case` feature is not enabled.
904 pub fn try_case_fold_simple(
905 &mut self,
906 ) -> result::Result<(), CaseFoldError> {
907 self.set.case_fold_simple()
908 }
909
910 /// Negate this character class.
911 ///
912 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
913 /// set, then it will not be in this set after negation.
914 pub fn negate(&mut self) {
915 self.set.negate();
916 }
917
918 /// Union this character class with the given character class, in place.
919 pub fn union(&mut self, other: &ClassUnicode) {
920 self.set.union(&other.set);
921 }
922
923 /// Intersect this character class with the given character class, in
924 /// place.
925 pub fn intersect(&mut self, other: &ClassUnicode) {
926 self.set.intersect(&other.set);
927 }
928
929 /// Subtract the given character class from this character class, in place.
930 pub fn difference(&mut self, other: &ClassUnicode) {
931 self.set.difference(&other.set);
932 }
933
934 /// Compute the symmetric difference of the given character classes, in
935 /// place.
936 ///
937 /// This computes the symmetric difference of two character classes. This
938 /// removes all elements in this class that are also in the given class,
939 /// but all adds all elements from the given class that aren't in this
940 /// class. That is, the class will contain all elements in either class,
941 /// but will not contain any elements that are in both classes.
942 pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
943 self.set.symmetric_difference(&other.set);
944 }
945
946 /// Returns true if and only if this character class will either match
947 /// nothing or only ASCII bytes. Stated differently, this returns false
948 /// if and only if this class contains a non-ASCII codepoint.
949 pub fn is_all_ascii(&self) -> bool {
950 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
951 }
952}
953
/// An iterator over all ranges in a Unicode character class.
///
/// This is a thin wrapper around the iterator of the underlying interval
/// set.
///
/// The lifetime `'a` refers to the lifetime of the underlying class.
#[derive(Debug)]
pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
959
960impl<'a> Iterator for ClassUnicodeIter<'a> {
961 type Item = &'a ClassUnicodeRange;
962
963 fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
964 self.0.next()
965 }
966}
967
/// A single range of characters represented by Unicode scalar values.
///
/// Both endpoints belong to the range, i.e., the range is closed.
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
pub struct ClassUnicodeRange {
    /// The first character in the range (inclusive).
    start: char,
    /// The last character in the range (inclusive).
    end: char,
}

impl fmt::Debug for ClassUnicodeRange {
    /// Format the range as a `ClassUnicodeRange` struct whose endpoints are
    /// shown as strings.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Whitespace and control characters are shown as an uppercase
        // hexadecimal codepoint (e.g. `0xA`); any other character is shown
        // as itself.
        let render = |c: char| -> String {
            if c.is_whitespace() || c.is_control() {
                format!("0x{:X}", c as u32)
            } else {
                c.to_string()
            }
        };
        let (start, end) = (render(self.start), render(self.end));
        f.debug_struct("ClassUnicodeRange")
            .field("start", &start)
            .field("end", &end)
            .finish()
    }
}
997
998impl Interval for ClassUnicodeRange {
999 type Bound = char;
1000
1001 #[inline]
1002 fn lower(&self) -> char {
1003 self.start
1004 }
1005 #[inline]
1006 fn upper(&self) -> char {
1007 self.end
1008 }
1009 #[inline]
1010 fn set_lower(&mut self, bound: char) {
1011 self.start = bound;
1012 }
1013 #[inline]
1014 fn set_upper(&mut self, bound: char) {
1015 self.end = bound;
1016 }
1017
1018 /// Apply simple case folding to this Unicode scalar value range.
1019 ///
1020 /// Additional ranges are appended to the given vector. Canonical ordering
1021 /// is *not* maintained in the given vector.
1022 fn case_fold_simple(
1023 &self,
1024 ranges: &mut Vec<ClassUnicodeRange>,
1025 ) -> Result<(), unicode::CaseFoldError> {
1026 if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1027 return Ok(());
1028 }
1029 let start = self.start as u32;
1030 let end = (self.end as u32).saturating_add(1);
1031 let mut next_simple_cp = None;
1032 for cp in (start..end).filter_map(char::from_u32) {
1033 if next_simple_cp.map_or(false, |next| cp < next) {
1034 continue;
1035 }
1036 let it = match unicode::simple_fold(cp)? {
1037 Ok(it) => it,
1038 Err(next) => {
1039 next_simple_cp = next;
1040 continue;
1041 }
1042 };
1043 for cp_folded in it {
1044 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1045 }
1046 }
1047 Ok(())
1048 }
1049}
1050
1051impl ClassUnicodeRange {
1052 /// Create a new Unicode scalar value range for a character class.
1053 ///
1054 /// The returned range is always in a canonical form. That is, the range
1055 /// returned always satisfies the invariant that `start <= end`.
1056 pub fn new(start: char, end: char) -> ClassUnicodeRange {
1057 ClassUnicodeRange::create(start, end)
1058 }
1059
1060 /// Return the start of this range.
1061 ///
1062 /// The start of a range is always less than or equal to the end of the
1063 /// range.
1064 pub fn start(&self) -> char {
1065 self.start
1066 }
1067
1068 /// Return the end of this range.
1069 ///
1070 /// The end of a range is always greater than or equal to the start of the
1071 /// range.
1072 pub fn end(&self) -> char {
1073 self.end
1074 }
1075}
1076
/// A set of characters represented by arbitrary bytes (where one byte
/// corresponds to one character).
///
/// The bytes are stored as an `IntervalSet` of `ClassBytesRange` values;
/// every operation on this type is forwarded to that set.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassBytes {
    /// The byte ranges that make up this class.
    set: IntervalSet<ClassBytesRange>,
}
1083
1084impl ClassBytes {
1085 /// Create a new class from a sequence of ranges.
1086 ///
1087 /// The given ranges do not need to be in any specific order, and ranges
1088 /// may overlap.
1089 pub fn new<I>(ranges: I) -> ClassBytes
1090 where
1091 I: IntoIterator<Item = ClassBytesRange>,
1092 {
1093 ClassBytes { set: IntervalSet::new(ranges) }
1094 }
1095
1096 /// Create a new class with no ranges.
1097 pub fn empty() -> ClassBytes {
1098 ClassBytes::new(vec![])
1099 }
1100
1101 /// Add a new range to this set.
1102 pub fn push(&mut self, range: ClassBytesRange) {
1103 self.set.push(range);
1104 }
1105
1106 /// Return an iterator over all ranges in this class.
1107 ///
1108 /// The iterator yields ranges in ascending order.
1109 pub fn iter(&self) -> ClassBytesIter<'_> {
1110 ClassBytesIter(self.set.iter())
1111 }
1112
1113 /// Return the underlying ranges as a slice.
1114 pub fn ranges(&self) -> &[ClassBytesRange] {
1115 self.set.intervals()
1116 }
1117
1118 /// Expand this character class such that it contains all case folded
1119 /// characters. For example, if this class consists of the range `a-z`,
1120 /// then applying case folding will result in the class containing both the
1121 /// ranges `a-z` and `A-Z`.
1122 ///
1123 /// Note that this only applies ASCII case folding, which is limited to the
1124 /// characters `a-z` and `A-Z`.
1125 pub fn case_fold_simple(&mut self) {
1126 self.set.case_fold_simple().expect("ASCII case folding never fails");
1127 }
1128
1129 /// Negate this byte class.
1130 ///
1131 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1132 /// will not be in this set after negation.
1133 pub fn negate(&mut self) {
1134 self.set.negate();
1135 }
1136
1137 /// Union this byte class with the given byte class, in place.
1138 pub fn union(&mut self, other: &ClassBytes) {
1139 self.set.union(&other.set);
1140 }
1141
1142 /// Intersect this byte class with the given byte class, in place.
1143 pub fn intersect(&mut self, other: &ClassBytes) {
1144 self.set.intersect(&other.set);
1145 }
1146
1147 /// Subtract the given byte class from this byte class, in place.
1148 pub fn difference(&mut self, other: &ClassBytes) {
1149 self.set.difference(&other.set);
1150 }
1151
1152 /// Compute the symmetric difference of the given byte classes, in place.
1153 ///
1154 /// This computes the symmetric difference of two byte classes. This
1155 /// removes all elements in this class that are also in the given class,
1156 /// but all adds all elements from the given class that aren't in this
1157 /// class. That is, the class will contain all elements in either class,
1158 /// but will not contain any elements that are in both classes.
1159 pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1160 self.set.symmetric_difference(&other.set);
1161 }
1162
1163 /// Returns true if and only if this character class will either match
1164 /// nothing or only ASCII bytes. Stated differently, this returns false
1165 /// if and only if this class contains a non-ASCII byte.
1166 pub fn is_all_ascii(&self) -> bool {
1167 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1168 }
1169}
1170
/// An iterator over all ranges in a byte character class.
///
/// This is a thin wrapper around the iterator of the underlying interval
/// set.
///
/// The lifetime `'a` refers to the lifetime of the underlying class.
#[derive(Debug)]
pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1176
1177impl<'a> Iterator for ClassBytesIter<'a> {
1178 type Item = &'a ClassBytesRange;
1179
1180 fn next(&mut self) -> Option<&'a ClassBytesRange> {
1181 self.0.next()
1182 }
1183}
1184
/// A single range of characters represented by arbitrary bytes.
///
/// The range is closed. That is, the start and end of the range are included
/// in the range.
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
pub struct ClassBytesRange {
    /// The first byte in the range (inclusive).
    start: u8,
    /// The last byte in the range (inclusive).
    end: u8,
}
1194
1195impl Interval for ClassBytesRange {
1196 type Bound = u8;
1197
1198 #[inline]
1199 fn lower(&self) -> u8 {
1200 self.start
1201 }
1202 #[inline]
1203 fn upper(&self) -> u8 {
1204 self.end
1205 }
1206 #[inline]
1207 fn set_lower(&mut self, bound: u8) {
1208 self.start = bound;
1209 }
1210 #[inline]
1211 fn set_upper(&mut self, bound: u8) {
1212 self.end = bound;
1213 }
1214
1215 /// Apply simple case folding to this byte range. Only ASCII case mappings
1216 /// (for a-z) are applied.
1217 ///
1218 /// Additional ranges are appended to the given vector. Canonical ordering
1219 /// is *not* maintained in the given vector.
1220 fn case_fold_simple(
1221 &self,
1222 ranges: &mut Vec<ClassBytesRange>,
1223 ) -> Result<(), unicode::CaseFoldError> {
1224 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1225 let lower = cmp::max(self.start, b'a');
1226 let upper = cmp::min(self.end, b'z');
1227 ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1228 }
1229 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1230 let lower = cmp::max(self.start, b'A');
1231 let upper = cmp::min(self.end, b'Z');
1232 ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1233 }
1234 Ok(())
1235 }
1236}
1237
1238impl ClassBytesRange {
1239 /// Create a new byte range for a character class.
1240 ///
1241 /// The returned range is always in a canonical form. That is, the range
1242 /// returned always satisfies the invariant that `start <= end`.
1243 pub fn new(start: u8, end: u8) -> ClassBytesRange {
1244 ClassBytesRange::create(start, end)
1245 }
1246
1247 /// Return the start of this range.
1248 ///
1249 /// The start of a range is always less than or equal to the end of the
1250 /// range.
1251 pub fn start(&self) -> u8 {
1252 self.start
1253 }
1254
1255 /// Return the end of this range.
1256 ///
1257 /// The end of a range is always greater than or equal to the start of the
1258 /// range.
1259 pub fn end(&self) -> u8 {
1260 self.end
1261 }
1262}
1263
1264impl fmt::Debug for ClassBytesRange {
1265 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1266 let mut debug = f.debug_struct("ClassBytesRange");
1267 if self.start <= 0x7F {
1268 debug.field("start", &(self.start as char));
1269 } else {
1270 debug.field("start", &self.start);
1271 }
1272 if self.end <= 0x7F {
1273 debug.field("end", &(self.end as char));
1274 } else {
1275 debug.field("end", &self.end);
1276 }
1277 debug.finish()
1278 }
1279}
1280
/// The high-level intermediate representation for an anchor assertion.
///
/// A matching anchor assertion is always zero-length.
///
/// Two of the variants are line-oriented (`StartLine`, `EndLine`) and two
/// are text-oriented (`StartText`, `EndText`).
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Anchor {
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following a `\n` character.
    StartLine,
    /// Match the end of a line or the end of text. Specifically,
    /// this matches at the end position of the input, or at the position
    /// immediately preceding a `\n` character.
    EndLine,
    /// Match the beginning of text. Specifically, this matches at the starting
    /// position of the input.
    StartText,
    /// Match the end of text. Specifically, this matches at the ending
    /// position of the input.
    EndText,
}
1301
/// The high-level intermediate representation for a word-boundary assertion.
///
/// A word boundary assertion that matches never consumes any input, i.e.,
/// it is always zero-length.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum WordBoundary {
    /// A Unicode-aware word boundary. It matches at a position whose left
    /// and right adjacent characters are a word character and a non-word
    /// character, in either order.
    Unicode,
    /// The negation of a Unicode-aware word boundary.
    UnicodeNegate,
    /// An ASCII-only word boundary. It matches at a position whose left
    /// and right adjacent characters are a word character and a non-word
    /// character, in either order.
    Ascii,
    /// The negation of an ASCII-only word boundary.
    AsciiNegate,
}

impl WordBoundary {
    /// Returns true if and only if this is one of the negated word boundary
    /// assertions (`UnicodeNegate` or `AsciiNegate`).
    pub fn is_negated(&self) -> bool {
        match *self {
            WordBoundary::UnicodeNegate => true,
            WordBoundary::AsciiNegate => true,
            WordBoundary::Unicode => false,
            WordBoundary::Ascii => false,
        }
    }
}
1330
/// The high-level intermediate representation for a group.
///
/// This represents one of three possible group types:
///
/// 1. A non-capturing group (e.g., `(?:expr)`).
/// 2. A capturing group (e.g., `(expr)`).
/// 3. A named capturing group (e.g., `(?P<name>expr)`).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Group {
    /// The kind of this group. If it is a capturing group, then the kind
    /// contains the capture group index (and the name, if it is a named
    /// group).
    pub kind: GroupKind,
    /// The expression inside the group (capturing or not), which may be
    /// empty.
    pub hir: Box<Hir>,
}
1347
/// The kind of group.
///
/// Both capturing variants carry the capture index of the group; only the
/// named variant additionally carries a name.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GroupKind {
    /// A normal unnamed capturing group.
    ///
    /// The value is the capture index of the group.
    CaptureIndex(u32),
    /// A named capturing group.
    CaptureName {
        /// The name of the group.
        name: String,
        /// The capture index of the group.
        index: u32,
    },
    /// A non-capturing group.
    NonCapturing,
}
1365
/// The high-level intermediate representation of a repetition operator.
///
/// A repetition operator permits the repetition of an arbitrary
/// sub-expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Repetition {
    /// The kind of this repetition operator, i.e., how many times the
    /// sub-expression may be repeated.
    pub kind: RepetitionKind,
    /// Whether this repetition operator is greedy or not. A greedy operator
    /// will match as much as it can. A non-greedy operator will match as
    /// little as it can.
    ///
    /// Typically, operators are greedy by default and are only non-greedy when
    /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
    /// not. However, this can be inverted via the `U` "ungreedy" flag.
    pub greedy: bool,
    /// The expression being repeated.
    pub hir: Box<Hir>,
}
1385
1386impl Repetition {
1387 /// Returns true if and only if this repetition operator makes it possible
1388 /// to match the empty string.
1389 ///
1390 /// Note that this is not defined inductively. For example, while `a*`
1391 /// will report `true`, `()+` will not, even though `()` matches the empty
1392 /// string and one or more occurrences of something that matches the empty
1393 /// string will always match the empty string. In order to get the
1394 /// inductive definition, see the corresponding method on
1395 /// [`Hir`](struct.Hir.html).
1396 pub fn is_match_empty(&self) -> bool {
1397 match self.kind {
1398 RepetitionKind::ZeroOrOne => true,
1399 RepetitionKind::ZeroOrMore => true,
1400 RepetitionKind::OneOrMore => false,
1401 RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1402 RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1403 RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1404 }
1405 }
1406}
1407
/// The kind of a repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionKind {
    /// Matches a sub-expression zero or one times.
    ZeroOrOne,
    /// Matches a sub-expression zero or more times.
    ZeroOrMore,
    /// Matches a sub-expression one or more times.
    OneOrMore,
    /// Matches a sub-expression a counted number of times, as described by
    /// the given `RepetitionRange` (an exact count, a minimum count, or a
    /// minimum and maximum count).
    Range(RepetitionRange),
}
1420
/// The kind of a counted repetition operator.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum RepetitionRange {
    /// Matches a sub-expression exactly this many times.
    Exactly(u32),
    /// Matches a sub-expression at least this many times.
    AtLeast(u32),
    /// Matches a sub-expression at least `m` times and at most `n` times,
    /// where `m` is the first field and `n` is the second.
    Bounded(u32, u32),
}
1431
/// A custom `Drop` impl is used for `Hir` such that it uses constant stack
/// space but heap space proportional to the depth of the total `Hir`.
impl Drop for Hir {
    fn drop(&mut self) {
        use std::mem;

        // Fast path: return immediately for expressions that have no
        // children, or whose only child is reported by `has_subexprs()` as
        // having none itself (presumably "has no nested expressions" --
        // the helper is defined outside this view). In these cases the
        // explicit stack below is not allocated.
        match *self.kind() {
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Anchor(_)
            | HirKind::WordBoundary(_) => return,
            HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
            HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
            HirKind::Concat(ref x) if x.is_empty() => return,
            HirKind::Alternation(ref x) if x.is_empty() => return,
            _ => {}
        }

        // Move this expression onto an explicit, heap-allocated stack,
        // leaving an empty `Hir` behind in `self`.
        let mut stack = vec![mem::replace(self, Hir::empty())];
        while let Some(mut expr) = stack.pop() {
            // Detach the children of `expr` and push them onto the stack.
            // `expr` itself goes out of scope at the end of this iteration
            // with its children already moved out.
            match expr.kind {
                HirKind::Empty
                | HirKind::Literal(_)
                | HirKind::Class(_)
                | HirKind::Anchor(_)
                | HirKind::WordBoundary(_) => {}
                HirKind::Group(ref mut x) => {
                    // Swap the child for an empty expression and queue it.
                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
                }
                HirKind::Repetition(ref mut x) => {
                    // Swap the child for an empty expression and queue it.
                    stack.push(mem::replace(&mut x.hir, Hir::empty()));
                }
                HirKind::Concat(ref mut x) => {
                    // Move every child out of the vector and queue it.
                    stack.extend(x.drain(..));
                }
                HirKind::Alternation(ref mut x) => {
                    // Move every child out of the vector and queue it.
                    stack.extend(x.drain(..));
                }
            }
        }
    }
}
1475
/// A type that documents various attributes of an HIR expression.
///
/// These attributes are typically defined inductively on the HIR.
#[derive(Clone, Debug, Eq, PartialEq)]
struct HirInfo {
    /// Yes/no attributes packed into a bitfield to conserve space, since
    /// one of these is included in every HIR expression.
    ///
    /// If more attributes need to be added, it is OK to widen this field as
    /// appropriate.
    bools: u16,
}

// Defines a getter/setter pair for one bit of `HirInfo::bools`.
//
// `$bit` is the zero-based bit position, `$is_fn_name` is the name of the
// generated getter and `$set_fn_name` is the name of the generated setter.
macro_rules! define_bool {
    ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
        fn $is_fn_name(&self) -> bool {
            ((self.bools >> $bit) & 1) == 1
        }

        fn $set_fn_name(&mut self, yes: bool) {
            let mask: u16 = 1 << $bit;
            self.bools = if yes {
                self.bools | mask
            } else {
                self.bools & !mask
            };
        }
    };
}

impl HirInfo {
    /// Create a new set of attributes in which every attribute is `false`.
    fn new() -> HirInfo {
        HirInfo { bools: 0 }
    }

    // One bit per attribute; the first argument is the bit position.
    define_bool!(0, is_always_utf8, set_always_utf8);
    define_bool!(1, is_all_assertions, set_all_assertions);
    define_bool!(2, is_anchored_start, set_anchored_start);
    define_bool!(3, is_anchored_end, set_anchored_end);
    define_bool!(4, is_line_anchored_start, set_line_anchored_start);
    define_bool!(5, is_line_anchored_end, set_line_anchored_end);
    define_bool!(6, is_any_anchored_start, set_any_anchored_start);
    define_bool!(7, is_any_anchored_end, set_any_anchored_end);
    define_bool!(8, is_match_empty, set_match_empty);
    define_bool!(9, is_literal, set_literal);
    define_bool!(10, is_alternation_literal, set_alternation_literal);
}
1523
1524#[cfg(test)]
1525mod tests {
1526 use super::*;
1527
1528 fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1529 let ranges: Vec<ClassUnicodeRange> = ranges
1530 .iter()
1531 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1532 .collect();
1533 ClassUnicode::new(ranges)
1534 }
1535
1536 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1537 let ranges: Vec<ClassBytesRange> =
1538 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
1539 ClassBytes::new(ranges)
1540 }
1541
1542 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1543 cls.iter().map(|x| (x.start(), x.end())).collect()
1544 }
1545
1546 #[cfg(feature = "unicode-case")]
1547 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1548 let mut cls_ = cls.clone();
1549 cls_.case_fold_simple();
1550 cls_
1551 }
1552
1553 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1554 let mut cls_ = cls1.clone();
1555 cls_.union(cls2);
1556 cls_
1557 }
1558
1559 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1560 let mut cls_ = cls1.clone();
1561 cls_.intersect(cls2);
1562 cls_
1563 }
1564
1565 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1566 let mut cls_ = cls1.clone();
1567 cls_.difference(cls2);
1568 cls_
1569 }
1570
1571 fn usymdifference(
1572 cls1: &ClassUnicode,
1573 cls2: &ClassUnicode,
1574 ) -> ClassUnicode {
1575 let mut cls_ = cls1.clone();
1576 cls_.symmetric_difference(cls2);
1577 cls_
1578 }
1579
1580 fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1581 let mut cls_ = cls.clone();
1582 cls_.negate();
1583 cls_
1584 }
1585
1586 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1587 cls.iter().map(|x| (x.start(), x.end())).collect()
1588 }
1589
1590 fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1591 let mut cls_ = cls.clone();
1592 cls_.case_fold_simple();
1593 cls_
1594 }
1595
1596 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1597 let mut cls_ = cls1.clone();
1598 cls_.union(cls2);
1599 cls_
1600 }
1601
1602 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1603 let mut cls_ = cls1.clone();
1604 cls_.intersect(cls2);
1605 cls_
1606 }
1607
1608 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1609 let mut cls_ = cls1.clone();
1610 cls_.difference(cls2);
1611 cls_
1612 }
1613
1614 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1615 let mut cls_ = cls1.clone();
1616 cls_.symmetric_difference(cls2);
1617 cls_
1618 }
1619
1620 fn bnegate(cls: &ClassBytes) -> ClassBytes {
1621 let mut cls_ = cls.clone();
1622 cls_.negate();
1623 cls_
1624 }
1625
1626 #[test]
1627 fn class_range_canonical_unicode() {
1628 let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1629 assert_eq!('\0', range.start());
1630 assert_eq!('\u{00FF}', range.end());
1631 }
1632
1633 #[test]
1634 fn class_range_canonical_bytes() {
1635 let range = ClassBytesRange::new(b'\xFF', b'\0');
1636 assert_eq!(b'\0', range.start());
1637 assert_eq!(b'\xFF', range.end());
1638 }
1639
1640 #[test]
1641 fn class_canonicalize_unicode() {
1642 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1643 let expected = vec![('a', 'c'), ('x', 'z')];
1644 assert_eq!(expected, uranges(&cls));
1645
1646 let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1647 let expected = vec![('a', 'c'), ('x', 'z')];
1648 assert_eq!(expected, uranges(&cls));
1649
1650 let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1651 let expected = vec![('w', 'z')];
1652 assert_eq!(expected, uranges(&cls));
1653
1654 let cls = uclass(&[
1655 ('c', 'f'),
1656 ('a', 'g'),
1657 ('d', 'j'),
1658 ('a', 'c'),
1659 ('m', 'p'),
1660 ('l', 's'),
1661 ]);
1662 let expected = vec![('a', 'j'), ('l', 's')];
1663 assert_eq!(expected, uranges(&cls));
1664
1665 let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1666 let expected = vec![('u', 'z')];
1667 assert_eq!(expected, uranges(&cls));
1668
1669 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1670 let expected = vec![('\x00', '\u{10FFFF}')];
1671 assert_eq!(expected, uranges(&cls));
1672
1673 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1674 let expected = vec![('a', 'b')];
1675 assert_eq!(expected, uranges(&cls));
1676 }
1677
1678 #[test]
1679 fn class_canonicalize_bytes() {
1680 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1681 let expected = vec![(b'a', b'c'), (b'x', b'z')];
1682 assert_eq!(expected, branges(&cls));
1683
1684 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1685 let expected = vec![(b'a', b'c'), (b'x', b'z')];
1686 assert_eq!(expected, branges(&cls));
1687
1688 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1689 let expected = vec![(b'w', b'z')];
1690 assert_eq!(expected, branges(&cls));
1691
1692 let cls = bclass(&[
1693 (b'c', b'f'),
1694 (b'a', b'g'),
1695 (b'd', b'j'),
1696 (b'a', b'c'),
1697 (b'm', b'p'),
1698 (b'l', b's'),
1699 ]);
1700 let expected = vec![(b'a', b'j'), (b'l', b's')];
1701 assert_eq!(expected, branges(&cls));
1702
1703 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1704 let expected = vec![(b'u', b'z')];
1705 assert_eq!(expected, branges(&cls));
1706
1707 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1708 let expected = vec![(b'\x00', b'\xFF')];
1709 assert_eq!(expected, branges(&cls));
1710
1711 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1712 let expected = vec![(b'a', b'b')];
1713 assert_eq!(expected, branges(&cls));
1714 }
1715
1716 #[test]
1717 #[cfg(feature = "unicode-case")]
1718 fn class_case_fold_unicode() {
1719 let cls = uclass(&[
1720 ('C', 'F'),
1721 ('A', 'G'),
1722 ('D', 'J'),
1723 ('A', 'C'),
1724 ('M', 'P'),
1725 ('L', 'S'),
1726 ('c', 'f'),
1727 ]);
1728 let expected = uclass(&[
1729 ('A', 'J'),
1730 ('L', 'S'),
1731 ('a', 'j'),
1732 ('l', 's'),
1733 ('\u{17F}', '\u{17F}'),
1734 ]);
1735 assert_eq!(expected, ucasefold(&cls));
1736
1737 let cls = uclass(&[('A', 'Z')]);
1738 let expected = uclass(&[
1739 ('A', 'Z'),
1740 ('a', 'z'),
1741 ('\u{17F}', '\u{17F}'),
1742 ('\u{212A}', '\u{212A}'),
1743 ]);
1744 assert_eq!(expected, ucasefold(&cls));
1745
1746 let cls = uclass(&[('a', 'z')]);
1747 let expected = uclass(&[
1748 ('A', 'Z'),
1749 ('a', 'z'),
1750 ('\u{17F}', '\u{17F}'),
1751 ('\u{212A}', '\u{212A}'),
1752 ]);
1753 assert_eq!(expected, ucasefold(&cls));
1754
1755 let cls = uclass(&[('A', 'A'), ('_', '_')]);
1756 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1757 assert_eq!(expected, ucasefold(&cls));
1758
1759 let cls = uclass(&[('A', 'A'), ('=', '=')]);
1760 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1761 assert_eq!(expected, ucasefold(&cls));
1762
1763 let cls = uclass(&[('\x00', '\x10')]);
1764 assert_eq!(cls, ucasefold(&cls));
1765
1766 let cls = uclass(&[('k', 'k')]);
1767 let expected =
1768 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
1769 assert_eq!(expected, ucasefold(&cls));
1770
1771 let cls = uclass(&[('@', '@')]);
1772 assert_eq!(cls, ucasefold(&cls));
1773 }
1774
1775 #[test]
1776 #[cfg(not(feature = "unicode-case"))]
1777 fn class_case_fold_unicode_disabled() {
1778 let mut cls = uclass(&[
1779 ('C', 'F'),
1780 ('A', 'G'),
1781 ('D', 'J'),
1782 ('A', 'C'),
1783 ('M', 'P'),
1784 ('L', 'S'),
1785 ('c', 'f'),
1786 ]);
1787 assert!(cls.try_case_fold_simple().is_err());
1788 }
1789
1790 #[test]
1791 #[should_panic]
1792 #[cfg(not(feature = "unicode-case"))]
1793 fn class_case_fold_unicode_disabled_panics() {
1794 let mut cls = uclass(&[
1795 ('C', 'F'),
1796 ('A', 'G'),
1797 ('D', 'J'),
1798 ('A', 'C'),
1799 ('M', 'P'),
1800 ('L', 'S'),
1801 ('c', 'f'),
1802 ]);
1803 cls.case_fold_simple();
1804 }
1805
1806 #[test]
1807 fn class_case_fold_bytes() {
1808 let cls = bclass(&[
1809 (b'C', b'F'),
1810 (b'A', b'G'),
1811 (b'D', b'J'),
1812 (b'A', b'C'),
1813 (b'M', b'P'),
1814 (b'L', b'S'),
1815 (b'c', b'f'),
1816 ]);
1817 let expected =
1818 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1819 assert_eq!(expected, bcasefold(&cls));
1820
1821 let cls = bclass(&[(b'A', b'Z')]);
1822 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1823 assert_eq!(expected, bcasefold(&cls));
1824
1825 let cls = bclass(&[(b'a', b'z')]);
1826 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1827 assert_eq!(expected, bcasefold(&cls));
1828
1829 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1830 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1831 assert_eq!(expected, bcasefold(&cls));
1832
1833 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1834 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1835 assert_eq!(expected, bcasefold(&cls));
1836
1837 let cls = bclass(&[(b'\x00', b'\x10')]);
1838 assert_eq!(cls, bcasefold(&cls));
1839
1840 let cls = bclass(&[(b'k', b'k')]);
1841 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1842 assert_eq!(expected, bcasefold(&cls));
1843
1844 let cls = bclass(&[(b'@', b'@')]);
1845 assert_eq!(cls, bcasefold(&cls));
1846 }
1847
// Runs `unegate` over a table of Unicode classes and compares each result
// with the class built from the expected ranges.
//
// The rows cover a single scalar, adjacent scalars, several ranges, ranges
// touching either end of the scalar value space, the full and empty classes,
// and ranges bordering U+D7FF/U+E000, where the expected result skips the
// surrogate code points (which are not valid `char` values).
#[test]
fn class_negate_unicode() {
    // Each row is `(input ranges, ranges expected from unegate)`.
    let cases: &[(&[(char, char)], &[(char, char)])] = &[
        (&[('a', 'a')], &[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]),
        (
            &[('a', 'a'), ('b', 'b')],
            &[('\x00', '\x60'), ('\x63', '\u{10FFFF}')],
        ),
        (
            &[('a', 'c'), ('x', 'z')],
            &[('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}')],
        ),
        (&[('\x00', 'a')], &[('\x62', '\u{10FFFF}')]),
        (&[('a', '\u{10FFFF}')], &[('\x00', '\x60')]),
        (&[('\x00', '\u{10FFFF}')], &[]),
        (&[], &[('\x00', '\u{10FFFF}')]),
        (
            &[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')],
            &[('\u{10FFFE}', '\u{10FFFE}')],
        ),
        // Rows on either side of the surrogate gap.
        (&[('\x00', '\u{D7FF}')], &[('\u{E000}', '\u{10FFFF}')]),
        (&[('\x00', '\u{D7FE}')], &[('\u{D7FF}', '\u{10FFFF}')]),
        (&[('\u{E000}', '\u{10FFFF}')], &[('\x00', '\u{D7FF}')]),
        (&[('\u{E001}', '\u{10FFFF}')], &[('\x00', '\u{E000}')]),
    ];

    for &(input, want) in cases {
        let cls = uclass(input);
        let expected = uclass(want);
        assert_eq!(expected, unegate(&cls));
    }
}
1903
// Runs `bnegate` over a table of byte classes and compares each result with
// the class built from the expected ranges.
//
// The rows cover a single byte, adjacent bytes, several ranges, ranges
// touching 0x00 or 0xFF, and the full and empty classes.
#[test]
fn class_negate_bytes() {
    // Each row is `(input ranges, ranges expected from bnegate)`.
    let cases: &[(&[(u8, u8)], &[(u8, u8)])] = &[
        (&[(b'a', b'a')], &[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]),
        (
            &[(b'a', b'a'), (b'b', b'b')],
            &[(b'\x00', b'\x60'), (b'\x63', b'\xFF')],
        ),
        (
            &[(b'a', b'c'), (b'x', b'z')],
            &[(b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF')],
        ),
        (&[(b'\x00', b'a')], &[(b'\x62', b'\xFF')]),
        (&[(b'a', b'\xFF')], &[(b'\x00', b'\x60')]),
        (&[(b'\x00', b'\xFF')], &[]),
        (&[], &[(b'\x00', b'\xFF')]),
        (
            &[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')],
            &[(b'\xFE', b'\xFE')],
        ),
    ];

    for &(input, want) in cases {
        let cls = bclass(input);
        let expected = bclass(want);
        assert_eq!(expected, bnegate(&cls));
    }
}
1942
// `uunion` of a class holding `a-g`, `m-t` and `A-C` with the class `a-z`
// must equal the class built from `a-z` and `A-C`.
#[test]
fn class_union_unicode() {
    let lhs = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
    let rhs = uclass(&[('a', 'z')]);
    let merged = uunion(&lhs, &rhs);
    assert_eq!(uclass(&[('a', 'z'), ('A', 'C')]), merged);
}
1950
// `bunion` of a class holding `a-g`, `m-t` and `A-C` with the class `a-z`
// must equal the class built from `a-z` and `A-C`.
#[test]
fn class_union_bytes() {
    let lhs = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
    let rhs = bclass(&[(b'a', b'z')]);
    let merged = bunion(&lhs, &rhs);
    assert_eq!(bclass(&[(b'a', b'z'), (b'A', b'C')]), merged);
}
1958
// Exercises `uintersect` on pairs of Unicode classes: an empty operand,
// identical classes, disjoint classes, partial overlaps, one class covering
// the other, and multi-range classes that interleave or touch.
#[test]
fn class_intersect_unicode() {
    // Builds both operands and the expected class from raw ranges, then
    // asserts that `uintersect` yields exactly the expected class.
    fn check(
        lhs: &[(char, char)],
        rhs: &[(char, char)],
        want: &[(char, char)],
    ) {
        let cls1 = uclass(lhs);
        let cls2 = uclass(rhs);
        let expected = uclass(want);
        assert_eq!(expected, uintersect(&cls1, &cls2));
    }

    // Single-range operands.
    check(&[], &[('a', 'a')], &[]);
    check(&[('a', 'a')], &[('a', 'a')], &[('a', 'a')]);
    check(&[('a', 'a')], &[('b', 'b')], &[]);
    check(&[('a', 'a')], &[('a', 'c')], &[('a', 'a')]);
    check(&[('a', 'b')], &[('a', 'c')], &[('a', 'b')]);
    check(&[('a', 'b')], &[('b', 'c')], &[('b', 'b')]);
    check(&[('a', 'b')], &[('c', 'd')], &[]);
    check(&[('b', 'c')], &[('a', 'd')], &[('b', 'c')]);

    // Multi-range operands.
    check(
        &[('a', 'b'), ('d', 'e'), ('g', 'h')],
        &[('a', 'h')],
        &[('a', 'b'), ('d', 'e'), ('g', 'h')],
    );
    check(
        &[('a', 'b'), ('d', 'e'), ('g', 'h')],
        &[('a', 'b'), ('d', 'e'), ('g', 'h')],
        &[('a', 'b'), ('d', 'e'), ('g', 'h')],
    );
    check(&[('a', 'b'), ('g', 'h')], &[('d', 'e'), ('k', 'l')], &[]);
    check(
        &[('a', 'b'), ('d', 'e'), ('g', 'h')],
        &[('h', 'h')],
        &[('h', 'h')],
    );
    check(
        &[('a', 'b'), ('e', 'f'), ('i', 'j')],
        &[('c', 'd'), ('g', 'h'), ('k', 'l')],
        &[],
    );
    check(
        &[('a', 'b'), ('c', 'd'), ('e', 'f')],
        &[('b', 'c'), ('d', 'e'), ('f', 'g')],
        &[('b', 'f')],
    );
}
2031
// Exercises `bintersect` on pairs of byte classes: an empty operand,
// identical classes, disjoint classes, partial overlaps, one class covering
// the other, and multi-range classes that interleave or touch.
#[test]
fn class_intersect_bytes() {
    // Builds both operands and the expected class from raw ranges, then
    // asserts that `bintersect` yields exactly the expected class.
    fn check(lhs: &[(u8, u8)], rhs: &[(u8, u8)], want: &[(u8, u8)]) {
        let cls1 = bclass(lhs);
        let cls2 = bclass(rhs);
        let expected = bclass(want);
        assert_eq!(expected, bintersect(&cls1, &cls2));
    }

    // Single-range operands.
    check(&[], &[(b'a', b'a')], &[]);
    check(&[(b'a', b'a')], &[(b'a', b'a')], &[(b'a', b'a')]);
    check(&[(b'a', b'a')], &[(b'b', b'b')], &[]);
    check(&[(b'a', b'a')], &[(b'a', b'c')], &[(b'a', b'a')]);
    check(&[(b'a', b'b')], &[(b'a', b'c')], &[(b'a', b'b')]);
    check(&[(b'a', b'b')], &[(b'b', b'c')], &[(b'b', b'b')]);
    check(&[(b'a', b'b')], &[(b'c', b'd')], &[]);
    check(&[(b'b', b'c')], &[(b'a', b'd')], &[(b'b', b'c')]);

    // Multi-range operands.
    check(
        &[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')],
        &[(b'a', b'h')],
        &[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')],
    );
    check(
        &[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')],
        &[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')],
        &[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')],
    );
    check(
        &[(b'a', b'b'), (b'g', b'h')],
        &[(b'd', b'e'), (b'k', b'l')],
        &[],
    );
    check(
        &[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')],
        &[(b'h', b'h')],
        &[(b'h', b'h')],
    );
    check(
        &[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')],
        &[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')],
        &[],
    );
    check(
        &[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')],
        &[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')],
        &[(b'b', b'f')],
    );
}
2104
// Runs `udifference(&cls1, &cls2)` over a table of Unicode class pairs and
// compares each result with the class built from the expected ranges.
//
// The rows cover identical and empty operands, removing a scalar from the
// start, end or middle of a range, a second operand that covers the first,
// and second operands that clip, split or miss the ranges of the first.
#[test]
fn class_difference_unicode() {
    // Each row is `(cls1 ranges, cls2 ranges, expected ranges)`.
    let cases: &[(&[(char, char)], &[(char, char)], &[(char, char)])] = &[
        (&[('a', 'a')], &[('a', 'a')], &[]),
        (&[('a', 'a')], &[], &[('a', 'a')]),
        (&[], &[('a', 'a')], &[]),
        (&[('a', 'z')], &[('a', 'a')], &[('b', 'z')]),
        (&[('a', 'z')], &[('z', 'z')], &[('a', 'y')]),
        (&[('a', 'z')], &[('m', 'm')], &[('a', 'l'), ('n', 'z')]),
        (&[('a', 'c'), ('g', 'i'), ('r', 't')], &[('a', 'z')], &[]),
        (
            &[('a', 'c'), ('g', 'i'), ('r', 't')],
            &[('d', 'v')],
            &[('a', 'c')],
        ),
        (
            &[('a', 'c'), ('g', 'i'), ('r', 't')],
            &[('b', 'g'), ('s', 'u')],
            &[('a', 'a'), ('h', 'i'), ('r', 'r')],
        ),
        (
            &[('a', 'c'), ('g', 'i'), ('r', 't')],
            &[('b', 'd'), ('e', 'g'), ('s', 'u')],
            &[('a', 'a'), ('h', 'i'), ('r', 'r')],
        ),
        (
            &[('x', 'z')],
            &[('a', 'c'), ('e', 'g'), ('s', 'u')],
            &[('x', 'z')],
        ),
        (
            &[('a', 'z')],
            &[('a', 'c'), ('e', 'g'), ('s', 'u')],
            &[('d', 'd'), ('h', 'r'), ('v', 'z')],
        ),
    ];

    for &(lhs, rhs, want) in cases {
        let cls1 = uclass(lhs);
        let cls2 = uclass(rhs);
        let expected = uclass(want);
        assert_eq!(expected, udifference(&cls1, &cls2));
    }
}
2167
// Runs `bdifference(&cls1, &cls2)` over a table of byte class pairs and
// compares each result with the class built from the expected ranges.
//
// The rows cover identical and empty operands, removing a byte from the
// start, end or middle of a range, a second operand that covers the first,
// and second operands that clip, split or miss the ranges of the first.
#[test]
fn class_difference_bytes() {
    // Each row is `(cls1 ranges, cls2 ranges, expected ranges)`.
    let cases: &[(&[(u8, u8)], &[(u8, u8)], &[(u8, u8)])] = &[
        (&[(b'a', b'a')], &[(b'a', b'a')], &[]),
        (&[(b'a', b'a')], &[], &[(b'a', b'a')]),
        (&[], &[(b'a', b'a')], &[]),
        (&[(b'a', b'z')], &[(b'a', b'a')], &[(b'b', b'z')]),
        (&[(b'a', b'z')], &[(b'z', b'z')], &[(b'a', b'y')]),
        (
            &[(b'a', b'z')],
            &[(b'm', b'm')],
            &[(b'a', b'l'), (b'n', b'z')],
        ),
        (
            &[(b'a', b'c'), (b'g', b'i'), (b'r', b't')],
            &[(b'a', b'z')],
            &[],
        ),
        (
            &[(b'a', b'c'), (b'g', b'i'), (b'r', b't')],
            &[(b'd', b'v')],
            &[(b'a', b'c')],
        ),
        (
            &[(b'a', b'c'), (b'g', b'i'), (b'r', b't')],
            &[(b'b', b'g'), (b's', b'u')],
            &[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')],
        ),
        (
            &[(b'a', b'c'), (b'g', b'i'), (b'r', b't')],
            &[(b'b', b'd'), (b'e', b'g'), (b's', b'u')],
            &[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')],
        ),
        (
            &[(b'x', b'z')],
            &[(b'a', b'c'), (b'e', b'g'), (b's', b'u')],
            &[(b'x', b'z')],
        ),
        (
            &[(b'a', b'z')],
            &[(b'a', b'c'), (b'e', b'g'), (b's', b'u')],
            &[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')],
        ),
    ];

    for &(lhs, rhs, want) in cases {
        let cls1 = bclass(lhs);
        let cls2 = bclass(rhs);
        let expected = bclass(want);
        assert_eq!(expected, bdifference(&cls1, &cls2));
    }
}
2230
// For the overlapping classes `a-m` and `g-t`, `usymdifference` must equal
// the class built from `a-f` and `n-t`, i.e. the shared `g-m` is absent.
#[test]
fn class_symmetric_difference_unicode() {
    let lhs = uclass(&[('a', 'm')]);
    let rhs = uclass(&[('g', 't')]);
    let outcome = usymdifference(&lhs, &rhs);
    assert_eq!(uclass(&[('a', 'f'), ('n', 't')]), outcome);
}
2238
// For the overlapping classes `a-m` and `g-t`, `bsymdifference` must equal
// the class built from `a-f` and `n-t`, i.e. the shared `g-m` is absent.
#[test]
fn class_symmetric_difference_bytes() {
    let lhs = bclass(&[(b'a', b'm')]);
    let rhs = bclass(&[(b'g', b't')]);
    let outcome = bsymdifference(&lhs, &rhs);
    assert_eq!(bclass(&[(b'a', b'f'), (b'n', b't')]), outcome);
}
2246
// Passes only if `Hir::literal` panics when handed `Literal::Byte(b'a')`.
//
// NOTE(review): presumably `Hir::literal` requires a `Literal::Byte` to hold
// a non-ASCII byte, which would explain the test name; confirm against the
// definition of `Hir::literal`.
#[test]
#[should_panic]
fn hir_byte_literal_non_ascii() {
    let ascii_byte = Literal::Byte(b'a');
    let _ = Hir::literal(ascii_byte);
}
2252
// Checks that the destructor for `Hir` copes with a deeply nested
// expression in constant stack space. The expression is built and dropped on
// a thread given an explicit stack size. Since some platforms may lack
// threads (WASM?), the test is only compiled for Windows/Unix.
#[test]
#[cfg(any(unix, windows))]
fn no_stack_overflow_on_drop() {
    use std::thread;

    // Stack size, in bytes, for the worker thread. A small stack is used so
    // the issue can be forced more easily.
    //
    // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
    // for context on the specific stack size chosen here.
    const STACK_SIZE: usize = 16 << 10;

    let build_then_drop = || {
        let mut expr = Hir::empty();
        for _ in 0..100 {
            // Each iteration wraps the previous expression in four more
            // layers: group, repetition, concat and alternation.
            let grouped = Hir::group(Group {
                kind: GroupKind::NonCapturing,
                hir: Box::new(expr),
            });
            let optional = Hir::repetition(Repetition {
                kind: RepetitionKind::ZeroOrOne,
                greedy: true,
                hir: Box::new(grouped),
            });

            let concat = Hir {
                kind: HirKind::Concat(vec![optional]),
                info: HirInfo::new(),
            };
            expr = Hir {
                kind: HirKind::Alternation(vec![concat]),
                info: HirInfo::new(),
            };
        }
        assert!(!expr.kind.is_empty());
        // `expr` goes out of scope here, so its drop runs on this thread.
    };

    let worker = thread::Builder::new()
        .stack_size(STACK_SIZE)
        .spawn(build_then_drop)
        .unwrap();
    worker.join().unwrap();
}
2299}
2300