1/*!
2Defines a high-level intermediate (HIR) representation for regular expressions.
3
4The HIR is represented by the [`Hir`] type, and it principally constructed via
5[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6may use the smart constructors defined on `Hir` to build their own by hand. The
7smart constructors simultaneously simplify and "optimize" the HIR, and are also
8the same routines used by translation.
9
10Most regex engines only have an HIR like this, and usually construct it
11directly from the concrete syntax. This crate however first parses the
12concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13as mentioned above. It's done this way to facilitate better error reporting,
14and to have a structured representation of a regex that faithfully represents
15its concrete syntax. Namely, while an `Hir` value can be converted back to an
16equivalent regex pattern string, it is unlikely to look like the original due
17to its simplified structure.
18*/
19
20use core::{char, cmp};
21
22use alloc::{
23 boxed::Box,
24 format,
25 string::{String, ToString},
26 vec,
27 vec::Vec,
28};
29
30use crate::{
31 ast::Span,
32 hir::interval::{Interval, IntervalSet, IntervalSetIter},
33 unicode,
34};
35
36pub use crate::{
37 hir::visitor::{visit, Visitor},
38 unicode::CaseFoldError,
39};
40
41mod interval;
42pub mod literal;
43pub mod print;
44pub mod translate;
45mod visitor;
46
47/// An error that can occur while translating an `Ast` to a `Hir`.
48#[derive(Clone, Debug, Eq, PartialEq)]
49pub struct Error {
50 /// The kind of error.
51 kind: ErrorKind,
52 /// The original pattern that the translator's Ast was parsed from. Every
53 /// span in an error is a valid range into this string.
54 pattern: String,
55 /// The span of this error, derived from the Ast given to the translator.
56 span: Span,
57}
58
59impl Error {
60 /// Return the type of this error.
61 pub fn kind(&self) -> &ErrorKind {
62 &self.kind
63 }
64
65 /// The original pattern string in which this error occurred.
66 ///
67 /// Every span reported by this error is reported in terms of this string.
68 pub fn pattern(&self) -> &str {
69 &self.pattern
70 }
71
72 /// Return the span at which this error occurred.
73 pub fn span(&self) -> &Span {
74 &self.span
75 }
76}
77
78/// The type of an error that occurred while building an `Hir`.
79///
80/// This error type is marked as `non_exhaustive`. This means that adding a
81/// new variant is not considered a breaking change.
82#[non_exhaustive]
83#[derive(Clone, Debug, Eq, PartialEq)]
84pub enum ErrorKind {
85 /// This error occurs when a Unicode feature is used when Unicode
86 /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87 UnicodeNotAllowed,
88 /// This error occurs when translating a pattern that could match a byte
89 /// sequence that isn't UTF-8 and `utf8` was enabled.
90 InvalidUtf8,
91 /// This occurs when an unrecognized Unicode property name could not
92 /// be found.
93 UnicodePropertyNotFound,
94 /// This occurs when an unrecognized Unicode property value could not
95 /// be found.
96 UnicodePropertyValueNotFound,
97 /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
98 /// `\d`) could not be found. This can occur when the `unicode-perl`
99 /// crate feature is not enabled.
100 UnicodePerlClassNotFound,
101 /// This occurs when the Unicode simple case mapping tables are not
102 /// available, and the regular expression required Unicode aware case
103 /// insensitivity.
104 UnicodeCaseUnavailable,
105}
106
107#[cfg(feature = "std")]
108impl std::error::Error for Error {}
109
110impl core::fmt::Display for Error {
111 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
112 crate::error::Formatter::from(self).fmt(f)
113 }
114}
115
116impl core::fmt::Display for ErrorKind {
117 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
118 use self::ErrorKind::*;
119
120 let msg: &str = match *self {
121 UnicodeNotAllowed => "Unicode not allowed here",
122 InvalidUtf8 => "pattern can match invalid UTF-8",
123 UnicodePropertyNotFound => "Unicode property not found",
124 UnicodePropertyValueNotFound => "Unicode property value not found",
125 UnicodePerlClassNotFound => {
126 "Unicode-aware Perl class not found \
127 (make sure the unicode-perl feature is enabled)"
128 }
129 UnicodeCaseUnavailable => {
130 "Unicode-aware case insensitivity matching is not available \
131 (make sure the unicode-case feature is enabled)"
132 }
133 };
134 f.write_str(data:msg)
135 }
136}
137
138/// A high-level intermediate representation (HIR) for a regular expression.
139///
140/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
141/// An `HirKind` indicates what kind of regular expression it is (a literal,
142/// a repetition, a look-around assertion, etc.), where as a `Properties`
143/// describes various facts about the regular expression. For example, whether
144/// it matches UTF-8 or if it matches the empty string.
145///
146/// The HIR of a regular expression represents an intermediate step between
147/// its abstract syntax (a structured description of the concrete syntax) and
148/// an actual regex matcher. The purpose of HIR is to make regular expressions
149/// easier to analyze. In particular, the AST is much more complex than the
150/// HIR. For example, while an AST supports arbitrarily nested character
151/// classes, the HIR will flatten all nested classes into a single set. The HIR
152/// will also "compile away" every flag present in the concrete syntax. For
153/// example, users of HIR expressions never need to worry about case folding;
154/// it is handled automatically by the translator (e.g., by translating
155/// `(?i:A)` to `[aA]`).
156///
157/// The specific type of an HIR expression can be accessed via its `kind`
158/// or `into_kind` methods. This extra level of indirection exists for two
159/// reasons:
160///
161/// 1. Construction of an HIR expression *must* use the constructor methods on
162/// this `Hir` type instead of building the `HirKind` values directly. This
163/// permits construction to enforce invariants like "concatenations always
164/// consist of two or more sub-expressions."
165/// 2. Every HIR expression contains attributes that are defined inductively,
166/// and can be computed cheaply during the construction process. For example,
167/// one such attribute is whether the expression must match at the beginning of
168/// the haystack.
169///
170/// In particular, if you have an `HirKind` value, then there is intentionally
171/// no way to build an `Hir` value from it. You instead need to do case
172/// analysis on the `HirKind` value and build the `Hir` value using its smart
173/// constructors.
174///
175/// # UTF-8
176///
177/// If the HIR was produced by a translator with
178/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
179/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
180/// matches.
181///
182/// For empty matches, those can occur at any position. It is the
183/// repsonsibility of the regex engine to determine whether empty matches are
184/// permitted between the code units of a single codepoint.
185///
186/// # Stack space
187///
188/// This type defines its own destructor that uses constant stack space and
189/// heap space proportional to the size of the HIR.
190///
191/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
192/// expression pattern string, and uses constant stack space and heap space
193/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
194/// be _semantically_ equivalent to the original concrete syntax, but it may
195/// look very different. (And potentially not practically readable by a human.)
196///
197/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
198/// stack space. The implementation will also suppress some details (such as
199/// the `Properties` inlined into every `Hir` value to make it less noisy).
200#[derive(Clone, Eq, PartialEq)]
201pub struct Hir {
202 /// The underlying HIR kind.
203 kind: HirKind,
204 /// Analysis info about this HIR, computed during construction.
205 props: Properties,
206}
207
208/// Methods for accessing the underlying `HirKind` and `Properties`.
209impl Hir {
210 /// Returns a reference to the underlying HIR kind.
211 pub fn kind(&self) -> &HirKind {
212 &self.kind
213 }
214
215 /// Consumes ownership of this HIR expression and returns its underlying
216 /// `HirKind`.
217 pub fn into_kind(mut self) -> HirKind {
218 core::mem::replace(&mut self.kind, HirKind::Empty)
219 }
220
221 /// Returns the properties computed for this `Hir`.
222 pub fn properties(&self) -> &Properties {
223 &self.props
224 }
225
226 /// Splits this HIR into its constituent parts.
227 ///
228 /// This is useful because `let Hir { kind, props } = hir;` does not work
229 /// because of `Hir`'s custom `Drop` implementation.
230 fn into_parts(mut self) -> (HirKind, Properties) {
231 (
232 core::mem::replace(&mut self.kind, HirKind::Empty),
233 core::mem::replace(&mut self.props, Properties::empty()),
234 )
235 }
236}
237
238/// Smart constructors for HIR values.
239///
240/// These constructors are called "smart" because they do inductive work or
241/// simplifications. For example, calling `Hir::repetition` with a repetition
242/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
243/// since it is equivalent to an empty regex. Another example is calling
244/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
245/// just get back the original `expr` since it's precisely equivalent.
246///
247/// Smart constructors enable maintaining invariants about the HIR data type
248/// while also simulanteously keeping the representation as simple as possible.
249impl Hir {
250 /// Returns an empty HIR expression.
251 ///
252 /// An empty HIR expression always matches, including the empty string.
253 #[inline]
254 pub fn empty() -> Hir {
255 let props = Properties::empty();
256 Hir { kind: HirKind::Empty, props }
257 }
258
259 /// Returns an HIR expression that can never match anything. That is,
260 /// the size of the set of strings in the language described by the HIR
261 /// returned is `0`.
262 ///
263 /// This is distinct from [`Hir::empty`] in that the empty string matches
264 /// the HIR returned by `Hir::empty`. That is, the set of strings in the
265 /// language describe described by `Hir::empty` is non-empty.
266 ///
267 /// Note that currently, the HIR returned uses an empty character class to
268 /// indicate that nothing can match. An equivalent expression that cannot
269 /// match is an empty alternation, but all such "fail" expressions are
270 /// normalized (via smart constructors) to empty character classes. This is
271 /// because empty character classes can be spelled in the concrete syntax
272 /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
273 /// empty alternations cannot.
274 #[inline]
275 pub fn fail() -> Hir {
276 let class = Class::Bytes(ClassBytes::empty());
277 let props = Properties::class(&class);
278 // We can't just call Hir::class here because it defers to Hir::fail
279 // in order to canonicalize the Hir value used to represent "cannot
280 // match."
281 Hir { kind: HirKind::Class(class), props }
282 }
283
284 /// Creates a literal HIR expression.
285 ///
286 /// This accepts anything that can be converted into a `Box<[u8]>`.
287 ///
288 /// Note that there is no mechanism for storing a `char` or a `Box<str>`
289 /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
290 /// any HIR node) matches valid UTF-8 exclusively can be queried via
291 /// [`Properties::is_utf8`].
292 ///
293 /// # Example
294 ///
295 /// This example shows that concatenations of `Literal` HIR values will
296 /// automatically get flattened and combined together. So for example, even
297 /// if you concat multiple `Literal` values that are themselves not valid
298 /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
299 /// how "smart" Hir's smart constructors are.
300 ///
301 /// ```
302 /// use regex_syntax::hir::{Hir, HirKind, Literal};
303 ///
304 /// let literals = vec![
305 /// Hir::literal([0xE2]),
306 /// Hir::literal([0x98]),
307 /// Hir::literal([0x83]),
308 /// ];
309 /// // Each literal, on its own, is invalid UTF-8.
310 /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
311 ///
312 /// let concat = Hir::concat(literals);
313 /// // But the concatenation is valid UTF-8!
314 /// assert!(concat.properties().is_utf8());
315 ///
316 /// // And also notice that the literals have been concatenated into a
317 /// // single `Literal`, to the point where there is no explicit `Concat`!
318 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
319 /// assert_eq!(&expected, concat.kind());
320 /// ```
321 #[inline]
322 pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
323 let bytes = lit.into();
324 if bytes.is_empty() {
325 return Hir::empty();
326 }
327
328 let lit = Literal(bytes);
329 let props = Properties::literal(&lit);
330 Hir { kind: HirKind::Literal(lit), props }
331 }
332
333 /// Creates a class HIR expression. The class may either be defined over
334 /// ranges of Unicode codepoints or ranges of raw byte values.
335 ///
336 /// Note that an empty class is permitted. An empty class is equivalent to
337 /// `Hir::fail()`.
338 #[inline]
339 pub fn class(class: Class) -> Hir {
340 if class.is_empty() {
341 return Hir::fail();
342 } else if let Some(bytes) = class.literal() {
343 return Hir::literal(bytes);
344 }
345 let props = Properties::class(&class);
346 Hir { kind: HirKind::Class(class), props }
347 }
348
349 /// Creates a look-around assertion HIR expression.
350 #[inline]
351 pub fn look(look: Look) -> Hir {
352 let props = Properties::look(look);
353 Hir { kind: HirKind::Look(look), props }
354 }
355
356 /// Creates a repetition HIR expression.
357 #[inline]
358 pub fn repetition(rep: Repetition) -> Hir {
359 // The regex 'a{0}' is always equivalent to the empty regex. This is
360 // true even when 'a' is an expression that never matches anything
361 // (like '\P{any}').
362 //
363 // Additionally, the regex 'a{1}' is always equivalent to 'a'.
364 if rep.min == 0 && rep.max == Some(0) {
365 return Hir::empty();
366 } else if rep.min == 1 && rep.max == Some(1) {
367 return *rep.sub;
368 }
369 let props = Properties::repetition(&rep);
370 Hir { kind: HirKind::Repetition(rep), props }
371 }
372
373 /// Creates a capture HIR expression.
374 ///
375 /// Note that there is no explicit HIR value for a non-capturing group.
376 /// Since a non-capturing group only exists to override precedence in the
377 /// concrete syntax and since an HIR already does its own grouping based on
378 /// what is parsed, there is no need to explicitly represent non-capturing
379 /// groups in the HIR.
380 #[inline]
381 pub fn capture(capture: Capture) -> Hir {
382 let props = Properties::capture(&capture);
383 Hir { kind: HirKind::Capture(capture), props }
384 }
385
386 /// Returns the concatenation of the given expressions.
387 ///
388 /// This attempts to flatten and simplify the concatenation as appropriate.
389 ///
390 /// # Example
391 ///
392 /// This shows a simple example of basic flattening of both concatenations
393 /// and literals.
394 ///
395 /// ```
396 /// use regex_syntax::hir::Hir;
397 ///
398 /// let hir = Hir::concat(vec![
399 /// Hir::concat(vec![
400 /// Hir::literal([b'a']),
401 /// Hir::literal([b'b']),
402 /// Hir::literal([b'c']),
403 /// ]),
404 /// Hir::concat(vec![
405 /// Hir::literal([b'x']),
406 /// Hir::literal([b'y']),
407 /// Hir::literal([b'z']),
408 /// ]),
409 /// ]);
410 /// let expected = Hir::literal("abcxyz".as_bytes());
411 /// assert_eq!(expected, hir);
412 /// ```
413 pub fn concat(subs: Vec<Hir>) -> Hir {
414 // We rebuild the concatenation by simplifying it. Would be nice to do
415 // it in place, but that seems a little tricky?
416 let mut new = vec![];
417 // This gobbles up any adjacent literals in a concatenation and smushes
418 // them together. Basically, when we see a literal, we add its bytes
419 // to 'prior_lit', and whenever we see anything else, we first take
420 // any bytes in 'prior_lit' and add it to the 'new' concatenation.
421 let mut prior_lit: Option<Vec<u8>> = None;
422 for sub in subs {
423 let (kind, props) = sub.into_parts();
424 match kind {
425 HirKind::Literal(Literal(bytes)) => {
426 if let Some(ref mut prior_bytes) = prior_lit {
427 prior_bytes.extend_from_slice(&bytes);
428 } else {
429 prior_lit = Some(bytes.to_vec());
430 }
431 }
432 // We also flatten concats that are direct children of another
433 // concat. We only need to do this one level deep since
434 // Hir::concat is the only way to build concatenations, and so
435 // flattening happens inductively.
436 HirKind::Concat(subs2) => {
437 for sub2 in subs2 {
438 let (kind2, props2) = sub2.into_parts();
439 match kind2 {
440 HirKind::Literal(Literal(bytes)) => {
441 if let Some(ref mut prior_bytes) = prior_lit {
442 prior_bytes.extend_from_slice(&bytes);
443 } else {
444 prior_lit = Some(bytes.to_vec());
445 }
446 }
447 kind2 => {
448 if let Some(prior_bytes) = prior_lit.take() {
449 new.push(Hir::literal(prior_bytes));
450 }
451 new.push(Hir { kind: kind2, props: props2 });
452 }
453 }
454 }
455 }
456 // We can just skip empty HIRs.
457 HirKind::Empty => {}
458 kind => {
459 if let Some(prior_bytes) = prior_lit.take() {
460 new.push(Hir::literal(prior_bytes));
461 }
462 new.push(Hir { kind, props });
463 }
464 }
465 }
466 if let Some(prior_bytes) = prior_lit.take() {
467 new.push(Hir::literal(prior_bytes));
468 }
469 if new.is_empty() {
470 return Hir::empty();
471 } else if new.len() == 1 {
472 return new.pop().unwrap();
473 }
474 let props = Properties::concat(&new);
475 Hir { kind: HirKind::Concat(new), props }
476 }
477
478 /// Returns the alternation of the given expressions.
479 ///
480 /// This flattens and simplifies the alternation as appropriate. This may
481 /// include factoring out common prefixes or even rewriting the alternation
482 /// as a character class.
483 ///
484 /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
485 /// is not possible for one to write an empty alternation, or even an
486 /// alternation with a single sub-expression, in the concrete syntax of a
487 /// regex.)
488 ///
489 /// # Example
490 ///
491 /// This is a simple example showing how an alternation might get
492 /// simplified.
493 ///
494 /// ```
495 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
496 ///
497 /// let hir = Hir::alternation(vec![
498 /// Hir::literal([b'a']),
499 /// Hir::literal([b'b']),
500 /// Hir::literal([b'c']),
501 /// Hir::literal([b'd']),
502 /// Hir::literal([b'e']),
503 /// Hir::literal([b'f']),
504 /// ]);
505 /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
506 /// ClassUnicodeRange::new('a', 'f'),
507 /// ])));
508 /// assert_eq!(expected, hir);
509 /// ```
510 ///
511 /// And another example showing how common prefixes might get factored
512 /// out.
513 ///
514 /// ```
515 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
516 ///
517 /// let hir = Hir::alternation(vec![
518 /// Hir::concat(vec![
519 /// Hir::literal("abc".as_bytes()),
520 /// Hir::class(Class::Unicode(ClassUnicode::new([
521 /// ClassUnicodeRange::new('A', 'Z'),
522 /// ]))),
523 /// ]),
524 /// Hir::concat(vec![
525 /// Hir::literal("abc".as_bytes()),
526 /// Hir::class(Class::Unicode(ClassUnicode::new([
527 /// ClassUnicodeRange::new('a', 'z'),
528 /// ]))),
529 /// ]),
530 /// ]);
531 /// let expected = Hir::concat(vec![
532 /// Hir::literal("abc".as_bytes()),
533 /// Hir::alternation(vec![
534 /// Hir::class(Class::Unicode(ClassUnicode::new([
535 /// ClassUnicodeRange::new('A', 'Z'),
536 /// ]))),
537 /// Hir::class(Class::Unicode(ClassUnicode::new([
538 /// ClassUnicodeRange::new('a', 'z'),
539 /// ]))),
540 /// ]),
541 /// ]);
542 /// assert_eq!(expected, hir);
543 /// ```
544 ///
545 /// Note that these sorts of simplifications are not guaranteed.
546 pub fn alternation(subs: Vec<Hir>) -> Hir {
547 // We rebuild the alternation by simplifying it. We proceed similarly
548 // as the concatenation case. But in this case, there's no literal
549 // simplification happening. We're just flattening alternations.
550 let mut new = vec![];
551 for sub in subs {
552 let (kind, props) = sub.into_parts();
553 match kind {
554 HirKind::Alternation(subs2) => {
555 new.extend(subs2);
556 }
557 kind => {
558 new.push(Hir { kind, props });
559 }
560 }
561 }
562 if new.is_empty() {
563 return Hir::fail();
564 } else if new.len() == 1 {
565 return new.pop().unwrap();
566 }
567 // Now that it's completely flattened, look for the special case of
568 // 'char1|char2|...|charN' and collapse that into a class. Note that
569 // we look for 'char' first and then bytes. The issue here is that if
570 // we find both non-ASCII codepoints and non-ASCII singleton bytes,
571 // then it isn't actually possible to smush them into a single class.
572 // (Because classes are either "all codepoints" or "all bytes." You
573 // can have a class that both matches non-ASCII but valid UTF-8 and
574 // invalid UTF-8.) So we look for all chars and then all bytes, and
575 // don't handle anything else.
576 if let Some(singletons) = singleton_chars(&new) {
577 let it = singletons
578 .into_iter()
579 .map(|ch| ClassUnicodeRange { start: ch, end: ch });
580 return Hir::class(Class::Unicode(ClassUnicode::new(it)));
581 }
582 if let Some(singletons) = singleton_bytes(&new) {
583 let it = singletons
584 .into_iter()
585 .map(|b| ClassBytesRange { start: b, end: b });
586 return Hir::class(Class::Bytes(ClassBytes::new(it)));
587 }
588 // Similar to singleton chars, we can also look for alternations of
589 // classes. Those can be smushed into a single class.
590 if let Some(cls) = class_chars(&new) {
591 return Hir::class(cls);
592 }
593 if let Some(cls) = class_bytes(&new) {
594 return Hir::class(cls);
595 }
596 // Factor out a common prefix if we can, which might potentially
597 // simplify the expression and unlock other optimizations downstream.
598 // It also might generally make NFA matching and DFA construction
599 // faster by reducing the scope of branching in the regex.
600 new = match lift_common_prefix(new) {
601 Ok(hir) => return hir,
602 Err(unchanged) => unchanged,
603 };
604 let props = Properties::alternation(&new);
605 Hir { kind: HirKind::Alternation(new), props }
606 }
607
608 /// Returns an HIR expression for `.`.
609 ///
610 /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
611 /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
612 /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
613 /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
614 /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
615 /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
616 ///
617 /// # Example
618 ///
619 /// Note that this is a convenience routine for constructing the correct
620 /// character class based on the value of `Dot`. There is no explicit "dot"
621 /// HIR value. It is just an abbreviation for a common character class.
622 ///
623 /// ```
624 /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
625 ///
626 /// let hir = Hir::dot(Dot::AnyByte);
627 /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
628 /// ClassBytesRange::new(0x00, 0xFF),
629 /// ])));
630 /// assert_eq!(expected, hir);
631 /// ```
632 #[inline]
633 pub fn dot(dot: Dot) -> Hir {
634 match dot {
635 Dot::AnyChar => {
636 let mut cls = ClassUnicode::empty();
637 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
638 Hir::class(Class::Unicode(cls))
639 }
640 Dot::AnyByte => {
641 let mut cls = ClassBytes::empty();
642 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
643 Hir::class(Class::Bytes(cls))
644 }
645 Dot::AnyCharExceptLF => {
646 let mut cls = ClassUnicode::empty();
647 cls.push(ClassUnicodeRange::new('\0', '\x09'));
648 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
649 Hir::class(Class::Unicode(cls))
650 }
651 Dot::AnyCharExceptCRLF => {
652 let mut cls = ClassUnicode::empty();
653 cls.push(ClassUnicodeRange::new('\0', '\x09'));
654 cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
655 cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
656 Hir::class(Class::Unicode(cls))
657 }
658 Dot::AnyByteExceptLF => {
659 let mut cls = ClassBytes::empty();
660 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
661 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
662 Hir::class(Class::Bytes(cls))
663 }
664 Dot::AnyByteExceptCRLF => {
665 let mut cls = ClassBytes::empty();
666 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
667 cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
668 cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
669 Hir::class(Class::Bytes(cls))
670 }
671 }
672 }
673}
674
675/// The underlying kind of an arbitrary [`Hir`] expression.
676///
677/// An `HirKind` is principally useful for doing case analysis on the type
678/// of a regular expression. If you're looking to build new `Hir` values,
679/// then you _must_ use the smart constructors defined on `Hir`, like
680/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
681/// not expose any way of building an `Hir` directly from an `HirKind`.
682#[derive(Clone, Debug, Eq, PartialEq)]
683pub enum HirKind {
684 /// The empty regular expression, which matches everything, including the
685 /// empty string.
686 Empty,
687 /// A literalstring that matches exactly these bytes.
688 Literal(Literal),
689 /// A single character class that matches any of the characters in the
690 /// class. A class can either consist of Unicode scalar values as
691 /// characters, or it can use bytes.
692 ///
693 /// A class may be empty. In which case, it matches nothing.
694 Class(Class),
695 /// A look-around assertion. A look-around match always has zero length.
696 Look(Look),
697 /// A repetition operation applied to a sub-expression.
698 Repetition(Repetition),
699 /// A capturing group, which contains a sub-expression.
700 Capture(Capture),
701 /// A concatenation of expressions.
702 ///
703 /// A concatenation matches only if each of its sub-expressions match one
704 /// after the other.
705 ///
706 /// Concatenations are guaranteed by `Hir`'s smart constructors to always
707 /// have at least two sub-expressions.
708 Concat(Vec<Hir>),
709 /// An alternation of expressions.
710 ///
711 /// An alternation matches only if at least one of its sub-expressions
712 /// match. If multiple sub-expressions match, then the leftmost is
713 /// preferred.
714 ///
715 /// Alternations are guaranteed by `Hir`'s smart constructors to always
716 /// have at least two sub-expressions.
717 Alternation(Vec<Hir>),
718}
719
720impl HirKind {
721 /// Returns a slice of this kind's sub-expressions, if any.
722 pub fn subs(&self) -> &[Hir] {
723 use core::slice::from_ref;
724
725 match *self {
726 HirKind::Empty
727 | HirKind::Literal(_)
728 | HirKind::Class(_)
729 | HirKind::Look(_) => &[],
730 HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub),
731 HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub),
732 HirKind::Concat(ref subs: &Vec) => subs,
733 HirKind::Alternation(ref subs: &Vec) => subs,
734 }
735 }
736}
737
738impl core::fmt::Debug for Hir {
739 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
740 self.kind.fmt(f)
741 }
742}
743
744/// Print a display representation of this Hir.
745///
746/// The result of this is a valid regular expression pattern string.
747///
748/// This implementation uses constant stack space and heap space proportional
749/// to the size of the `Hir`.
750impl core::fmt::Display for Hir {
751 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
752 crate::hir::print::Printer::new().print(self, wtr:f)
753 }
754}
755
756/// The high-level intermediate representation of a literal.
757///
758/// A literal corresponds to `0` or more bytes that should be matched
759/// literally. The smart constructors defined on `Hir` will automatically
760/// concatenate adjacent literals into one literal, and will even automatically
761/// replace empty literals with `Hir::empty()`.
762///
763/// Note that despite a literal being represented by a sequence of bytes, its
764/// `Debug` implementation will attempt to print it as a normal string. (That
765/// is, not a sequence of decimal numbers.)
766#[derive(Clone, Eq, PartialEq)]
767pub struct Literal(pub Box<[u8]>);
768
769impl core::fmt::Debug for Literal {
770 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
771 crate::debug::Bytes(&self.0).fmt(f)
772 }
773}
774
775/// The high-level intermediate representation of a character class.
776///
777/// A character class corresponds to a set of characters. A character is either
778/// defined by a Unicode scalar value or a byte. Unicode characters are used
779/// by default, while bytes are used when Unicode mode (via the `u` flag) is
780/// disabled.
781///
782/// A character class, regardless of its character type, is represented by a
783/// sequence of non-overlapping non-adjacent ranges of characters.
784///
785/// Note that `Bytes` variant may be produced even when it exclusively matches
786/// valid UTF-8. This is because a `Bytes` variant represents an intention by
787/// the author of the regular expression to disable Unicode mode, which in turn
788/// impacts the semantics of case insensitive matching. For example, `(?i)k`
789/// and `(?i-u)k` will not match the same set of strings.
790#[derive(Clone, Eq, PartialEq)]
791pub enum Class {
792 /// A set of characters represented by Unicode scalar values.
793 Unicode(ClassUnicode),
794 /// A set of characters represented by arbitrary bytes (one byte per
795 /// character).
796 Bytes(ClassBytes),
797}
798
799impl Class {
800 /// Apply Unicode simple case folding to this character class, in place.
801 /// The character class will be expanded to include all simple case folded
802 /// character variants.
803 ///
804 /// If this is a byte oriented character class, then this will be limited
805 /// to the ASCII ranges `A-Z` and `a-z`.
806 ///
807 /// # Panics
808 ///
809 /// This routine panics when the case mapping data necessary for this
810 /// routine to complete is unavailable. This occurs when the `unicode-case`
811 /// feature is not enabled and the underlying class is Unicode oriented.
812 ///
813 /// Callers should prefer using `try_case_fold_simple` instead, which will
814 /// return an error instead of panicking.
815 pub fn case_fold_simple(&mut self) {
816 match *self {
817 Class::Unicode(ref mut x) => x.case_fold_simple(),
818 Class::Bytes(ref mut x) => x.case_fold_simple(),
819 }
820 }
821
822 /// Apply Unicode simple case folding to this character class, in place.
823 /// The character class will be expanded to include all simple case folded
824 /// character variants.
825 ///
826 /// If this is a byte oriented character class, then this will be limited
827 /// to the ASCII ranges `A-Z` and `a-z`.
828 ///
829 /// # Error
830 ///
831 /// This routine returns an error when the case mapping data necessary
832 /// for this routine to complete is unavailable. This occurs when the
833 /// `unicode-case` feature is not enabled and the underlying class is
834 /// Unicode oriented.
835 pub fn try_case_fold_simple(
836 &mut self,
837 ) -> core::result::Result<(), CaseFoldError> {
838 match *self {
839 Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
840 Class::Bytes(ref mut x) => x.case_fold_simple(),
841 }
842 Ok(())
843 }
844
845 /// Negate this character class in place.
846 ///
847 /// After completion, this character class will contain precisely the
848 /// characters that weren't previously in the class.
849 pub fn negate(&mut self) {
850 match *self {
851 Class::Unicode(ref mut x) => x.negate(),
852 Class::Bytes(ref mut x) => x.negate(),
853 }
854 }
855
856 /// Returns true if and only if this character class will only ever match
857 /// valid UTF-8.
858 ///
859 /// A character class can match invalid UTF-8 only when the following
860 /// conditions are met:
861 ///
862 /// 1. The translator was configured to permit generating an expression
863 /// that can match invalid UTF-8. (By default, this is disabled.)
864 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
865 /// syntax or in the parser builder. By default, Unicode mode is
866 /// enabled.
867 pub fn is_utf8(&self) -> bool {
868 match *self {
869 Class::Unicode(_) => true,
870 Class::Bytes(ref x) => x.is_ascii(),
871 }
872 }
873
874 /// Returns the length, in bytes, of the smallest string matched by this
875 /// character class.
876 ///
877 /// For non-empty byte oriented classes, this always returns `1`. For
878 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
879 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
880 /// be returned.
881 ///
882 /// # Example
883 ///
884 /// This example shows some examples of regexes and their corresponding
885 /// minimum length, if any.
886 ///
887 /// ```
888 /// use regex_syntax::{hir::Properties, parse};
889 ///
890 /// // The empty string has a min length of 0.
891 /// let hir = parse(r"")?;
892 /// assert_eq!(Some(0), hir.properties().minimum_len());
893 /// // As do other types of regexes that only match the empty string.
894 /// let hir = parse(r"^$\b\B")?;
895 /// assert_eq!(Some(0), hir.properties().minimum_len());
896 /// // A regex that can match the empty string but match more is still 0.
897 /// let hir = parse(r"a*")?;
898 /// assert_eq!(Some(0), hir.properties().minimum_len());
899 /// // A regex that matches nothing has no minimum defined.
900 /// let hir = parse(r"[a&&b]")?;
901 /// assert_eq!(None, hir.properties().minimum_len());
902 /// // Character classes usually have a minimum length of 1.
903 /// let hir = parse(r"\w")?;
904 /// assert_eq!(Some(1), hir.properties().minimum_len());
905 /// // But sometimes Unicode classes might be bigger!
906 /// let hir = parse(r"\p{Cyrillic}")?;
907 /// assert_eq!(Some(2), hir.properties().minimum_len());
908 ///
909 /// # Ok::<(), Box<dyn std::error::Error>>(())
910 /// ```
911 pub fn minimum_len(&self) -> Option<usize> {
912 match *self {
913 Class::Unicode(ref x) => x.minimum_len(),
914 Class::Bytes(ref x) => x.minimum_len(),
915 }
916 }
917
918 /// Returns the length, in bytes, of the longest string matched by this
919 /// character class.
920 ///
921 /// For non-empty byte oriented classes, this always returns `1`. For
922 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
923 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
924 /// be returned.
925 ///
926 /// # Example
927 ///
928 /// This example shows some examples of regexes and their corresponding
929 /// maximum length, if any.
930 ///
931 /// ```
932 /// use regex_syntax::{hir::Properties, parse};
933 ///
934 /// // The empty string has a max length of 0.
935 /// let hir = parse(r"")?;
936 /// assert_eq!(Some(0), hir.properties().maximum_len());
937 /// // As do other types of regexes that only match the empty string.
938 /// let hir = parse(r"^$\b\B")?;
939 /// assert_eq!(Some(0), hir.properties().maximum_len());
940 /// // A regex that matches nothing has no maximum defined.
941 /// let hir = parse(r"[a&&b]")?;
942 /// assert_eq!(None, hir.properties().maximum_len());
943 /// // Bounded repeats work as you expect.
944 /// let hir = parse(r"x{2,10}")?;
945 /// assert_eq!(Some(10), hir.properties().maximum_len());
946 /// // An unbounded repeat means there is no maximum.
947 /// let hir = parse(r"x{2,}")?;
948 /// assert_eq!(None, hir.properties().maximum_len());
949 /// // With Unicode enabled, \w can match up to 4 bytes!
950 /// let hir = parse(r"\w")?;
951 /// assert_eq!(Some(4), hir.properties().maximum_len());
952 /// // Without Unicode enabled, \w matches at most 1 byte.
953 /// let hir = parse(r"(?-u)\w")?;
954 /// assert_eq!(Some(1), hir.properties().maximum_len());
955 ///
956 /// # Ok::<(), Box<dyn std::error::Error>>(())
957 /// ```
958 pub fn maximum_len(&self) -> Option<usize> {
959 match *self {
960 Class::Unicode(ref x) => x.maximum_len(),
961 Class::Bytes(ref x) => x.maximum_len(),
962 }
963 }
964
965 /// Returns true if and only if this character class is empty. That is,
966 /// it has no elements.
967 ///
968 /// An empty character can never match anything, including an empty string.
969 pub fn is_empty(&self) -> bool {
970 match *self {
971 Class::Unicode(ref x) => x.ranges().is_empty(),
972 Class::Bytes(ref x) => x.ranges().is_empty(),
973 }
974 }
975
976 /// If this class consists of exactly one element (whether a codepoint or a
977 /// byte), then return it as a literal byte string.
978 ///
979 /// If this class is empty or contains more than one element, then `None`
980 /// is returned.
981 pub fn literal(&self) -> Option<Vec<u8>> {
982 match *self {
983 Class::Unicode(ref x) => x.literal(),
984 Class::Bytes(ref x) => x.literal(),
985 }
986 }
987}
988
989impl core::fmt::Debug for Class {
990 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
991 use crate::debug::Byte;
992
993 let mut fmter: DebugSet<'_, '_> = f.debug_set();
994 match *self {
995 Class::Unicode(ref cls: &ClassUnicode) => {
996 for r: &ClassUnicodeRange in cls.ranges().iter() {
997 fmter.entry(&(r.start..=r.end));
998 }
999 }
1000 Class::Bytes(ref cls: &ClassBytes) => {
1001 for r: &ClassBytesRange in cls.ranges().iter() {
1002 fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1003 }
1004 }
1005 }
1006 fmter.finish()
1007 }
1008}
1009
1010/// A set of characters represented by Unicode scalar values.
1011#[derive(Clone, Debug, Eq, PartialEq)]
1012pub struct ClassUnicode {
1013 set: IntervalSet<ClassUnicodeRange>,
1014}
1015
1016impl ClassUnicode {
1017 /// Create a new class from a sequence of ranges.
1018 ///
1019 /// The given ranges do not need to be in any specific order, and ranges
1020 /// may overlap. Ranges will automatically be sorted into a canonical
1021 /// non-overlapping order.
1022 pub fn new<I>(ranges: I) -> ClassUnicode
1023 where
1024 I: IntoIterator<Item = ClassUnicodeRange>,
1025 {
1026 ClassUnicode { set: IntervalSet::new(ranges) }
1027 }
1028
1029 /// Create a new class with no ranges.
1030 ///
1031 /// An empty class matches nothing. That is, it is equivalent to
1032 /// [`Hir::fail`].
1033 pub fn empty() -> ClassUnicode {
1034 ClassUnicode::new(vec![])
1035 }
1036
1037 /// Add a new range to this set.
1038 pub fn push(&mut self, range: ClassUnicodeRange) {
1039 self.set.push(range);
1040 }
1041
1042 /// Return an iterator over all ranges in this class.
1043 ///
1044 /// The iterator yields ranges in ascending order.
1045 pub fn iter(&self) -> ClassUnicodeIter<'_> {
1046 ClassUnicodeIter(self.set.iter())
1047 }
1048
1049 /// Return the underlying ranges as a slice.
1050 pub fn ranges(&self) -> &[ClassUnicodeRange] {
1051 self.set.intervals()
1052 }
1053
1054 /// Expand this character class such that it contains all case folded
1055 /// characters, according to Unicode's "simple" mapping. For example, if
1056 /// this class consists of the range `a-z`, then applying case folding will
1057 /// result in the class containing both the ranges `a-z` and `A-Z`.
1058 ///
1059 /// # Panics
1060 ///
1061 /// This routine panics when the case mapping data necessary for this
1062 /// routine to complete is unavailable. This occurs when the `unicode-case`
1063 /// feature is not enabled.
1064 ///
1065 /// Callers should prefer using `try_case_fold_simple` instead, which will
1066 /// return an error instead of panicking.
1067 pub fn case_fold_simple(&mut self) {
1068 self.set
1069 .case_fold_simple()
1070 .expect("unicode-case feature must be enabled");
1071 }
1072
1073 /// Expand this character class such that it contains all case folded
1074 /// characters, according to Unicode's "simple" mapping. For example, if
1075 /// this class consists of the range `a-z`, then applying case folding will
1076 /// result in the class containing both the ranges `a-z` and `A-Z`.
1077 ///
1078 /// # Error
1079 ///
1080 /// This routine returns an error when the case mapping data necessary
1081 /// for this routine to complete is unavailable. This occurs when the
1082 /// `unicode-case` feature is not enabled.
1083 pub fn try_case_fold_simple(
1084 &mut self,
1085 ) -> core::result::Result<(), CaseFoldError> {
1086 self.set.case_fold_simple()
1087 }
1088
1089 /// Negate this character class.
1090 ///
1091 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1092 /// set, then it will not be in this set after negation.
1093 pub fn negate(&mut self) {
1094 self.set.negate();
1095 }
1096
1097 /// Union this character class with the given character class, in place.
1098 pub fn union(&mut self, other: &ClassUnicode) {
1099 self.set.union(&other.set);
1100 }
1101
1102 /// Intersect this character class with the given character class, in
1103 /// place.
1104 pub fn intersect(&mut self, other: &ClassUnicode) {
1105 self.set.intersect(&other.set);
1106 }
1107
1108 /// Subtract the given character class from this character class, in place.
1109 pub fn difference(&mut self, other: &ClassUnicode) {
1110 self.set.difference(&other.set);
1111 }
1112
1113 /// Compute the symmetric difference of the given character classes, in
1114 /// place.
1115 ///
1116 /// This computes the symmetric difference of two character classes. This
1117 /// removes all elements in this class that are also in the given class,
1118 /// but all adds all elements from the given class that aren't in this
1119 /// class. That is, the class will contain all elements in either class,
1120 /// but will not contain any elements that are in both classes.
1121 pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1122 self.set.symmetric_difference(&other.set);
1123 }
1124
1125 /// Returns true if and only if this character class will either match
1126 /// nothing or only ASCII bytes. Stated differently, this returns false
1127 /// if and only if this class contains a non-ASCII codepoint.
1128 pub fn is_ascii(&self) -> bool {
1129 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1130 }
1131
1132 /// Returns the length, in bytes, of the smallest string matched by this
1133 /// character class.
1134 ///
1135 /// Returns `None` when the class is empty.
1136 pub fn minimum_len(&self) -> Option<usize> {
1137 let first = self.ranges().get(0)?;
1138 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1139 Some(first.start.len_utf8())
1140 }
1141
1142 /// Returns the length, in bytes, of the longest string matched by this
1143 /// character class.
1144 ///
1145 /// Returns `None` when the class is empty.
1146 pub fn maximum_len(&self) -> Option<usize> {
1147 let last = self.ranges().last()?;
1148 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1149 Some(last.end.len_utf8())
1150 }
1151
1152 /// If this class consists of exactly one codepoint, then return it as
1153 /// a literal byte string.
1154 ///
1155 /// If this class is empty or contains more than one codepoint, then `None`
1156 /// is returned.
1157 pub fn literal(&self) -> Option<Vec<u8>> {
1158 let rs = self.ranges();
1159 if rs.len() == 1 && rs[0].start == rs[0].end {
1160 Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1161 } else {
1162 None
1163 }
1164 }
1165
1166 /// If this class consists of only ASCII ranges, then return its
1167 /// corresponding and equivalent byte class.
1168 pub fn to_byte_class(&self) -> Option<ClassBytes> {
1169 if !self.is_ascii() {
1170 return None;
1171 }
1172 Some(ClassBytes::new(self.ranges().iter().map(|r| {
1173 // Since we are guaranteed that our codepoint range is ASCII, the
1174 // 'u8::try_from' calls below are guaranteed to be correct.
1175 ClassBytesRange {
1176 start: u8::try_from(r.start).unwrap(),
1177 end: u8::try_from(r.end).unwrap(),
1178 }
1179 })))
1180 }
1181}
1182
1183/// An iterator over all ranges in a Unicode character class.
1184///
1185/// The lifetime `'a` refers to the lifetime of the underlying class.
1186#[derive(Debug)]
1187pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1188
1189impl<'a> Iterator for ClassUnicodeIter<'a> {
1190 type Item = &'a ClassUnicodeRange;
1191
1192 fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1193 self.0.next()
1194 }
1195}
1196
1197/// A single range of characters represented by Unicode scalar values.
1198///
1199/// The range is closed. That is, the start and end of the range are included
1200/// in the range.
1201#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1202pub struct ClassUnicodeRange {
1203 start: char,
1204 end: char,
1205}
1206
1207impl core::fmt::Debug for ClassUnicodeRange {
1208 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1209 let start: String = if !self.start.is_whitespace() && !self.start.is_control()
1210 {
1211 self.start.to_string()
1212 } else {
1213 format!("0x{:X}", u32::from(self.start))
1214 };
1215 let end: String = if !self.end.is_whitespace() && !self.end.is_control() {
1216 self.end.to_string()
1217 } else {
1218 format!("0x{:X}", u32::from(self.end))
1219 };
1220 f&mut DebugStruct<'_, '_>.debug_struct("ClassUnicodeRange")
1221 .field("start", &start)
1222 .field(name:"end", &end)
1223 .finish()
1224 }
1225}
1226
1227impl Interval for ClassUnicodeRange {
1228 type Bound = char;
1229
1230 #[inline]
1231 fn lower(&self) -> char {
1232 self.start
1233 }
1234 #[inline]
1235 fn upper(&self) -> char {
1236 self.end
1237 }
1238 #[inline]
1239 fn set_lower(&mut self, bound: char) {
1240 self.start = bound;
1241 }
1242 #[inline]
1243 fn set_upper(&mut self, bound: char) {
1244 self.end = bound;
1245 }
1246
1247 /// Apply simple case folding to this Unicode scalar value range.
1248 ///
1249 /// Additional ranges are appended to the given vector. Canonical ordering
1250 /// is *not* maintained in the given vector.
1251 fn case_fold_simple(
1252 &self,
1253 ranges: &mut Vec<ClassUnicodeRange>,
1254 ) -> Result<(), unicode::CaseFoldError> {
1255 let mut folder = unicode::SimpleCaseFolder::new()?;
1256 if !folder.overlaps(self.start, self.end) {
1257 return Ok(());
1258 }
1259 let (start, end) = (u32::from(self.start), u32::from(self.end));
1260 for cp in (start..=end).filter_map(char::from_u32) {
1261 for &cp_folded in folder.mapping(cp) {
1262 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1263 }
1264 }
1265 Ok(())
1266 }
1267}
1268
1269impl ClassUnicodeRange {
1270 /// Create a new Unicode scalar value range for a character class.
1271 ///
1272 /// The returned range is always in a canonical form. That is, the range
1273 /// returned always satisfies the invariant that `start <= end`.
1274 pub fn new(start: char, end: char) -> ClassUnicodeRange {
1275 ClassUnicodeRange::create(start, end)
1276 }
1277
1278 /// Return the start of this range.
1279 ///
1280 /// The start of a range is always less than or equal to the end of the
1281 /// range.
1282 pub fn start(&self) -> char {
1283 self.start
1284 }
1285
1286 /// Return the end of this range.
1287 ///
1288 /// The end of a range is always greater than or equal to the start of the
1289 /// range.
1290 pub fn end(&self) -> char {
1291 self.end
1292 }
1293
1294 /// Returns the number of codepoints in this range.
1295 pub fn len(&self) -> usize {
1296 let diff = 1 + u32::from(self.end) - u32::from(self.start);
1297 // This is likely to panic in 16-bit targets since a usize can only fit
1298 // 2^16. It's not clear what to do here, other than to return an error
1299 // when building a Unicode class that contains a range whose length
1300 // overflows usize. (Which, to be honest, is probably quite common on
1301 // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1302 // would be impossible to build.)
1303 usize::try_from(diff).expect("char class len fits in usize")
1304 }
1305}
1306
1307/// A set of characters represented by arbitrary bytes (where one byte
1308/// corresponds to one character).
1309#[derive(Clone, Debug, Eq, PartialEq)]
1310pub struct ClassBytes {
1311 set: IntervalSet<ClassBytesRange>,
1312}
1313
1314impl ClassBytes {
1315 /// Create a new class from a sequence of ranges.
1316 ///
1317 /// The given ranges do not need to be in any specific order, and ranges
1318 /// may overlap. Ranges will automatically be sorted into a canonical
1319 /// non-overlapping order.
1320 pub fn new<I>(ranges: I) -> ClassBytes
1321 where
1322 I: IntoIterator<Item = ClassBytesRange>,
1323 {
1324 ClassBytes { set: IntervalSet::new(ranges) }
1325 }
1326
1327 /// Create a new class with no ranges.
1328 ///
1329 /// An empty class matches nothing. That is, it is equivalent to
1330 /// [`Hir::fail`].
1331 pub fn empty() -> ClassBytes {
1332 ClassBytes::new(vec![])
1333 }
1334
1335 /// Add a new range to this set.
1336 pub fn push(&mut self, range: ClassBytesRange) {
1337 self.set.push(range);
1338 }
1339
1340 /// Return an iterator over all ranges in this class.
1341 ///
1342 /// The iterator yields ranges in ascending order.
1343 pub fn iter(&self) -> ClassBytesIter<'_> {
1344 ClassBytesIter(self.set.iter())
1345 }
1346
1347 /// Return the underlying ranges as a slice.
1348 pub fn ranges(&self) -> &[ClassBytesRange] {
1349 self.set.intervals()
1350 }
1351
1352 /// Expand this character class such that it contains all case folded
1353 /// characters. For example, if this class consists of the range `a-z`,
1354 /// then applying case folding will result in the class containing both the
1355 /// ranges `a-z` and `A-Z`.
1356 ///
1357 /// Note that this only applies ASCII case folding, which is limited to the
1358 /// characters `a-z` and `A-Z`.
1359 pub fn case_fold_simple(&mut self) {
1360 self.set.case_fold_simple().expect("ASCII case folding never fails");
1361 }
1362
1363 /// Negate this byte class.
1364 ///
1365 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1366 /// will not be in this set after negation.
1367 pub fn negate(&mut self) {
1368 self.set.negate();
1369 }
1370
1371 /// Union this byte class with the given byte class, in place.
1372 pub fn union(&mut self, other: &ClassBytes) {
1373 self.set.union(&other.set);
1374 }
1375
1376 /// Intersect this byte class with the given byte class, in place.
1377 pub fn intersect(&mut self, other: &ClassBytes) {
1378 self.set.intersect(&other.set);
1379 }
1380
1381 /// Subtract the given byte class from this byte class, in place.
1382 pub fn difference(&mut self, other: &ClassBytes) {
1383 self.set.difference(&other.set);
1384 }
1385
1386 /// Compute the symmetric difference of the given byte classes, in place.
1387 ///
1388 /// This computes the symmetric difference of two byte classes. This
1389 /// removes all elements in this class that are also in the given class,
1390 /// but all adds all elements from the given class that aren't in this
1391 /// class. That is, the class will contain all elements in either class,
1392 /// but will not contain any elements that are in both classes.
1393 pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1394 self.set.symmetric_difference(&other.set);
1395 }
1396
1397 /// Returns true if and only if this character class will either match
1398 /// nothing or only ASCII bytes. Stated differently, this returns false
1399 /// if and only if this class contains a non-ASCII byte.
1400 pub fn is_ascii(&self) -> bool {
1401 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1402 }
1403
1404 /// Returns the length, in bytes, of the smallest string matched by this
1405 /// character class.
1406 ///
1407 /// Returns `None` when the class is empty.
1408 pub fn minimum_len(&self) -> Option<usize> {
1409 if self.ranges().is_empty() {
1410 None
1411 } else {
1412 Some(1)
1413 }
1414 }
1415
1416 /// Returns the length, in bytes, of the longest string matched by this
1417 /// character class.
1418 ///
1419 /// Returns `None` when the class is empty.
1420 pub fn maximum_len(&self) -> Option<usize> {
1421 if self.ranges().is_empty() {
1422 None
1423 } else {
1424 Some(1)
1425 }
1426 }
1427
1428 /// If this class consists of exactly one byte, then return it as
1429 /// a literal byte string.
1430 ///
1431 /// If this class is empty or contains more than one byte, then `None`
1432 /// is returned.
1433 pub fn literal(&self) -> Option<Vec<u8>> {
1434 let rs = self.ranges();
1435 if rs.len() == 1 && rs[0].start == rs[0].end {
1436 Some(vec![rs[0].start])
1437 } else {
1438 None
1439 }
1440 }
1441
1442 /// If this class consists of only ASCII ranges, then return its
1443 /// corresponding and equivalent Unicode class.
1444 pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1445 if !self.is_ascii() {
1446 return None;
1447 }
1448 Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1449 // Since we are guaranteed that our byte range is ASCII, the
1450 // 'char::from' calls below are correct and will not erroneously
1451 // convert a raw byte value into its corresponding codepoint.
1452 ClassUnicodeRange {
1453 start: char::from(r.start),
1454 end: char::from(r.end),
1455 }
1456 })))
1457 }
1458}
1459
1460/// An iterator over all ranges in a byte character class.
1461///
1462/// The lifetime `'a` refers to the lifetime of the underlying class.
1463#[derive(Debug)]
1464pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1465
1466impl<'a> Iterator for ClassBytesIter<'a> {
1467 type Item = &'a ClassBytesRange;
1468
1469 fn next(&mut self) -> Option<&'a ClassBytesRange> {
1470 self.0.next()
1471 }
1472}
1473
1474/// A single range of characters represented by arbitrary bytes.
1475///
1476/// The range is closed. That is, the start and end of the range are included
1477/// in the range.
1478#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1479pub struct ClassBytesRange {
1480 start: u8,
1481 end: u8,
1482}
1483
1484impl Interval for ClassBytesRange {
1485 type Bound = u8;
1486
1487 #[inline]
1488 fn lower(&self) -> u8 {
1489 self.start
1490 }
1491 #[inline]
1492 fn upper(&self) -> u8 {
1493 self.end
1494 }
1495 #[inline]
1496 fn set_lower(&mut self, bound: u8) {
1497 self.start = bound;
1498 }
1499 #[inline]
1500 fn set_upper(&mut self, bound: u8) {
1501 self.end = bound;
1502 }
1503
1504 /// Apply simple case folding to this byte range. Only ASCII case mappings
1505 /// (for a-z) are applied.
1506 ///
1507 /// Additional ranges are appended to the given vector. Canonical ordering
1508 /// is *not* maintained in the given vector.
1509 fn case_fold_simple(
1510 &self,
1511 ranges: &mut Vec<ClassBytesRange>,
1512 ) -> Result<(), unicode::CaseFoldError> {
1513 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1514 let lower = cmp::max(self.start, b'a');
1515 let upper = cmp::min(self.end, b'z');
1516 ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1517 }
1518 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1519 let lower = cmp::max(self.start, b'A');
1520 let upper = cmp::min(self.end, b'Z');
1521 ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1522 }
1523 Ok(())
1524 }
1525}
1526
1527impl ClassBytesRange {
1528 /// Create a new byte range for a character class.
1529 ///
1530 /// The returned range is always in a canonical form. That is, the range
1531 /// returned always satisfies the invariant that `start <= end`.
1532 pub fn new(start: u8, end: u8) -> ClassBytesRange {
1533 ClassBytesRange::create(start, end)
1534 }
1535
1536 /// Return the start of this range.
1537 ///
1538 /// The start of a range is always less than or equal to the end of the
1539 /// range.
1540 pub fn start(&self) -> u8 {
1541 self.start
1542 }
1543
1544 /// Return the end of this range.
1545 ///
1546 /// The end of a range is always greater than or equal to the start of the
1547 /// range.
1548 pub fn end(&self) -> u8 {
1549 self.end
1550 }
1551
1552 /// Returns the number of bytes in this range.
1553 pub fn len(&self) -> usize {
1554 usize::from(self.end.checked_sub(self.start).unwrap())
1555 .checked_add(1)
1556 .unwrap()
1557 }
1558}
1559
1560impl core::fmt::Debug for ClassBytesRange {
1561 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1562 f&mut DebugStruct<'_, '_>.debug_struct("ClassBytesRange")
1563 .field("start", &crate::debug::Byte(self.start))
1564 .field(name:"end", &crate::debug::Byte(self.end))
1565 .finish()
1566 }
1567}
1568
1569/// The high-level intermediate representation for a look-around assertion.
1570///
1571/// An assertion match is always zero-length. Also called an "empty match."
1572#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1573pub enum Look {
1574 /// Match the beginning of text. Specifically, this matches at the starting
1575 /// position of the input.
1576 Start = 1 << 0,
1577 /// Match the end of text. Specifically, this matches at the ending
1578 /// position of the input.
1579 End = 1 << 1,
1580 /// Match the beginning of a line or the beginning of text. Specifically,
1581 /// this matches at the starting position of the input, or at the position
1582 /// immediately following a `\n` character.
1583 StartLF = 1 << 2,
1584 /// Match the end of a line or the end of text. Specifically, this matches
1585 /// at the end position of the input, or at the position immediately
1586 /// preceding a `\n` character.
1587 EndLF = 1 << 3,
1588 /// Match the beginning of a line or the beginning of text. Specifically,
1589 /// this matches at the starting position of the input, or at the position
1590 /// immediately following either a `\r` or `\n` character, but never after
1591 /// a `\r` when a `\n` follows.
1592 StartCRLF = 1 << 4,
1593 /// Match the end of a line or the end of text. Specifically, this matches
1594 /// at the end position of the input, or at the position immediately
1595 /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1596 /// precedes it.
1597 EndCRLF = 1 << 5,
1598 /// Match an ASCII-only word boundary. That is, this matches a position
1599 /// where the left adjacent character and right adjacent character
1600 /// correspond to a word and non-word or a non-word and word character.
1601 WordAscii = 1 << 6,
1602 /// Match an ASCII-only negation of a word boundary.
1603 WordAsciiNegate = 1 << 7,
1604 /// Match a Unicode-aware word boundary. That is, this matches a position
1605 /// where the left adjacent character and right adjacent character
1606 /// correspond to a word and non-word or a non-word and word character.
1607 WordUnicode = 1 << 8,
1608 /// Match a Unicode-aware negation of a word boundary.
1609 WordUnicodeNegate = 1 << 9,
1610}
1611
1612impl Look {
1613 /// Flip the look-around assertion to its equivalent for reverse searches.
1614 /// For example, `StartLF` gets translated to `EndLF`.
1615 ///
1616 /// Some assertions, such as `WordUnicode`, remain the same since they
1617 /// match the same positions regardless of the direction of the search.
1618 #[inline]
1619 pub const fn reversed(self) -> Look {
1620 match self {
1621 Look::Start => Look::End,
1622 Look::End => Look::Start,
1623 Look::StartLF => Look::EndLF,
1624 Look::EndLF => Look::StartLF,
1625 Look::StartCRLF => Look::EndCRLF,
1626 Look::EndCRLF => Look::StartCRLF,
1627 Look::WordAscii => Look::WordAscii,
1628 Look::WordAsciiNegate => Look::WordAsciiNegate,
1629 Look::WordUnicode => Look::WordUnicode,
1630 Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1631 }
1632 }
1633
1634 /// Return the underlying representation of this look-around enumeration
1635 /// as an integer. Giving the return value to the [`Look::from_repr`]
1636 /// constructor is guaranteed to return the same look-around variant that
1637 /// one started with within a semver compatible release of this crate.
1638 #[inline]
1639 pub const fn as_repr(self) -> u16 {
1640 // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1641 // actual int.
1642 self as u16
1643 }
1644
1645 /// Given the underlying representation of a `Look` value, return the
1646 /// corresponding `Look` value if the representation is valid. Otherwise
1647 /// `None` is returned.
1648 #[inline]
1649 pub const fn from_repr(repr: u16) -> Option<Look> {
1650 match repr {
1651 0b00_0000_0001 => Some(Look::Start),
1652 0b00_0000_0010 => Some(Look::End),
1653 0b00_0000_0100 => Some(Look::StartLF),
1654 0b00_0000_1000 => Some(Look::EndLF),
1655 0b00_0001_0000 => Some(Look::StartCRLF),
1656 0b00_0010_0000 => Some(Look::EndCRLF),
1657 0b00_0100_0000 => Some(Look::WordAscii),
1658 0b00_1000_0000 => Some(Look::WordAsciiNegate),
1659 0b01_0000_0000 => Some(Look::WordUnicode),
1660 0b10_0000_0000 => Some(Look::WordUnicodeNegate),
1661 _ => None,
1662 }
1663 }
1664
1665 /// Returns a convenient single codepoint representation of this
1666 /// look-around assertion. Each assertion is guaranteed to be represented
1667 /// by a distinct character.
1668 ///
1669 /// This is useful for succinctly representing a look-around assertion in
1670 /// human friendly but succinct output intended for a programmer working on
1671 /// regex internals.
1672 #[inline]
1673 pub const fn as_char(self) -> char {
1674 match self {
1675 Look::Start => 'A',
1676 Look::End => 'z',
1677 Look::StartLF => '^',
1678 Look::EndLF => '$',
1679 Look::StartCRLF => 'r',
1680 Look::EndCRLF => 'R',
1681 Look::WordAscii => 'b',
1682 Look::WordAsciiNegate => 'B',
1683 Look::WordUnicode => '𝛃',
1684 Look::WordUnicodeNegate => '𝚩',
1685 }
1686 }
1687}
1688
1689/// The high-level intermediate representation for a capturing group.
1690///
1691/// A capturing group always has an index and a child expression. It may
1692/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1693/// necessary.
1694///
1695/// Note that there is no explicit representation of a non-capturing group
1696/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1697/// the recursive structure of the `Hir` itself.
1698#[derive(Clone, Debug, Eq, PartialEq)]
1699pub struct Capture {
1700 /// The capture index of the capture.
1701 pub index: u32,
1702 /// The name of the capture, if it exists.
1703 pub name: Option<Box<str>>,
1704 /// The expression inside the capturing group, which may be empty.
1705 pub sub: Box<Hir>,
1706}
1707
1708/// The high-level intermediate representation of a repetition operator.
1709///
1710/// A repetition operator permits the repetition of an arbitrary
1711/// sub-expression.
1712#[derive(Clone, Debug, Eq, PartialEq)]
1713pub struct Repetition {
1714 /// The minimum range of the repetition.
1715 ///
1716 /// Note that special cases like `?`, `+` and `*` all get translated into
1717 /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1718 ///
1719 /// When `min` is zero, this expression can match the empty string
1720 /// regardless of what its sub-expression is.
1721 pub min: u32,
1722 /// The maximum range of the repetition.
1723 ///
1724 /// Note that when `max` is `None`, `min` acts as a lower bound but where
1725 /// there is no upper bound. For something like `x{5}` where the min and
1726 /// max are equivalent, `min` will be set to `5` and `max` will be set to
1727 /// `Some(5)`.
1728 pub max: Option<u32>,
1729 /// Whether this repetition operator is greedy or not. A greedy operator
1730 /// will match as much as it can. A non-greedy operator will match as
1731 /// little as it can.
1732 ///
1733 /// Typically, operators are greedy by default and are only non-greedy when
1734 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1735 /// not. However, this can be inverted via the `U` "ungreedy" flag.
1736 pub greedy: bool,
1737 /// The expression being repeated.
1738 pub sub: Box<Hir>,
1739}
1740
1741impl Repetition {
1742 /// Returns a new repetition with the same `min`, `max` and `greedy`
1743 /// values, but with its sub-expression replaced with the one given.
1744 pub fn with(&self, sub: Hir) -> Repetition {
1745 Repetition {
1746 min: self.min,
1747 max: self.max,
1748 greedy: self.greedy,
1749 sub: Box::new(sub),
1750 }
1751 }
1752}
1753
1754/// A type describing the different flavors of `.`.
1755///
1756/// This type is meant to be used with [`Hir::dot`], which is a convenience
1757/// routine for building HIR values derived from the `.` regex.
1758#[non_exhaustive]
1759#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1760pub enum Dot {
1761 /// Matches the UTF-8 encoding of any Unicode scalar value.
1762 ///
1763 /// This is equivalent to `(?su:.)` and also `\p{any}`.
1764 AnyChar,
1765 /// Matches any byte value.
1766 ///
1767 /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1768 AnyByte,
1769 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1770 ///
1771 /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1772 AnyCharExceptLF,
1773 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1774 /// and `\n`.
1775 ///
1776 /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1777 AnyCharExceptCRLF,
1778 /// Matches any byte value except for `\n`.
1779 ///
1780 /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1781 AnyByteExceptLF,
1782 /// Matches any byte value except for `\r` and `\n`.
1783 ///
1784 /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1785 AnyByteExceptCRLF,
1786}
1787
1788/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1789/// space but heap space proportional to the depth of the total `Hir`.
1790impl Drop for Hir {
1791 fn drop(&mut self) {
1792 use core::mem;
1793
1794 match *self.kind() {
1795 HirKind::Empty
1796 | HirKind::Literal(_)
1797 | HirKind::Class(_)
1798 | HirKind::Look(_) => return,
1799 HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1800 HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1801 return
1802 }
1803 HirKind::Concat(ref x) if x.is_empty() => return,
1804 HirKind::Alternation(ref x) if x.is_empty() => return,
1805 _ => {}
1806 }
1807
1808 let mut stack = vec![mem::replace(self, Hir::empty())];
1809 while let Some(mut expr) = stack.pop() {
1810 match expr.kind {
1811 HirKind::Empty
1812 | HirKind::Literal(_)
1813 | HirKind::Class(_)
1814 | HirKind::Look(_) => {}
1815 HirKind::Capture(ref mut x) => {
1816 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1817 }
1818 HirKind::Repetition(ref mut x) => {
1819 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1820 }
1821 HirKind::Concat(ref mut x) => {
1822 stack.extend(x.drain(..));
1823 }
1824 HirKind::Alternation(ref mut x) => {
1825 stack.extend(x.drain(..));
1826 }
1827 }
1828 }
1829 }
1830}
1831
1832/// A type that collects various properties of an HIR value.
1833///
1834/// Properties are always scalar values and represent meta data that is
1835/// computed inductively on an HIR value. Properties are defined for all
1836/// HIR values.
1837///
1838/// All methods on a `Properties` value take constant time and are meant to
1839/// be cheap to call.
1840#[derive(Clone, Debug, Eq, PartialEq)]
1841pub struct Properties(Box<PropertiesI>);
1842
1843/// The property definition. It is split out so that we can box it, and
1844/// there by make `Properties` use less stack size. This is kind-of important
1845/// because every HIR value has a `Properties` attached to it.
1846///
1847/// This does have the unfortunate consequence that creating any HIR value
1848/// always leads to at least one alloc for properties, but this is generally
1849/// true anyway (for pretty much all HirKinds except for look-arounds).
1850#[derive(Clone, Debug, Eq, PartialEq)]
1851struct PropertiesI {
1852 minimum_len: Option<usize>,
1853 maximum_len: Option<usize>,
1854 look_set: LookSet,
1855 look_set_prefix: LookSet,
1856 look_set_suffix: LookSet,
1857 look_set_prefix_any: LookSet,
1858 look_set_suffix_any: LookSet,
1859 utf8: bool,
1860 explicit_captures_len: usize,
1861 static_explicit_captures_len: Option<usize>,
1862 literal: bool,
1863 alternation_literal: bool,
1864}
1865
1866impl Properties {
1867 /// Returns the length (in bytes) of the smallest string matched by this
1868 /// HIR.
1869 ///
1870 /// A return value of `0` is possible and occurs when the HIR can match an
1871 /// empty string.
1872 ///
1873 /// `None` is returned when there is no minimum length. This occurs in
1874 /// precisely the cases where the HIR matches nothing. i.e., The language
1875 /// the regex matches is empty. An example of such a regex is `\P{any}`.
1876 #[inline]
1877 pub fn minimum_len(&self) -> Option<usize> {
1878 self.0.minimum_len
1879 }
1880
1881 /// Returns the length (in bytes) of the longest string matched by this
1882 /// HIR.
1883 ///
1884 /// A return value of `0` is possible and occurs when nothing longer than
1885 /// the empty string is in the language described by this HIR.
1886 ///
1887 /// `None` is returned when there is no longest matching string. This
1888 /// occurs when the HIR matches nothing or when there is no upper bound on
1889 /// the length of matching strings. Example of such regexes are `\P{any}`
1890 /// (matches nothing) and `a+` (has no upper bound).
1891 #[inline]
1892 pub fn maximum_len(&self) -> Option<usize> {
1893 self.0.maximum_len
1894 }
1895
1896 /// Returns a set of all look-around assertions that appear at least once
1897 /// in this HIR value.
1898 #[inline]
1899 pub fn look_set(&self) -> LookSet {
1900 self.0.look_set
1901 }
1902
1903 /// Returns a set of all look-around assertions that appear as a prefix for
1904 /// this HIR value. That is, the set returned corresponds to the set of
1905 /// assertions that must be passed before matching any bytes in a haystack.
1906 ///
1907 /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
1908 /// if and only if the HIR is fully anchored at the start.
1909 #[inline]
1910 pub fn look_set_prefix(&self) -> LookSet {
1911 self.0.look_set_prefix
1912 }
1913
1914 /// Returns a set of all look-around assertions that appear as a _possible_
1915 /// prefix for this HIR value. That is, the set returned corresponds to the
1916 /// set of assertions that _may_ be passed before matching any bytes in a
1917 /// haystack.
1918 ///
1919 /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
1920 /// true if and only if it's possible for the regex to match through a
1921 /// anchored assertion before consuming any input.
1922 #[inline]
1923 pub fn look_set_prefix_any(&self) -> LookSet {
1924 self.0.look_set_prefix_any
1925 }
1926
1927 /// Returns a set of all look-around assertions that appear as a suffix for
1928 /// this HIR value. That is, the set returned corresponds to the set of
1929 /// assertions that must be passed in order to be considered a match after
1930 /// all other consuming HIR expressions.
1931 ///
1932 /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
1933 /// if and only if the HIR is fully anchored at the end.
1934 #[inline]
1935 pub fn look_set_suffix(&self) -> LookSet {
1936 self.0.look_set_suffix
1937 }
1938
1939 /// Returns a set of all look-around assertions that appear as a _possible_
1940 /// suffix for this HIR value. That is, the set returned corresponds to the
1941 /// set of assertions that _may_ be passed before matching any bytes in a
1942 /// haystack.
1943 ///
1944 /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
1945 /// true if and only if it's possible for the regex to match through a
1946 /// anchored assertion at the end of a match without consuming any input.
1947 #[inline]
1948 pub fn look_set_suffix_any(&self) -> LookSet {
1949 self.0.look_set_suffix_any
1950 }
1951
1952 /// Return true if and only if the corresponding HIR will always match
1953 /// valid UTF-8.
1954 ///
1955 /// When this returns false, then it is possible for this HIR expression to
1956 /// match invalid UTF-8, including by matching between the code units of
1957 /// a single UTF-8 encoded codepoint.
1958 ///
1959 /// Note that this returns true even when the corresponding HIR can match
1960 /// the empty string. Since an empty string can technically appear between
1961 /// UTF-8 code units, it is possible for a match to be reported that splits
1962 /// a codepoint which could in turn be considered matching invalid UTF-8.
1963 /// However, it is generally assumed that such empty matches are handled
1964 /// specially by the search routine if it is absolutely required that
1965 /// matches not split a codepoint.
1966 ///
1967 /// # Example
1968 ///
1969 /// This code example shows the UTF-8 property of a variety of patterns.
1970 ///
1971 /// ```
1972 /// use regex_syntax::{ParserBuilder, parse};
1973 ///
1974 /// // Examples of 'is_utf8() == true'.
1975 /// assert!(parse(r"a")?.properties().is_utf8());
1976 /// assert!(parse(r"[^a]")?.properties().is_utf8());
1977 /// assert!(parse(r".")?.properties().is_utf8());
1978 /// assert!(parse(r"\W")?.properties().is_utf8());
1979 /// assert!(parse(r"\b")?.properties().is_utf8());
1980 /// assert!(parse(r"\B")?.properties().is_utf8());
1981 /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
1982 /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
1983 /// // Unicode mode is enabled by default, and in
1984 /// // that mode, all \x hex escapes are treated as
1985 /// // codepoints. So this actually matches the UTF-8
1986 /// // encoding of U+00FF.
1987 /// assert!(parse(r"\xFF")?.properties().is_utf8());
1988 ///
1989 /// // Now we show examples of 'is_utf8() == false'.
1990 /// // The only way to do this is to force the parser
1991 /// // to permit invalid UTF-8, otherwise all of these
1992 /// // would fail to parse!
1993 /// let parse = |pattern| {
1994 /// ParserBuilder::new().utf8(false).build().parse(pattern)
1995 /// };
1996 /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
1997 /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
1998 /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
1999 /// // Conversely to the equivalent example above,
2000 /// // when Unicode mode is disabled, \x hex escapes
2001 /// // are treated as their raw byte values.
2002 /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2003 /// // Note that just because we disabled UTF-8 in the
2004 /// // parser doesn't mean we still can't use Unicode.
2005 /// // It is enabled by default, so \xFF is still
2006 /// // equivalent to matching the UTF-8 encoding of
2007 /// // U+00FF by default.
2008 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2009 /// // Even though we use raw bytes that individually
2010 /// // are not valid UTF-8, when combined together, the
2011 /// // overall expression *does* match valid UTF-8!
2012 /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2013 ///
2014 /// # Ok::<(), Box<dyn std::error::Error>>(())
2015 /// ```
2016 #[inline]
2017 pub fn is_utf8(&self) -> bool {
2018 self.0.utf8
2019 }
2020
2021 /// Returns the total number of explicit capturing groups in the
2022 /// corresponding HIR.
2023 ///
2024 /// Note that this does not include the implicit capturing group
2025 /// corresponding to the entire match that is typically included by regex
2026 /// engines.
2027 ///
2028 /// # Example
2029 ///
2030 /// This method will return `0` for `a` and `1` for `(a)`:
2031 ///
2032 /// ```
2033 /// use regex_syntax::parse;
2034 ///
2035 /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2036 /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2037 ///
2038 /// # Ok::<(), Box<dyn std::error::Error>>(())
2039 /// ```
2040 #[inline]
2041 pub fn explicit_captures_len(&self) -> usize {
2042 self.0.explicit_captures_len
2043 }
2044
2045 /// Returns the total number of explicit capturing groups that appear in
2046 /// every possible match.
2047 ///
2048 /// If the number of capture groups can vary depending on the match, then
2049 /// this returns `None`. That is, a value is only returned when the number
2050 /// of matching groups is invariant or "static."
2051 ///
2052 /// Note that this does not include the implicit capturing group
2053 /// corresponding to the entire match.
2054 ///
2055 /// # Example
2056 ///
2057 /// This shows a few cases where a static number of capture groups is
2058 /// available and a few cases where it is not.
2059 ///
2060 /// ```
2061 /// use regex_syntax::parse;
2062 ///
2063 /// let len = |pattern| {
2064 /// parse(pattern).map(|h| {
2065 /// h.properties().static_explicit_captures_len()
2066 /// })
2067 /// };
2068 ///
2069 /// assert_eq!(Some(0), len("a")?);
2070 /// assert_eq!(Some(1), len("(a)")?);
2071 /// assert_eq!(Some(1), len("(a)|(b)")?);
2072 /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2073 /// assert_eq!(None, len("(a)|b")?);
2074 /// assert_eq!(None, len("a|(b)")?);
2075 /// assert_eq!(None, len("(b)*")?);
2076 /// assert_eq!(Some(1), len("(b)+")?);
2077 ///
2078 /// # Ok::<(), Box<dyn std::error::Error>>(())
2079 /// ```
2080 #[inline]
2081 pub fn static_explicit_captures_len(&self) -> Option<usize> {
2082 self.0.static_explicit_captures_len
2083 }
2084
2085 /// Return true if and only if this HIR is a simple literal. This is
2086 /// only true when this HIR expression is either itself a `Literal` or a
2087 /// concatenation of only `Literal`s.
2088 ///
2089 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2090 /// the empty string are not (even though they contain sub-expressions that
2091 /// are literals).
2092 #[inline]
2093 pub fn is_literal(&self) -> bool {
2094 self.0.literal
2095 }
2096
2097 /// Return true if and only if this HIR is either a simple literal or an
2098 /// alternation of simple literals. This is only
2099 /// true when this HIR expression is either itself a `Literal` or a
2100 /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2101 ///
2102 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2103 /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2104 /// (even though that contain sub-expressions that are literals).
2105 #[inline]
2106 pub fn is_alternation_literal(&self) -> bool {
2107 self.0.alternation_literal
2108 }
2109
2110 /// Returns the total amount of heap memory usage, in bytes, used by this
2111 /// `Properties` value.
2112 #[inline]
2113 pub fn memory_usage(&self) -> usize {
2114 core::mem::size_of::<PropertiesI>()
2115 }
2116
2117 /// Returns a new set of properties that corresponds to the union of the
2118 /// iterator of properties given.
2119 ///
2120 /// This is useful when one has multiple `Hir` expressions and wants
2121 /// to combine them into a single alternation without constructing the
2122 /// corresponding `Hir`. This routine provides a way of combining the
2123 /// properties of each `Hir` expression into one set of properties
2124 /// representing the union of those expressions.
2125 ///
2126 /// # Example: union with HIRs that never match
2127 ///
2128 /// This example shows that unioning properties together with one that
2129 /// represents a regex that never matches will "poison" certain attributes,
2130 /// like the minimum and maximum lengths.
2131 ///
2132 /// ```
2133 /// use regex_syntax::{hir::Properties, parse};
2134 ///
2135 /// let hir1 = parse("ab?c?")?;
2136 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2137 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2138 ///
2139 /// let hir2 = parse(r"[a&&b]")?;
2140 /// assert_eq!(None, hir2.properties().minimum_len());
2141 /// assert_eq!(None, hir2.properties().maximum_len());
2142 ///
2143 /// let hir3 = parse(r"wxy?z?")?;
2144 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2145 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2146 ///
2147 /// let unioned = Properties::union([
2148 /// hir1.properties(),
2149 /// hir2.properties(),
2150 /// hir3.properties(),
2151 /// ]);
2152 /// assert_eq!(None, unioned.minimum_len());
2153 /// assert_eq!(None, unioned.maximum_len());
2154 ///
2155 /// # Ok::<(), Box<dyn std::error::Error>>(())
2156 /// ```
2157 ///
2158 /// The maximum length can also be "poisoned" by a pattern that has no
2159 /// upper bound on the length of a match. The minimum length remains
2160 /// unaffected:
2161 ///
2162 /// ```
2163 /// use regex_syntax::{hir::Properties, parse};
2164 ///
2165 /// let hir1 = parse("ab?c?")?;
2166 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2167 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2168 ///
2169 /// let hir2 = parse(r"a+")?;
2170 /// assert_eq!(Some(1), hir2.properties().minimum_len());
2171 /// assert_eq!(None, hir2.properties().maximum_len());
2172 ///
2173 /// let hir3 = parse(r"wxy?z?")?;
2174 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2175 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2176 ///
2177 /// let unioned = Properties::union([
2178 /// hir1.properties(),
2179 /// hir2.properties(),
2180 /// hir3.properties(),
2181 /// ]);
2182 /// assert_eq!(Some(1), unioned.minimum_len());
2183 /// assert_eq!(None, unioned.maximum_len());
2184 ///
2185 /// # Ok::<(), Box<dyn std::error::Error>>(())
2186 /// ```
2187 pub fn union<I, P>(props: I) -> Properties
2188 where
2189 I: IntoIterator<Item = P>,
2190 P: core::borrow::Borrow<Properties>,
2191 {
2192 let mut it = props.into_iter().peekable();
2193 // While empty alternations aren't possible, we still behave as if they
2194 // are. When we have an empty alternate, then clearly the look-around
2195 // prefix and suffix is empty. Otherwise, it is the intersection of all
2196 // prefixes and suffixes (respectively) of the branches.
2197 let fix = if it.peek().is_none() {
2198 LookSet::empty()
2199 } else {
2200 LookSet::full()
2201 };
2202 // And also, an empty alternate means we have 0 static capture groups,
2203 // but we otherwise start with the number corresponding to the first
2204 // alternate. If any subsequent alternate has a different number of
2205 // static capture groups, then we overall have a variation and not a
2206 // static number of groups.
2207 let static_explicit_captures_len =
2208 it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2209 // The base case is an empty alternation, which matches nothing.
2210 // Note though that empty alternations aren't possible, because the
2211 // Hir::alternation smart constructor rewrites those as empty character
2212 // classes.
2213 let mut props = PropertiesI {
2214 minimum_len: None,
2215 maximum_len: None,
2216 look_set: LookSet::empty(),
2217 look_set_prefix: fix,
2218 look_set_suffix: fix,
2219 look_set_prefix_any: LookSet::empty(),
2220 look_set_suffix_any: LookSet::empty(),
2221 utf8: true,
2222 explicit_captures_len: 0,
2223 static_explicit_captures_len,
2224 literal: false,
2225 alternation_literal: true,
2226 };
2227 let (mut min_poisoned, mut max_poisoned) = (false, false);
2228 // Handle properties that need to visit every child hir.
2229 for prop in it {
2230 let p = prop.borrow();
2231 props.look_set.set_union(p.look_set());
2232 props.look_set_prefix.set_intersect(p.look_set_prefix());
2233 props.look_set_suffix.set_intersect(p.look_set_suffix());
2234 props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2235 props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2236 props.utf8 = props.utf8 && p.is_utf8();
2237 props.explicit_captures_len = props
2238 .explicit_captures_len
2239 .saturating_add(p.explicit_captures_len());
2240 if props.static_explicit_captures_len
2241 != p.static_explicit_captures_len()
2242 {
2243 props.static_explicit_captures_len = None;
2244 }
2245 props.alternation_literal =
2246 props.alternation_literal && p.is_literal();
2247 if !min_poisoned {
2248 if let Some(xmin) = p.minimum_len() {
2249 if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2250 props.minimum_len = Some(xmin);
2251 }
2252 } else {
2253 props.minimum_len = None;
2254 min_poisoned = true;
2255 }
2256 }
2257 if !max_poisoned {
2258 if let Some(xmax) = p.maximum_len() {
2259 if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2260 props.maximum_len = Some(xmax);
2261 }
2262 } else {
2263 props.maximum_len = None;
2264 max_poisoned = true;
2265 }
2266 }
2267 }
2268 Properties(Box::new(props))
2269 }
2270}
2271
2272impl Properties {
2273 /// Create a new set of HIR properties for an empty regex.
2274 fn empty() -> Properties {
2275 let inner = PropertiesI {
2276 minimum_len: Some(0),
2277 maximum_len: Some(0),
2278 look_set: LookSet::empty(),
2279 look_set_prefix: LookSet::empty(),
2280 look_set_suffix: LookSet::empty(),
2281 look_set_prefix_any: LookSet::empty(),
2282 look_set_suffix_any: LookSet::empty(),
2283 // It is debatable whether an empty regex always matches at valid
2284 // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2285 // it is clearly false. There are, for example, many empty strings
2286 // between the bytes encoding a '☃'.
2287 //
2288 // However, when Unicode mode is enabled, the fundamental atom
2289 // of matching is really a codepoint. And in that scenario, an
2290 // empty regex is defined to only match at valid UTF-8 boundaries
2291 // and to never split a codepoint. It just so happens that this
2292 // enforcement is somewhat tricky to do for regexes that match
2293 // the empty string inside regex engines themselves. It usually
2294 // requires some layer above the regex engine to filter out such
2295 // matches.
2296 //
2297 // In any case, 'true' is really the only coherent option. If it
2298 // were false, for example, then 'a*' would also need to be false
2299 // since it too can match the empty string.
2300 utf8: true,
2301 explicit_captures_len: 0,
2302 static_explicit_captures_len: Some(0),
2303 literal: false,
2304 alternation_literal: false,
2305 };
2306 Properties(Box::new(inner))
2307 }
2308
2309 /// Create a new set of HIR properties for a literal regex.
2310 fn literal(lit: &Literal) -> Properties {
2311 let inner = PropertiesI {
2312 minimum_len: Some(lit.0.len()),
2313 maximum_len: Some(lit.0.len()),
2314 look_set: LookSet::empty(),
2315 look_set_prefix: LookSet::empty(),
2316 look_set_suffix: LookSet::empty(),
2317 look_set_prefix_any: LookSet::empty(),
2318 look_set_suffix_any: LookSet::empty(),
2319 utf8: core::str::from_utf8(&lit.0).is_ok(),
2320 explicit_captures_len: 0,
2321 static_explicit_captures_len: Some(0),
2322 literal: true,
2323 alternation_literal: true,
2324 };
2325 Properties(Box::new(inner))
2326 }
2327
2328 /// Create a new set of HIR properties for a character class.
2329 fn class(class: &Class) -> Properties {
2330 let inner = PropertiesI {
2331 minimum_len: class.minimum_len(),
2332 maximum_len: class.maximum_len(),
2333 look_set: LookSet::empty(),
2334 look_set_prefix: LookSet::empty(),
2335 look_set_suffix: LookSet::empty(),
2336 look_set_prefix_any: LookSet::empty(),
2337 look_set_suffix_any: LookSet::empty(),
2338 utf8: class.is_utf8(),
2339 explicit_captures_len: 0,
2340 static_explicit_captures_len: Some(0),
2341 literal: false,
2342 alternation_literal: false,
2343 };
2344 Properties(Box::new(inner))
2345 }
2346
2347 /// Create a new set of HIR properties for a look-around assertion.
2348 fn look(look: Look) -> Properties {
2349 let inner = PropertiesI {
2350 minimum_len: Some(0),
2351 maximum_len: Some(0),
2352 look_set: LookSet::singleton(look),
2353 look_set_prefix: LookSet::singleton(look),
2354 look_set_suffix: LookSet::singleton(look),
2355 look_set_prefix_any: LookSet::singleton(look),
2356 look_set_suffix_any: LookSet::singleton(look),
2357 // This requires a little explanation. Basically, we don't consider
2358 // matching an empty string to be equivalent to matching invalid
2359 // UTF-8, even though technically matching every empty string will
2360 // split the UTF-8 encoding of a single codepoint when treating a
2361 // UTF-8 encoded string as a sequence of bytes. Our defense here is
2362 // that in such a case, a codepoint should logically be treated as
2363 // the fundamental atom for matching, and thus the only valid match
2364 // points are between codepoints and not bytes.
2365 //
2366 // More practically, this is true here because it's also true
2367 // for 'Hir::empty()', otherwise something like 'a*' would be
2368 // considered to match invalid UTF-8. That in turn makes this
2369 // property borderline useless.
2370 utf8: true,
2371 explicit_captures_len: 0,
2372 static_explicit_captures_len: Some(0),
2373 literal: false,
2374 alternation_literal: false,
2375 };
2376 Properties(Box::new(inner))
2377 }
2378
2379 /// Create a new set of HIR properties for a repetition.
2380 fn repetition(rep: &Repetition) -> Properties {
2381 let p = rep.sub.properties();
2382 let minimum_len = p.minimum_len().map(|child_min| {
2383 let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2384 child_min.saturating_mul(rep_min)
2385 });
2386 let maximum_len = rep.max.and_then(|rep_max| {
2387 let rep_max = usize::try_from(rep_max).ok()?;
2388 let child_max = p.maximum_len()?;
2389 child_max.checked_mul(rep_max)
2390 });
2391
2392 let mut inner = PropertiesI {
2393 minimum_len,
2394 maximum_len,
2395 look_set: p.look_set(),
2396 look_set_prefix: LookSet::empty(),
2397 look_set_suffix: LookSet::empty(),
2398 look_set_prefix_any: p.look_set_prefix_any(),
2399 look_set_suffix_any: p.look_set_suffix_any(),
2400 utf8: p.is_utf8(),
2401 explicit_captures_len: p.explicit_captures_len(),
2402 static_explicit_captures_len: p.static_explicit_captures_len(),
2403 literal: false,
2404 alternation_literal: false,
2405 };
2406 // If the repetition operator can match the empty string, then its
2407 // lookset prefix and suffixes themselves remain empty since they are
2408 // no longer required to match.
2409 if rep.min > 0 {
2410 inner.look_set_prefix = p.look_set_prefix();
2411 inner.look_set_suffix = p.look_set_suffix();
2412 }
2413 // If the static captures len of the sub-expression is not known or is
2414 // zero, then it automatically propagates to the repetition, regardless
2415 // of the repetition. Otherwise, it might change, but only when the
2416 // repetition can match 0 times.
2417 if rep.min == 0
2418 && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2419 {
2420 // If we require a match 0 times, then our captures len is
2421 // guaranteed to be zero. Otherwise, if we *can* match the empty
2422 // string, then it's impossible to know how many captures will be
2423 // in the resulting match.
2424 if rep.max == Some(0) {
2425 inner.static_explicit_captures_len = Some(0);
2426 } else {
2427 inner.static_explicit_captures_len = None;
2428 }
2429 }
2430 Properties(Box::new(inner))
2431 }
2432
2433 /// Create a new set of HIR properties for a capture.
2434 fn capture(capture: &Capture) -> Properties {
2435 let p = capture.sub.properties();
2436 Properties(Box::new(PropertiesI {
2437 explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2438 static_explicit_captures_len: p
2439 .static_explicit_captures_len()
2440 .map(|len| len.saturating_add(1)),
2441 literal: false,
2442 alternation_literal: false,
2443 ..*p.0.clone()
2444 }))
2445 }
2446
2447 /// Create a new set of HIR properties for a concatenation.
2448 fn concat(concat: &[Hir]) -> Properties {
2449 // The base case is an empty concatenation, which matches the empty
2450 // string. Note though that empty concatenations aren't possible,
2451 // because the Hir::concat smart constructor rewrites those as
2452 // Hir::empty.
2453 let mut props = PropertiesI {
2454 minimum_len: Some(0),
2455 maximum_len: Some(0),
2456 look_set: LookSet::empty(),
2457 look_set_prefix: LookSet::empty(),
2458 look_set_suffix: LookSet::empty(),
2459 look_set_prefix_any: LookSet::empty(),
2460 look_set_suffix_any: LookSet::empty(),
2461 utf8: true,
2462 explicit_captures_len: 0,
2463 static_explicit_captures_len: Some(0),
2464 literal: true,
2465 alternation_literal: true,
2466 };
2467 // Handle properties that need to visit every child hir.
2468 for x in concat.iter() {
2469 let p = x.properties();
2470 props.look_set.set_union(p.look_set());
2471 props.utf8 = props.utf8 && p.is_utf8();
2472 props.explicit_captures_len = props
2473 .explicit_captures_len
2474 .saturating_add(p.explicit_captures_len());
2475 props.static_explicit_captures_len = p
2476 .static_explicit_captures_len()
2477 .and_then(|len1| {
2478 Some((len1, props.static_explicit_captures_len?))
2479 })
2480 .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2481 props.literal = props.literal && p.is_literal();
2482 props.alternation_literal =
2483 props.alternation_literal && p.is_alternation_literal();
2484 if let Some(minimum_len) = props.minimum_len {
2485 match p.minimum_len() {
2486 None => props.minimum_len = None,
2487 Some(len) => {
2488 // We use saturating arithmetic here because the
2489 // minimum is just a lower bound. We can't go any
2490 // higher than what our number types permit.
2491 props.minimum_len =
2492 Some(minimum_len.saturating_add(len));
2493 }
2494 }
2495 }
2496 if let Some(maximum_len) = props.maximum_len {
2497 match p.maximum_len() {
2498 None => props.maximum_len = None,
2499 Some(len) => {
2500 props.maximum_len = maximum_len.checked_add(len)
2501 }
2502 }
2503 }
2504 }
2505 // Handle the prefix properties, which only requires visiting
2506 // child exprs until one matches more than the empty string.
2507 let mut it = concat.iter();
2508 while let Some(x) = it.next() {
2509 props.look_set_prefix.set_union(x.properties().look_set_prefix());
2510 props
2511 .look_set_prefix_any
2512 .set_union(x.properties().look_set_prefix_any());
2513 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2514 break;
2515 }
2516 }
2517 // Same thing for the suffix properties, but in reverse.
2518 let mut it = concat.iter().rev();
2519 while let Some(x) = it.next() {
2520 props.look_set_suffix.set_union(x.properties().look_set_suffix());
2521 props
2522 .look_set_suffix_any
2523 .set_union(x.properties().look_set_suffix_any());
2524 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2525 break;
2526 }
2527 }
2528 Properties(Box::new(props))
2529 }
2530
2531 /// Create a new set of HIR properties for a concatenation.
2532 fn alternation(alts: &[Hir]) -> Properties {
2533 Properties::union(alts.iter().map(|hir| hir.properties()))
2534 }
2535}
2536
2537/// A set of look-around assertions.
2538///
2539/// This is useful for efficiently tracking look-around assertions. For
2540/// example, an [`Hir`] provides properties that return `LookSet`s.
2541#[derive(Clone, Copy, Default, Eq, PartialEq)]
2542pub struct LookSet {
2543 /// The underlying representation this set is exposed to make it possible
2544 /// to store it somewhere efficiently. The representation is that
2545 /// of a bitset, where each assertion occupies bit `i` where `i =
2546 /// Look::as_repr()`.
2547 ///
2548 /// Note that users of this internal representation must permit the full
2549 /// range of `u16` values to be represented. For example, even if the
2550 /// current implementation only makes use of the 10 least significant bits,
2551 /// it may use more bits in a future semver compatible release.
2552 pub bits: u16,
2553}
2554
2555impl LookSet {
2556 /// Create an empty set of look-around assertions.
2557 #[inline]
2558 pub fn empty() -> LookSet {
2559 LookSet { bits: 0 }
2560 }
2561
2562 /// Create a full set of look-around assertions.
2563 ///
2564 /// This set contains all possible look-around assertions.
2565 #[inline]
2566 pub fn full() -> LookSet {
2567 LookSet { bits: !0 }
2568 }
2569
2570 /// Create a look-around set containing the look-around assertion given.
2571 ///
2572 /// This is a convenience routine for creating an empty set and inserting
2573 /// one look-around assertions.
2574 #[inline]
2575 pub fn singleton(look: Look) -> LookSet {
2576 LookSet::empty().insert(look)
2577 }
2578
2579 /// Returns the total number of look-around assertions in this set.
2580 #[inline]
2581 pub fn len(self) -> usize {
2582 // OK because max value always fits in a u8, which in turn always
2583 // fits in a usize, regardless of target.
2584 usize::try_from(self.bits.count_ones()).unwrap()
2585 }
2586
2587 /// Returns true if and only if this set is empty.
2588 #[inline]
2589 pub fn is_empty(self) -> bool {
2590 self.len() == 0
2591 }
2592
2593 /// Returns true if and only if the given look-around assertion is in this
2594 /// set.
2595 #[inline]
2596 pub fn contains(self, look: Look) -> bool {
2597 self.bits & look.as_repr() != 0
2598 }
2599
2600 /// Returns true if and only if this set contains any anchor assertions.
2601 /// This includes both "start/end of haystack" and "start/end of line."
2602 #[inline]
2603 pub fn contains_anchor(&self) -> bool {
2604 self.contains_anchor_haystack() || self.contains_anchor_line()
2605 }
2606
2607 /// Returns true if and only if this set contains any "start/end of
2608 /// haystack" anchors. This doesn't include "start/end of line" anchors.
2609 #[inline]
2610 pub fn contains_anchor_haystack(&self) -> bool {
2611 self.contains(Look::Start) || self.contains(Look::End)
2612 }
2613
2614 /// Returns true if and only if this set contains any "start/end of line"
2615 /// anchors. This doesn't include "start/end of haystack" anchors. This
2616 /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2617 #[inline]
2618 pub fn contains_anchor_line(&self) -> bool {
2619 self.contains(Look::StartLF)
2620 || self.contains(Look::EndLF)
2621 || self.contains(Look::StartCRLF)
2622 || self.contains(Look::EndCRLF)
2623 }
2624
2625 /// Returns true if and only if this set contains any "start/end of line"
2626 /// anchors that only treat `\n` as line terminators. This does not include
2627 /// haystack anchors or CRLF aware line anchors.
2628 #[inline]
2629 pub fn contains_anchor_lf(&self) -> bool {
2630 self.contains(Look::StartLF) || self.contains(Look::EndLF)
2631 }
2632
2633 /// Returns true if and only if this set contains any "start/end of line"
2634 /// anchors that are CRLF-aware. This doesn't include "start/end of
2635 /// haystack" or "start/end of line-feed" anchors.
2636 #[inline]
2637 pub fn contains_anchor_crlf(&self) -> bool {
2638 self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2639 }
2640
2641 /// Returns true if and only if this set contains any word boundary or
2642 /// negated word boundary assertions. This include both Unicode and ASCII
2643 /// word boundaries.
2644 #[inline]
2645 pub fn contains_word(self) -> bool {
2646 self.contains_word_unicode() || self.contains_word_ascii()
2647 }
2648
2649 /// Returns true if and only if this set contains any Unicode word boundary
2650 /// or negated Unicode word boundary assertions.
2651 #[inline]
2652 pub fn contains_word_unicode(self) -> bool {
2653 self.contains(Look::WordUnicode)
2654 || self.contains(Look::WordUnicodeNegate)
2655 }
2656
2657 /// Returns true if and only if this set contains any ASCII word boundary
2658 /// or negated ASCII word boundary assertions.
2659 #[inline]
2660 pub fn contains_word_ascii(self) -> bool {
2661 self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate)
2662 }
2663
2664 /// Returns an iterator over all of the look-around assertions in this set.
2665 #[inline]
2666 pub fn iter(self) -> LookSetIter {
2667 LookSetIter { set: self }
2668 }
2669
2670 /// Return a new set that is equivalent to the original, but with the given
2671 /// assertion added to it. If the assertion is already in the set, then the
2672 /// returned set is equivalent to the original.
2673 #[inline]
2674 pub fn insert(self, look: Look) -> LookSet {
2675 LookSet { bits: self.bits | look.as_repr() }
2676 }
2677
2678 /// Updates this set in place with the result of inserting the given
2679 /// assertion into this set.
2680 #[inline]
2681 pub fn set_insert(&mut self, look: Look) {
2682 *self = self.insert(look);
2683 }
2684
2685 /// Return a new set that is equivalent to the original, but with the given
2686 /// assertion removed from it. If the assertion is not in the set, then the
2687 /// returned set is equivalent to the original.
2688 #[inline]
2689 pub fn remove(self, look: Look) -> LookSet {
2690 LookSet { bits: self.bits & !look.as_repr() }
2691 }
2692
2693 /// Updates this set in place with the result of removing the given
2694 /// assertion from this set.
2695 #[inline]
2696 pub fn set_remove(&mut self, look: Look) {
2697 *self = self.remove(look);
2698 }
2699
2700 /// Returns a new set that is the result of subtracting the given set from
2701 /// this set.
2702 #[inline]
2703 pub fn subtract(self, other: LookSet) -> LookSet {
2704 LookSet { bits: self.bits & !other.bits }
2705 }
2706
2707 /// Updates this set in place with the result of subtracting the given set
2708 /// from this set.
2709 #[inline]
2710 pub fn set_subtract(&mut self, other: LookSet) {
2711 *self = self.subtract(other);
2712 }
2713
2714 /// Returns a new set that is the union of this and the one given.
2715 #[inline]
2716 pub fn union(self, other: LookSet) -> LookSet {
2717 LookSet { bits: self.bits | other.bits }
2718 }
2719
2720 /// Updates this set in place with the result of unioning it with the one
2721 /// given.
2722 #[inline]
2723 pub fn set_union(&mut self, other: LookSet) {
2724 *self = self.union(other);
2725 }
2726
2727 /// Returns a new set that is the intersection of this and the one given.
2728 #[inline]
2729 pub fn intersect(self, other: LookSet) -> LookSet {
2730 LookSet { bits: self.bits & other.bits }
2731 }
2732
2733 /// Updates this set in place with the result of intersecting it with the
2734 /// one given.
2735 #[inline]
2736 pub fn set_intersect(&mut self, other: LookSet) {
2737 *self = self.intersect(other);
2738 }
2739
2740 /// Return a `LookSet` from the slice given as a native endian 16-bit
2741 /// integer.
2742 ///
2743 /// # Panics
2744 ///
2745 /// This panics if `slice.len() < 2`.
2746 #[inline]
2747 pub fn read_repr(slice: &[u8]) -> LookSet {
2748 let bits = u16::from_ne_bytes(slice[..2].try_into().unwrap());
2749 LookSet { bits }
2750 }
2751
2752 /// Write a `LookSet` as a native endian 16-bit integer to the beginning
2753 /// of the slice given.
2754 ///
2755 /// # Panics
2756 ///
2757 /// This panics if `slice.len() < 2`.
2758 #[inline]
2759 pub fn write_repr(self, slice: &mut [u8]) {
2760 let raw = self.bits.to_ne_bytes();
2761 slice[0] = raw[0];
2762 slice[1] = raw[1];
2763 }
2764}
2765
2766impl core::fmt::Debug for LookSet {
2767 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2768 if self.is_empty() {
2769 return write!(f, "∅");
2770 }
2771 for look: Look in self.iter() {
2772 write!(f, "{}", look.as_char())?;
2773 }
2774 Ok(())
2775 }
2776}
2777
2778/// An iterator over all look-around assertions in a [`LookSet`].
2779///
2780/// This iterator is created by [`LookSet::iter`].
2781#[derive(Clone, Debug)]
2782pub struct LookSetIter {
2783 set: LookSet,
2784}
2785
2786impl Iterator for LookSetIter {
2787 type Item = Look;
2788
2789 #[inline]
2790 fn next(&mut self) -> Option<Look> {
2791 if self.set.is_empty() {
2792 return None;
2793 }
2794 // We'll never have more than u8::MAX distinct look-around assertions,
2795 // so 'repr' will always fit into a u16.
2796 let repr: u16 = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2797 let look: Look = Look::from_repr(1 << repr)?;
2798 self.set = self.set.remove(look);
2799 Some(look)
2800 }
2801}
2802
2803/// Given a sequence of HIR values where each value corresponds to a Unicode
2804/// class (or an all-ASCII byte class), return a single Unicode class
2805/// corresponding to the union of the classes found.
2806fn class_chars(hirs: &[Hir]) -> Option<Class> {
2807 let mut cls: ClassUnicode = ClassUnicode::new(ranges:vec![]);
2808 for hir: &Hir in hirs.iter() {
2809 match *hir.kind() {
2810 HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2811 cls.union(cls2);
2812 }
2813 HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2814 cls.union(&cls2.to_unicode_class()?);
2815 }
2816 _ => return None,
2817 };
2818 }
2819 Some(Class::Unicode(cls))
2820}
2821
2822/// Given a sequence of HIR values where each value corresponds to a byte class
2823/// (or an all-ASCII Unicode class), return a single byte class corresponding
2824/// to the union of the classes found.
2825fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2826 let mut cls: ClassBytes = ClassBytes::new(ranges:vec![]);
2827 for hir: &Hir in hirs.iter() {
2828 match *hir.kind() {
2829 HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2830 cls.union(&cls2.to_byte_class()?);
2831 }
2832 HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2833 cls.union(cls2);
2834 }
2835 _ => return None,
2836 };
2837 }
2838 Some(Class::Bytes(cls))
2839}
2840
2841/// Given a sequence of HIR values where each value corresponds to a literal
2842/// that is a single `char`, return that sequence of `char`s. Otherwise return
2843/// None. No deduplication is done.
2844fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2845 let mut singletons: Vec = vec![];
2846 for hir: &Hir in hirs.iter() {
2847 let literal: &Box<[u8]> = match *hir.kind() {
2848 HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2849 _ => return None,
2850 };
2851 let ch: char = match crate::debug::utf8_decode(bytes:literal) {
2852 None => return None,
2853 Some(Err(_)) => return None,
2854 Some(Ok(ch: char)) => ch,
2855 };
2856 if literal.len() != ch.len_utf8() {
2857 return None;
2858 }
2859 singletons.push(ch);
2860 }
2861 Some(singletons)
2862}
2863
2864/// Given a sequence of HIR values where each value corresponds to a literal
2865/// that is a single byte, return that sequence of bytes. Otherwise return
2866/// None. No deduplication is done.
2867fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
2868 let mut singletons: Vec = vec![];
2869 for hir: &Hir in hirs.iter() {
2870 let literal: &Box<[u8]> = match *hir.kind() {
2871 HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2872 _ => return None,
2873 };
2874 if literal.len() != 1 {
2875 return None;
2876 }
2877 singletons.push(literal[0]);
2878 }
2879 Some(singletons)
2880}
2881
2882/// Looks for a common prefix in the list of alternation branches given. If one
2883/// is found, then an equivalent but (hopefully) simplified Hir is returned.
2884/// Otherwise, the original given list of branches is returned unmodified.
2885///
2886/// This is not quite as good as it could be. Right now, it requires that
2887/// all branches are 'Concat' expressions. It also doesn't do well with
2888/// literals. For example, given 'foofoo|foobar', it will not refactor it to
2889/// 'foo(?:foo|bar)' because literals are flattened into their own special
2890/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
2891/// instead of a string of bytes because of this. Otherwise, handling the
2892/// current representation in this routine will be pretty gnarly. Sigh.)
2893fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
2894 if hirs.len() <= 1 {
2895 return Err(hirs);
2896 }
2897 let mut prefix = match hirs[0].kind() {
2898 HirKind::Concat(ref xs) => &**xs,
2899 _ => return Err(hirs),
2900 };
2901 if prefix.is_empty() {
2902 return Err(hirs);
2903 }
2904 for h in hirs.iter().skip(1) {
2905 let concat = match h.kind() {
2906 HirKind::Concat(ref xs) => xs,
2907 _ => return Err(hirs),
2908 };
2909 let common_len = prefix
2910 .iter()
2911 .zip(concat.iter())
2912 .take_while(|(x, y)| x == y)
2913 .count();
2914 prefix = &prefix[..common_len];
2915 if prefix.is_empty() {
2916 return Err(hirs);
2917 }
2918 }
2919 let len = prefix.len();
2920 assert_ne!(0, len);
2921 let mut prefix_concat = vec![];
2922 let mut suffix_alts = vec![];
2923 for h in hirs {
2924 let mut concat = match h.into_kind() {
2925 HirKind::Concat(xs) => xs,
2926 // We required all sub-expressions to be
2927 // concats above, so we're only here if we
2928 // have a concat.
2929 _ => unreachable!(),
2930 };
2931 suffix_alts.push(Hir::concat(concat.split_off(len)));
2932 if prefix_concat.is_empty() {
2933 prefix_concat = concat;
2934 }
2935 }
2936 let mut concat = prefix_concat;
2937 concat.push(Hir::alternation(suffix_alts));
2938 Ok(Hir::concat(concat))
2939}
2940
2941#[cfg(test)]
2942mod tests {
2943 use super::*;
2944
2945 fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
2946 let ranges: Vec<ClassUnicodeRange> = ranges
2947 .iter()
2948 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
2949 .collect();
2950 ClassUnicode::new(ranges)
2951 }
2952
2953 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
2954 let ranges: Vec<ClassBytesRange> =
2955 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
2956 ClassBytes::new(ranges)
2957 }
2958
2959 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
2960 cls.iter().map(|x| (x.start(), x.end())).collect()
2961 }
2962
2963 #[cfg(feature = "unicode-case")]
2964 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
2965 let mut cls_ = cls.clone();
2966 cls_.case_fold_simple();
2967 cls_
2968 }
2969
2970 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
2971 let mut cls_ = cls1.clone();
2972 cls_.union(cls2);
2973 cls_
2974 }
2975
2976 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
2977 let mut cls_ = cls1.clone();
2978 cls_.intersect(cls2);
2979 cls_
2980 }
2981
2982 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
2983 let mut cls_ = cls1.clone();
2984 cls_.difference(cls2);
2985 cls_
2986 }
2987
2988 fn usymdifference(
2989 cls1: &ClassUnicode,
2990 cls2: &ClassUnicode,
2991 ) -> ClassUnicode {
2992 let mut cls_ = cls1.clone();
2993 cls_.symmetric_difference(cls2);
2994 cls_
2995 }
2996
2997 fn unegate(cls: &ClassUnicode) -> ClassUnicode {
2998 let mut cls_ = cls.clone();
2999 cls_.negate();
3000 cls_
3001 }
3002
3003 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3004 cls.iter().map(|x| (x.start(), x.end())).collect()
3005 }
3006
3007 fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3008 let mut cls_ = cls.clone();
3009 cls_.case_fold_simple();
3010 cls_
3011 }
3012
3013 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3014 let mut cls_ = cls1.clone();
3015 cls_.union(cls2);
3016 cls_
3017 }
3018
3019 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3020 let mut cls_ = cls1.clone();
3021 cls_.intersect(cls2);
3022 cls_
3023 }
3024
3025 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3026 let mut cls_ = cls1.clone();
3027 cls_.difference(cls2);
3028 cls_
3029 }
3030
3031 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3032 let mut cls_ = cls1.clone();
3033 cls_.symmetric_difference(cls2);
3034 cls_
3035 }
3036
3037 fn bnegate(cls: &ClassBytes) -> ClassBytes {
3038 let mut cls_ = cls.clone();
3039 cls_.negate();
3040 cls_
3041 }
3042
3043 #[test]
3044 fn class_range_canonical_unicode() {
3045 let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3046 assert_eq!('\0', range.start());
3047 assert_eq!('\u{00FF}', range.end());
3048 }
3049
3050 #[test]
3051 fn class_range_canonical_bytes() {
3052 let range = ClassBytesRange::new(b'\xFF', b'\0');
3053 assert_eq!(b'\0', range.start());
3054 assert_eq!(b'\xFF', range.end());
3055 }
3056
3057 #[test]
3058 fn class_canonicalize_unicode() {
3059 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3060 let expected = vec![('a', 'c'), ('x', 'z')];
3061 assert_eq!(expected, uranges(&cls));
3062
3063 let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3064 let expected = vec![('a', 'c'), ('x', 'z')];
3065 assert_eq!(expected, uranges(&cls));
3066
3067 let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3068 let expected = vec![('w', 'z')];
3069 assert_eq!(expected, uranges(&cls));
3070
3071 let cls = uclass(&[
3072 ('c', 'f'),
3073 ('a', 'g'),
3074 ('d', 'j'),
3075 ('a', 'c'),
3076 ('m', 'p'),
3077 ('l', 's'),
3078 ]);
3079 let expected = vec![('a', 'j'), ('l', 's')];
3080 assert_eq!(expected, uranges(&cls));
3081
3082 let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3083 let expected = vec![('u', 'z')];
3084 assert_eq!(expected, uranges(&cls));
3085
3086 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3087 let expected = vec![('\x00', '\u{10FFFF}')];
3088 assert_eq!(expected, uranges(&cls));
3089
3090 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3091 let expected = vec![('a', 'b')];
3092 assert_eq!(expected, uranges(&cls));
3093 }
3094
3095 #[test]
3096 fn class_canonicalize_bytes() {
3097 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3098 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3099 assert_eq!(expected, branges(&cls));
3100
3101 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3102 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3103 assert_eq!(expected, branges(&cls));
3104
3105 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3106 let expected = vec![(b'w', b'z')];
3107 assert_eq!(expected, branges(&cls));
3108
3109 let cls = bclass(&[
3110 (b'c', b'f'),
3111 (b'a', b'g'),
3112 (b'd', b'j'),
3113 (b'a', b'c'),
3114 (b'm', b'p'),
3115 (b'l', b's'),
3116 ]);
3117 let expected = vec![(b'a', b'j'), (b'l', b's')];
3118 assert_eq!(expected, branges(&cls));
3119
3120 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3121 let expected = vec![(b'u', b'z')];
3122 assert_eq!(expected, branges(&cls));
3123
3124 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3125 let expected = vec![(b'\x00', b'\xFF')];
3126 assert_eq!(expected, branges(&cls));
3127
3128 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3129 let expected = vec![(b'a', b'b')];
3130 assert_eq!(expected, branges(&cls));
3131 }
3132
3133 #[test]
3134 #[cfg(feature = "unicode-case")]
3135 fn class_case_fold_unicode() {
3136 let cls = uclass(&[
3137 ('C', 'F'),
3138 ('A', 'G'),
3139 ('D', 'J'),
3140 ('A', 'C'),
3141 ('M', 'P'),
3142 ('L', 'S'),
3143 ('c', 'f'),
3144 ]);
3145 let expected = uclass(&[
3146 ('A', 'J'),
3147 ('L', 'S'),
3148 ('a', 'j'),
3149 ('l', 's'),
3150 ('\u{17F}', '\u{17F}'),
3151 ]);
3152 assert_eq!(expected, ucasefold(&cls));
3153
3154 let cls = uclass(&[('A', 'Z')]);
3155 let expected = uclass(&[
3156 ('A', 'Z'),
3157 ('a', 'z'),
3158 ('\u{17F}', '\u{17F}'),
3159 ('\u{212A}', '\u{212A}'),
3160 ]);
3161 assert_eq!(expected, ucasefold(&cls));
3162
3163 let cls = uclass(&[('a', 'z')]);
3164 let expected = uclass(&[
3165 ('A', 'Z'),
3166 ('a', 'z'),
3167 ('\u{17F}', '\u{17F}'),
3168 ('\u{212A}', '\u{212A}'),
3169 ]);
3170 assert_eq!(expected, ucasefold(&cls));
3171
3172 let cls = uclass(&[('A', 'A'), ('_', '_')]);
3173 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3174 assert_eq!(expected, ucasefold(&cls));
3175
3176 let cls = uclass(&[('A', 'A'), ('=', '=')]);
3177 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3178 assert_eq!(expected, ucasefold(&cls));
3179
3180 let cls = uclass(&[('\x00', '\x10')]);
3181 assert_eq!(cls, ucasefold(&cls));
3182
3183 let cls = uclass(&[('k', 'k')]);
3184 let expected =
3185 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3186 assert_eq!(expected, ucasefold(&cls));
3187
3188 let cls = uclass(&[('@', '@')]);
3189 assert_eq!(cls, ucasefold(&cls));
3190 }
3191
3192 #[test]
3193 #[cfg(not(feature = "unicode-case"))]
3194 fn class_case_fold_unicode_disabled() {
3195 let mut cls = uclass(&[
3196 ('C', 'F'),
3197 ('A', 'G'),
3198 ('D', 'J'),
3199 ('A', 'C'),
3200 ('M', 'P'),
3201 ('L', 'S'),
3202 ('c', 'f'),
3203 ]);
3204 assert!(cls.try_case_fold_simple().is_err());
3205 }
3206
3207 #[test]
3208 #[should_panic]
3209 #[cfg(not(feature = "unicode-case"))]
3210 fn class_case_fold_unicode_disabled_panics() {
3211 let mut cls = uclass(&[
3212 ('C', 'F'),
3213 ('A', 'G'),
3214 ('D', 'J'),
3215 ('A', 'C'),
3216 ('M', 'P'),
3217 ('L', 'S'),
3218 ('c', 'f'),
3219 ]);
3220 cls.case_fold_simple();
3221 }
3222
3223 #[test]
3224 fn class_case_fold_bytes() {
3225 let cls = bclass(&[
3226 (b'C', b'F'),
3227 (b'A', b'G'),
3228 (b'D', b'J'),
3229 (b'A', b'C'),
3230 (b'M', b'P'),
3231 (b'L', b'S'),
3232 (b'c', b'f'),
3233 ]);
3234 let expected =
3235 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3236 assert_eq!(expected, bcasefold(&cls));
3237
3238 let cls = bclass(&[(b'A', b'Z')]);
3239 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3240 assert_eq!(expected, bcasefold(&cls));
3241
3242 let cls = bclass(&[(b'a', b'z')]);
3243 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3244 assert_eq!(expected, bcasefold(&cls));
3245
3246 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3247 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3248 assert_eq!(expected, bcasefold(&cls));
3249
3250 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3251 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3252 assert_eq!(expected, bcasefold(&cls));
3253
3254 let cls = bclass(&[(b'\x00', b'\x10')]);
3255 assert_eq!(cls, bcasefold(&cls));
3256
3257 let cls = bclass(&[(b'k', b'k')]);
3258 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3259 assert_eq!(expected, bcasefold(&cls));
3260
3261 let cls = bclass(&[(b'@', b'@')]);
3262 assert_eq!(cls, bcasefold(&cls));
3263 }
3264
3265 #[test]
3266 fn class_negate_unicode() {
3267 let cls = uclass(&[('a', 'a')]);
3268 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3269 assert_eq!(expected, unegate(&cls));
3270
3271 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3272 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3273 assert_eq!(expected, unegate(&cls));
3274
3275 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3276 let expected = uclass(&[
3277 ('\x00', '\x60'),
3278 ('\x64', '\x77'),
3279 ('\x7B', '\u{10FFFF}'),
3280 ]);
3281 assert_eq!(expected, unegate(&cls));
3282
3283 let cls = uclass(&[('\x00', 'a')]);
3284 let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3285 assert_eq!(expected, unegate(&cls));
3286
3287 let cls = uclass(&[('a', '\u{10FFFF}')]);
3288 let expected = uclass(&[('\x00', '\x60')]);
3289 assert_eq!(expected, unegate(&cls));
3290
3291 let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3292 let expected = uclass(&[]);
3293 assert_eq!(expected, unegate(&cls));
3294
3295 let cls = uclass(&[]);
3296 let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3297 assert_eq!(expected, unegate(&cls));
3298
3299 let cls =
3300 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3301 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3302 assert_eq!(expected, unegate(&cls));
3303
3304 let cls = uclass(&[('\x00', '\u{D7FF}')]);
3305 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3306 assert_eq!(expected, unegate(&cls));
3307
3308 let cls = uclass(&[('\x00', '\u{D7FE}')]);
3309 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3310 assert_eq!(expected, unegate(&cls));
3311
3312 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3313 let expected = uclass(&[('\x00', '\u{D7FF}')]);
3314 assert_eq!(expected, unegate(&cls));
3315
3316 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3317 let expected = uclass(&[('\x00', '\u{E000}')]);
3318 assert_eq!(expected, unegate(&cls));
3319 }
3320
3321 #[test]
3322 fn class_negate_bytes() {
3323 let cls = bclass(&[(b'a', b'a')]);
3324 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3325 assert_eq!(expected, bnegate(&cls));
3326
3327 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3328 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3329 assert_eq!(expected, bnegate(&cls));
3330
3331 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3332 let expected = bclass(&[
3333 (b'\x00', b'\x60'),
3334 (b'\x64', b'\x77'),
3335 (b'\x7B', b'\xFF'),
3336 ]);
3337 assert_eq!(expected, bnegate(&cls));
3338
3339 let cls = bclass(&[(b'\x00', b'a')]);
3340 let expected = bclass(&[(b'\x62', b'\xFF')]);
3341 assert_eq!(expected, bnegate(&cls));
3342
3343 let cls = bclass(&[(b'a', b'\xFF')]);
3344 let expected = bclass(&[(b'\x00', b'\x60')]);
3345 assert_eq!(expected, bnegate(&cls));
3346
3347 let cls = bclass(&[(b'\x00', b'\xFF')]);
3348 let expected = bclass(&[]);
3349 assert_eq!(expected, bnegate(&cls));
3350
3351 let cls = bclass(&[]);
3352 let expected = bclass(&[(b'\x00', b'\xFF')]);
3353 assert_eq!(expected, bnegate(&cls));
3354
3355 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3356 let expected = bclass(&[(b'\xFE', b'\xFE')]);
3357 assert_eq!(expected, bnegate(&cls));
3358 }
3359
3360 #[test]
3361 fn class_union_unicode() {
3362 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3363 let cls2 = uclass(&[('a', 'z')]);
3364 let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3365 assert_eq!(expected, uunion(&cls1, &cls2));
3366 }
3367
3368 #[test]
3369 fn class_union_bytes() {
3370 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3371 let cls2 = bclass(&[(b'a', b'z')]);
3372 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3373 assert_eq!(expected, bunion(&cls1, &cls2));
3374 }
3375
3376 #[test]
3377 fn class_intersect_unicode() {
3378 let cls1 = uclass(&[]);
3379 let cls2 = uclass(&[('a', 'a')]);
3380 let expected = uclass(&[]);
3381 assert_eq!(expected, uintersect(&cls1, &cls2));
3382
3383 let cls1 = uclass(&[('a', 'a')]);
3384 let cls2 = uclass(&[('a', 'a')]);
3385 let expected = uclass(&[('a', 'a')]);
3386 assert_eq!(expected, uintersect(&cls1, &cls2));
3387
3388 let cls1 = uclass(&[('a', 'a')]);
3389 let cls2 = uclass(&[('b', 'b')]);
3390 let expected = uclass(&[]);
3391 assert_eq!(expected, uintersect(&cls1, &cls2));
3392
3393 let cls1 = uclass(&[('a', 'a')]);
3394 let cls2 = uclass(&[('a', 'c')]);
3395 let expected = uclass(&[('a', 'a')]);
3396 assert_eq!(expected, uintersect(&cls1, &cls2));
3397
3398 let cls1 = uclass(&[('a', 'b')]);
3399 let cls2 = uclass(&[('a', 'c')]);
3400 let expected = uclass(&[('a', 'b')]);
3401 assert_eq!(expected, uintersect(&cls1, &cls2));
3402
3403 let cls1 = uclass(&[('a', 'b')]);
3404 let cls2 = uclass(&[('b', 'c')]);
3405 let expected = uclass(&[('b', 'b')]);
3406 assert_eq!(expected, uintersect(&cls1, &cls2));
3407
3408 let cls1 = uclass(&[('a', 'b')]);
3409 let cls2 = uclass(&[('c', 'd')]);
3410 let expected = uclass(&[]);
3411 assert_eq!(expected, uintersect(&cls1, &cls2));
3412
3413 let cls1 = uclass(&[('b', 'c')]);
3414 let cls2 = uclass(&[('a', 'd')]);
3415 let expected = uclass(&[('b', 'c')]);
3416 assert_eq!(expected, uintersect(&cls1, &cls2));
3417
3418 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3419 let cls2 = uclass(&[('a', 'h')]);
3420 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3421 assert_eq!(expected, uintersect(&cls1, &cls2));
3422
3423 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3424 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3425 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3426 assert_eq!(expected, uintersect(&cls1, &cls2));
3427
3428 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3429 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3430 let expected = uclass(&[]);
3431 assert_eq!(expected, uintersect(&cls1, &cls2));
3432
3433 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3434 let cls2 = uclass(&[('h', 'h')]);
3435 let expected = uclass(&[('h', 'h')]);
3436 assert_eq!(expected, uintersect(&cls1, &cls2));
3437
3438 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3439 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3440 let expected = uclass(&[]);
3441 assert_eq!(expected, uintersect(&cls1, &cls2));
3442
3443 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3444 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3445 let expected = uclass(&[('b', 'f')]);
3446 assert_eq!(expected, uintersect(&cls1, &cls2));
3447 }
3448
3449 #[test]
3450 fn class_intersect_bytes() {
3451 let cls1 = bclass(&[]);
3452 let cls2 = bclass(&[(b'a', b'a')]);
3453 let expected = bclass(&[]);
3454 assert_eq!(expected, bintersect(&cls1, &cls2));
3455
3456 let cls1 = bclass(&[(b'a', b'a')]);
3457 let cls2 = bclass(&[(b'a', b'a')]);
3458 let expected = bclass(&[(b'a', b'a')]);
3459 assert_eq!(expected, bintersect(&cls1, &cls2));
3460
3461 let cls1 = bclass(&[(b'a', b'a')]);
3462 let cls2 = bclass(&[(b'b', b'b')]);
3463 let expected = bclass(&[]);
3464 assert_eq!(expected, bintersect(&cls1, &cls2));
3465
3466 let cls1 = bclass(&[(b'a', b'a')]);
3467 let cls2 = bclass(&[(b'a', b'c')]);
3468 let expected = bclass(&[(b'a', b'a')]);
3469 assert_eq!(expected, bintersect(&cls1, &cls2));
3470
3471 let cls1 = bclass(&[(b'a', b'b')]);
3472 let cls2 = bclass(&[(b'a', b'c')]);
3473 let expected = bclass(&[(b'a', b'b')]);
3474 assert_eq!(expected, bintersect(&cls1, &cls2));
3475
3476 let cls1 = bclass(&[(b'a', b'b')]);
3477 let cls2 = bclass(&[(b'b', b'c')]);
3478 let expected = bclass(&[(b'b', b'b')]);
3479 assert_eq!(expected, bintersect(&cls1, &cls2));
3480
3481 let cls1 = bclass(&[(b'a', b'b')]);
3482 let cls2 = bclass(&[(b'c', b'd')]);
3483 let expected = bclass(&[]);
3484 assert_eq!(expected, bintersect(&cls1, &cls2));
3485
3486 let cls1 = bclass(&[(b'b', b'c')]);
3487 let cls2 = bclass(&[(b'a', b'd')]);
3488 let expected = bclass(&[(b'b', b'c')]);
3489 assert_eq!(expected, bintersect(&cls1, &cls2));
3490
3491 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3492 let cls2 = bclass(&[(b'a', b'h')]);
3493 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3494 assert_eq!(expected, bintersect(&cls1, &cls2));
3495
3496 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3497 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3498 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3499 assert_eq!(expected, bintersect(&cls1, &cls2));
3500
3501 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3502 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3503 let expected = bclass(&[]);
3504 assert_eq!(expected, bintersect(&cls1, &cls2));
3505
3506 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3507 let cls2 = bclass(&[(b'h', b'h')]);
3508 let expected = bclass(&[(b'h', b'h')]);
3509 assert_eq!(expected, bintersect(&cls1, &cls2));
3510
3511 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3512 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3513 let expected = bclass(&[]);
3514 assert_eq!(expected, bintersect(&cls1, &cls2));
3515
3516 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3517 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3518 let expected = bclass(&[(b'b', b'f')]);
3519 assert_eq!(expected, bintersect(&cls1, &cls2));
3520 }
3521
3522 #[test]
3523 fn class_difference_unicode() {
3524 let cls1 = uclass(&[('a', 'a')]);
3525 let cls2 = uclass(&[('a', 'a')]);
3526 let expected = uclass(&[]);
3527 assert_eq!(expected, udifference(&cls1, &cls2));
3528
3529 let cls1 = uclass(&[('a', 'a')]);
3530 let cls2 = uclass(&[]);
3531 let expected = uclass(&[('a', 'a')]);
3532 assert_eq!(expected, udifference(&cls1, &cls2));
3533
3534 let cls1 = uclass(&[]);
3535 let cls2 = uclass(&[('a', 'a')]);
3536 let expected = uclass(&[]);
3537 assert_eq!(expected, udifference(&cls1, &cls2));
3538
3539 let cls1 = uclass(&[('a', 'z')]);
3540 let cls2 = uclass(&[('a', 'a')]);
3541 let expected = uclass(&[('b', 'z')]);
3542 assert_eq!(expected, udifference(&cls1, &cls2));
3543
3544 let cls1 = uclass(&[('a', 'z')]);
3545 let cls2 = uclass(&[('z', 'z')]);
3546 let expected = uclass(&[('a', 'y')]);
3547 assert_eq!(expected, udifference(&cls1, &cls2));
3548
3549 let cls1 = uclass(&[('a', 'z')]);
3550 let cls2 = uclass(&[('m', 'm')]);
3551 let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3552 assert_eq!(expected, udifference(&cls1, &cls2));
3553
3554 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3555 let cls2 = uclass(&[('a', 'z')]);
3556 let expected = uclass(&[]);
3557 assert_eq!(expected, udifference(&cls1, &cls2));
3558
3559 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3560 let cls2 = uclass(&[('d', 'v')]);
3561 let expected = uclass(&[('a', 'c')]);
3562 assert_eq!(expected, udifference(&cls1, &cls2));
3563
3564 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3565 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3566 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3567 assert_eq!(expected, udifference(&cls1, &cls2));
3568
3569 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3570 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3571 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3572 assert_eq!(expected, udifference(&cls1, &cls2));
3573
3574 let cls1 = uclass(&[('x', 'z')]);
3575 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3576 let expected = uclass(&[('x', 'z')]);
3577 assert_eq!(expected, udifference(&cls1, &cls2));
3578
3579 let cls1 = uclass(&[('a', 'z')]);
3580 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3581 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3582 assert_eq!(expected, udifference(&cls1, &cls2));
3583 }
3584
3585 #[test]
3586 fn class_difference_bytes() {
3587 let cls1 = bclass(&[(b'a', b'a')]);
3588 let cls2 = bclass(&[(b'a', b'a')]);
3589 let expected = bclass(&[]);
3590 assert_eq!(expected, bdifference(&cls1, &cls2));
3591
3592 let cls1 = bclass(&[(b'a', b'a')]);
3593 let cls2 = bclass(&[]);
3594 let expected = bclass(&[(b'a', b'a')]);
3595 assert_eq!(expected, bdifference(&cls1, &cls2));
3596
3597 let cls1 = bclass(&[]);
3598 let cls2 = bclass(&[(b'a', b'a')]);
3599 let expected = bclass(&[]);
3600 assert_eq!(expected, bdifference(&cls1, &cls2));
3601
3602 let cls1 = bclass(&[(b'a', b'z')]);
3603 let cls2 = bclass(&[(b'a', b'a')]);
3604 let expected = bclass(&[(b'b', b'z')]);
3605 assert_eq!(expected, bdifference(&cls1, &cls2));
3606
3607 let cls1 = bclass(&[(b'a', b'z')]);
3608 let cls2 = bclass(&[(b'z', b'z')]);
3609 let expected = bclass(&[(b'a', b'y')]);
3610 assert_eq!(expected, bdifference(&cls1, &cls2));
3611
3612 let cls1 = bclass(&[(b'a', b'z')]);
3613 let cls2 = bclass(&[(b'm', b'm')]);
3614 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3615 assert_eq!(expected, bdifference(&cls1, &cls2));
3616
3617 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3618 let cls2 = bclass(&[(b'a', b'z')]);
3619 let expected = bclass(&[]);
3620 assert_eq!(expected, bdifference(&cls1, &cls2));
3621
3622 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3623 let cls2 = bclass(&[(b'd', b'v')]);
3624 let expected = bclass(&[(b'a', b'c')]);
3625 assert_eq!(expected, bdifference(&cls1, &cls2));
3626
3627 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3628 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3629 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3630 assert_eq!(expected, bdifference(&cls1, &cls2));
3631
3632 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3633 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3634 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3635 assert_eq!(expected, bdifference(&cls1, &cls2));
3636
3637 let cls1 = bclass(&[(b'x', b'z')]);
3638 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3639 let expected = bclass(&[(b'x', b'z')]);
3640 assert_eq!(expected, bdifference(&cls1, &cls2));
3641
3642 let cls1 = bclass(&[(b'a', b'z')]);
3643 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3644 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3645 assert_eq!(expected, bdifference(&cls1, &cls2));
3646 }
3647
3648 #[test]
3649 fn class_symmetric_difference_unicode() {
3650 let cls1 = uclass(&[('a', 'm')]);
3651 let cls2 = uclass(&[('g', 't')]);
3652 let expected = uclass(&[('a', 'f'), ('n', 't')]);
3653 assert_eq!(expected, usymdifference(&cls1, &cls2));
3654 }
3655
3656 #[test]
3657 fn class_symmetric_difference_bytes() {
3658 let cls1 = bclass(&[(b'a', b'm')]);
3659 let cls2 = bclass(&[(b'g', b't')]);
3660 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3661 assert_eq!(expected, bsymdifference(&cls1, &cls2));
3662 }
3663
3664 // We use a thread with an explicit stack size to test that our destructor
3665 // for Hir can handle arbitrarily sized expressions in constant stack
3666 // space. In case we run on a platform without threads (WASM?), we limit
3667 // this test to Windows/Unix.
3668 #[test]
3669 #[cfg(any(unix, windows))]
3670 fn no_stack_overflow_on_drop() {
3671 use std::thread;
3672
3673 let run = || {
3674 let mut expr = Hir::empty();
3675 for _ in 0..100 {
3676 expr = Hir::capture(Capture {
3677 index: 1,
3678 name: None,
3679 sub: Box::new(expr),
3680 });
3681 expr = Hir::repetition(Repetition {
3682 min: 0,
3683 max: Some(1),
3684 greedy: true,
3685 sub: Box::new(expr),
3686 });
3687
3688 expr = Hir {
3689 kind: HirKind::Concat(vec![expr]),
3690 props: Properties::empty(),
3691 };
3692 expr = Hir {
3693 kind: HirKind::Alternation(vec![expr]),
3694 props: Properties::empty(),
3695 };
3696 }
3697 assert!(!matches!(*expr.kind(), HirKind::Empty));
3698 };
3699
3700 // We run our test on a thread with a small stack size so we can
3701 // force the issue more easily.
3702 //
3703 // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3704 // for context on the specific stack size chosen here.
3705 thread::Builder::new()
3706 .stack_size(16 << 10)
3707 .spawn(run)
3708 .unwrap()
3709 .join()
3710 .unwrap();
3711 }
3712
3713 #[test]
3714 fn look_set_iter() {
3715 let set = LookSet::empty();
3716 assert_eq!(0, set.iter().count());
3717
3718 let set = LookSet::full();
3719 assert_eq!(10, set.iter().count());
3720
3721 let set =
3722 LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3723 assert_eq!(2, set.iter().count());
3724
3725 let set = LookSet::empty().insert(Look::StartLF);
3726 assert_eq!(1, set.iter().count());
3727
3728 let set = LookSet::empty().insert(Look::WordAsciiNegate);
3729 assert_eq!(1, set.iter().count());
3730 }
3731
3732 #[test]
3733 fn look_set_debug() {
3734 let res = format!("{:?}", LookSet::empty());
3735 assert_eq!("∅", res);
3736 let res = format!("{:?}", LookSet::full());
3737 assert_eq!("Az^$rRbB𝛃𝚩", res);
3738 }
3739}
3740