1/*!
2Defines a high-level intermediate (HIR) representation for regular expressions.
3
4The HIR is represented by the [`Hir`] type, and it principally constructed via
5[translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users
6may use the smart constructors defined on `Hir` to build their own by hand. The
7smart constructors simultaneously simplify and "optimize" the HIR, and are also
8the same routines used by translation.
9
10Most regex engines only have an HIR like this, and usually construct it
11directly from the concrete syntax. This crate however first parses the
12concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`,
13as mentioned above. It's done this way to facilitate better error reporting,
14and to have a structured representation of a regex that faithfully represents
15its concrete syntax. Namely, while an `Hir` value can be converted back to an
16equivalent regex pattern string, it is unlikely to look like the original due
17to its simplified structure.
18*/
19
20use core::{char, cmp};
21
22use alloc::{
23 boxed::Box,
24 format,
25 string::{String, ToString},
26 vec,
27 vec::Vec,
28};
29
30use crate::{
31 ast::Span,
32 hir::interval::{Interval, IntervalSet, IntervalSetIter},
33 unicode,
34};
35
36pub use crate::{
37 hir::visitor::{visit, Visitor},
38 unicode::CaseFoldError,
39};
40
41mod interval;
42pub mod literal;
43pub mod print;
44pub mod translate;
45mod visitor;
46
47/// An error that can occur while translating an `Ast` to a `Hir`.
48#[derive(Clone, Debug, Eq, PartialEq)]
49pub struct Error {
50 /// The kind of error.
51 kind: ErrorKind,
52 /// The original pattern that the translator's Ast was parsed from. Every
53 /// span in an error is a valid range into this string.
54 pattern: String,
55 /// The span of this error, derived from the Ast given to the translator.
56 span: Span,
57}
58
59impl Error {
60 /// Return the type of this error.
61 pub fn kind(&self) -> &ErrorKind {
62 &self.kind
63 }
64
65 /// The original pattern string in which this error occurred.
66 ///
67 /// Every span reported by this error is reported in terms of this string.
68 pub fn pattern(&self) -> &str {
69 &self.pattern
70 }
71
72 /// Return the span at which this error occurred.
73 pub fn span(&self) -> &Span {
74 &self.span
75 }
76}
77
78/// The type of an error that occurred while building an `Hir`.
79///
80/// This error type is marked as `non_exhaustive`. This means that adding a
81/// new variant is not considered a breaking change.
82#[non_exhaustive]
83#[derive(Clone, Debug, Eq, PartialEq)]
84pub enum ErrorKind {
85 /// This error occurs when a Unicode feature is used when Unicode
86 /// support is disabled. For example `(?-u:\pL)` would trigger this error.
87 UnicodeNotAllowed,
88 /// This error occurs when translating a pattern that could match a byte
89 /// sequence that isn't UTF-8 and `utf8` was enabled.
90 InvalidUtf8,
91 /// This error occurs when one uses a non-ASCII byte for a line terminator,
92 /// but where Unicode mode is enabled and UTF-8 mode is disabled.
93 InvalidLineTerminator,
94 /// This occurs when an unrecognized Unicode property name could not
95 /// be found.
96 UnicodePropertyNotFound,
97 /// This occurs when an unrecognized Unicode property value could not
98 /// be found.
99 UnicodePropertyValueNotFound,
100 /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
101 /// `\d`) could not be found. This can occur when the `unicode-perl`
102 /// crate feature is not enabled.
103 UnicodePerlClassNotFound,
104 /// This occurs when the Unicode simple case mapping tables are not
105 /// available, and the regular expression required Unicode aware case
106 /// insensitivity.
107 UnicodeCaseUnavailable,
108}
109
110#[cfg(feature = "std")]
111impl std::error::Error for Error {}
112
113impl core::fmt::Display for Error {
114 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
115 crate::error::Formatter::from(self).fmt(f)
116 }
117}
118
119impl core::fmt::Display for ErrorKind {
120 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
121 use self::ErrorKind::*;
122
123 let msg: &str = match *self {
124 UnicodeNotAllowed => "Unicode not allowed here",
125 InvalidUtf8 => "pattern can match invalid UTF-8",
126 InvalidLineTerminator => "invalid line terminator, must be ASCII",
127 UnicodePropertyNotFound => "Unicode property not found",
128 UnicodePropertyValueNotFound => "Unicode property value not found",
129 UnicodePerlClassNotFound => {
130 "Unicode-aware Perl class not found \
131 (make sure the unicode-perl feature is enabled)"
132 }
133 UnicodeCaseUnavailable => {
134 "Unicode-aware case insensitivity matching is not available \
135 (make sure the unicode-case feature is enabled)"
136 }
137 };
138 f.write_str(data:msg)
139 }
140}
141
142/// A high-level intermediate representation (HIR) for a regular expression.
143///
144/// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`].
145/// An `HirKind` indicates what kind of regular expression it is (a literal,
146/// a repetition, a look-around assertion, etc.), where as a `Properties`
147/// describes various facts about the regular expression. For example, whether
148/// it matches UTF-8 or if it matches the empty string.
149///
150/// The HIR of a regular expression represents an intermediate step between
151/// its abstract syntax (a structured description of the concrete syntax) and
152/// an actual regex matcher. The purpose of HIR is to make regular expressions
153/// easier to analyze. In particular, the AST is much more complex than the
154/// HIR. For example, while an AST supports arbitrarily nested character
155/// classes, the HIR will flatten all nested classes into a single set. The HIR
156/// will also "compile away" every flag present in the concrete syntax. For
157/// example, users of HIR expressions never need to worry about case folding;
158/// it is handled automatically by the translator (e.g., by translating
159/// `(?i:A)` to `[aA]`).
160///
161/// The specific type of an HIR expression can be accessed via its `kind`
162/// or `into_kind` methods. This extra level of indirection exists for two
163/// reasons:
164///
165/// 1. Construction of an HIR expression *must* use the constructor methods on
166/// this `Hir` type instead of building the `HirKind` values directly. This
167/// permits construction to enforce invariants like "concatenations always
168/// consist of two or more sub-expressions."
169/// 2. Every HIR expression contains attributes that are defined inductively,
170/// and can be computed cheaply during the construction process. For example,
171/// one such attribute is whether the expression must match at the beginning of
172/// the haystack.
173///
174/// In particular, if you have an `HirKind` value, then there is intentionally
175/// no way to build an `Hir` value from it. You instead need to do case
176/// analysis on the `HirKind` value and build the `Hir` value using its smart
177/// constructors.
178///
179/// # UTF-8
180///
181/// If the HIR was produced by a translator with
182/// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled,
183/// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty
184/// matches.
185///
186/// For empty matches, those can occur at any position. It is the
187/// responsibility of the regex engine to determine whether empty matches are
188/// permitted between the code units of a single codepoint.
189///
190/// # Stack space
191///
192/// This type defines its own destructor that uses constant stack space and
193/// heap space proportional to the size of the HIR.
194///
195/// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
196/// expression pattern string, and uses constant stack space and heap space
197/// proportional to the size of the `Hir`. The regex it prints is guaranteed to
198/// be _semantically_ equivalent to the original concrete syntax, but it may
199/// look very different. (And potentially not practically readable by a human.)
200///
201/// An `Hir`'s `fmt::Debug` implementation currently does not use constant
202/// stack space. The implementation will also suppress some details (such as
203/// the `Properties` inlined into every `Hir` value to make it less noisy).
204#[derive(Clone, Eq, PartialEq)]
205pub struct Hir {
206 /// The underlying HIR kind.
207 kind: HirKind,
208 /// Analysis info about this HIR, computed during construction.
209 props: Properties,
210}
211
212/// Methods for accessing the underlying `HirKind` and `Properties`.
213impl Hir {
214 /// Returns a reference to the underlying HIR kind.
215 pub fn kind(&self) -> &HirKind {
216 &self.kind
217 }
218
219 /// Consumes ownership of this HIR expression and returns its underlying
220 /// `HirKind`.
221 pub fn into_kind(mut self) -> HirKind {
222 core::mem::replace(&mut self.kind, HirKind::Empty)
223 }
224
225 /// Returns the properties computed for this `Hir`.
226 pub fn properties(&self) -> &Properties {
227 &self.props
228 }
229
230 /// Splits this HIR into its constituent parts.
231 ///
232 /// This is useful because `let Hir { kind, props } = hir;` does not work
233 /// because of `Hir`'s custom `Drop` implementation.
234 fn into_parts(mut self) -> (HirKind, Properties) {
235 (
236 core::mem::replace(&mut self.kind, HirKind::Empty),
237 core::mem::replace(&mut self.props, Properties::empty()),
238 )
239 }
240}
241
242/// Smart constructors for HIR values.
243///
244/// These constructors are called "smart" because they do inductive work or
245/// simplifications. For example, calling `Hir::repetition` with a repetition
246/// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind
247/// since it is equivalent to an empty regex. Another example is calling
248/// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll
249/// just get back the original `expr` since it's precisely equivalent.
250///
251/// Smart constructors enable maintaining invariants about the HIR data type
252/// while also simulanteously keeping the representation as simple as possible.
253impl Hir {
254 /// Returns an empty HIR expression.
255 ///
256 /// An empty HIR expression always matches, including the empty string.
257 #[inline]
258 pub fn empty() -> Hir {
259 let props = Properties::empty();
260 Hir { kind: HirKind::Empty, props }
261 }
262
263 /// Returns an HIR expression that can never match anything. That is,
264 /// the size of the set of strings in the language described by the HIR
265 /// returned is `0`.
266 ///
267 /// This is distinct from [`Hir::empty`] in that the empty string matches
268 /// the HIR returned by `Hir::empty`. That is, the set of strings in the
269 /// language describe described by `Hir::empty` is non-empty.
270 ///
271 /// Note that currently, the HIR returned uses an empty character class to
272 /// indicate that nothing can match. An equivalent expression that cannot
273 /// match is an empty alternation, but all such "fail" expressions are
274 /// normalized (via smart constructors) to empty character classes. This is
275 /// because empty character classes can be spelled in the concrete syntax
276 /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but
277 /// empty alternations cannot.
278 #[inline]
279 pub fn fail() -> Hir {
280 let class = Class::Bytes(ClassBytes::empty());
281 let props = Properties::class(&class);
282 // We can't just call Hir::class here because it defers to Hir::fail
283 // in order to canonicalize the Hir value used to represent "cannot
284 // match."
285 Hir { kind: HirKind::Class(class), props }
286 }
287
288 /// Creates a literal HIR expression.
289 ///
290 /// This accepts anything that can be converted into a `Box<[u8]>`.
291 ///
292 /// Note that there is no mechanism for storing a `char` or a `Box<str>`
293 /// in an HIR. Everything is "just bytes." Whether a `Literal` (or
294 /// any HIR node) matches valid UTF-8 exclusively can be queried via
295 /// [`Properties::is_utf8`].
296 ///
297 /// # Example
298 ///
299 /// This example shows that concatenations of `Literal` HIR values will
300 /// automatically get flattened and combined together. So for example, even
301 /// if you concat multiple `Literal` values that are themselves not valid
302 /// UTF-8, they might add up to valid UTF-8. This also demonstrates just
303 /// how "smart" Hir's smart constructors are.
304 ///
305 /// ```
306 /// use regex_syntax::hir::{Hir, HirKind, Literal};
307 ///
308 /// let literals = vec![
309 /// Hir::literal([0xE2]),
310 /// Hir::literal([0x98]),
311 /// Hir::literal([0x83]),
312 /// ];
313 /// // Each literal, on its own, is invalid UTF-8.
314 /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8()));
315 ///
316 /// let concat = Hir::concat(literals);
317 /// // But the concatenation is valid UTF-8!
318 /// assert!(concat.properties().is_utf8());
319 ///
320 /// // And also notice that the literals have been concatenated into a
321 /// // single `Literal`, to the point where there is no explicit `Concat`!
322 /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes())));
323 /// assert_eq!(&expected, concat.kind());
324 /// ```
325 #[inline]
326 pub fn literal<B: Into<Box<[u8]>>>(lit: B) -> Hir {
327 let bytes = lit.into();
328 if bytes.is_empty() {
329 return Hir::empty();
330 }
331
332 let lit = Literal(bytes);
333 let props = Properties::literal(&lit);
334 Hir { kind: HirKind::Literal(lit), props }
335 }
336
337 /// Creates a class HIR expression. The class may either be defined over
338 /// ranges of Unicode codepoints or ranges of raw byte values.
339 ///
340 /// Note that an empty class is permitted. An empty class is equivalent to
341 /// `Hir::fail()`.
342 #[inline]
343 pub fn class(class: Class) -> Hir {
344 if class.is_empty() {
345 return Hir::fail();
346 } else if let Some(bytes) = class.literal() {
347 return Hir::literal(bytes);
348 }
349 let props = Properties::class(&class);
350 Hir { kind: HirKind::Class(class), props }
351 }
352
353 /// Creates a look-around assertion HIR expression.
354 #[inline]
355 pub fn look(look: Look) -> Hir {
356 let props = Properties::look(look);
357 Hir { kind: HirKind::Look(look), props }
358 }
359
360 /// Creates a repetition HIR expression.
361 #[inline]
362 pub fn repetition(mut rep: Repetition) -> Hir {
363 // If the sub-expression of a repetition can only match the empty
364 // string, then we force its maximum to be at most 1.
365 if rep.sub.properties().maximum_len() == Some(0) {
366 rep.min = cmp::min(rep.min, 1);
367 rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1));
368 }
369 // The regex 'a{0}' is always equivalent to the empty regex. This is
370 // true even when 'a' is an expression that never matches anything
371 // (like '\P{any}').
372 //
373 // Additionally, the regex 'a{1}' is always equivalent to 'a'.
374 if rep.min == 0 && rep.max == Some(0) {
375 return Hir::empty();
376 } else if rep.min == 1 && rep.max == Some(1) {
377 return *rep.sub;
378 }
379 let props = Properties::repetition(&rep);
380 Hir { kind: HirKind::Repetition(rep), props }
381 }
382
383 /// Creates a capture HIR expression.
384 ///
385 /// Note that there is no explicit HIR value for a non-capturing group.
386 /// Since a non-capturing group only exists to override precedence in the
387 /// concrete syntax and since an HIR already does its own grouping based on
388 /// what is parsed, there is no need to explicitly represent non-capturing
389 /// groups in the HIR.
390 #[inline]
391 pub fn capture(capture: Capture) -> Hir {
392 let props = Properties::capture(&capture);
393 Hir { kind: HirKind::Capture(capture), props }
394 }
395
396 /// Returns the concatenation of the given expressions.
397 ///
398 /// This attempts to flatten and simplify the concatenation as appropriate.
399 ///
400 /// # Example
401 ///
402 /// This shows a simple example of basic flattening of both concatenations
403 /// and literals.
404 ///
405 /// ```
406 /// use regex_syntax::hir::Hir;
407 ///
408 /// let hir = Hir::concat(vec![
409 /// Hir::concat(vec![
410 /// Hir::literal([b'a']),
411 /// Hir::literal([b'b']),
412 /// Hir::literal([b'c']),
413 /// ]),
414 /// Hir::concat(vec![
415 /// Hir::literal([b'x']),
416 /// Hir::literal([b'y']),
417 /// Hir::literal([b'z']),
418 /// ]),
419 /// ]);
420 /// let expected = Hir::literal("abcxyz".as_bytes());
421 /// assert_eq!(expected, hir);
422 /// ```
423 pub fn concat(subs: Vec<Hir>) -> Hir {
424 // We rebuild the concatenation by simplifying it. Would be nice to do
425 // it in place, but that seems a little tricky?
426 let mut new = vec![];
427 // This gobbles up any adjacent literals in a concatenation and smushes
428 // them together. Basically, when we see a literal, we add its bytes
429 // to 'prior_lit', and whenever we see anything else, we first take
430 // any bytes in 'prior_lit' and add it to the 'new' concatenation.
431 let mut prior_lit: Option<Vec<u8>> = None;
432 for sub in subs {
433 let (kind, props) = sub.into_parts();
434 match kind {
435 HirKind::Literal(Literal(bytes)) => {
436 if let Some(ref mut prior_bytes) = prior_lit {
437 prior_bytes.extend_from_slice(&bytes);
438 } else {
439 prior_lit = Some(bytes.to_vec());
440 }
441 }
442 // We also flatten concats that are direct children of another
443 // concat. We only need to do this one level deep since
444 // Hir::concat is the only way to build concatenations, and so
445 // flattening happens inductively.
446 HirKind::Concat(subs2) => {
447 for sub2 in subs2 {
448 let (kind2, props2) = sub2.into_parts();
449 match kind2 {
450 HirKind::Literal(Literal(bytes)) => {
451 if let Some(ref mut prior_bytes) = prior_lit {
452 prior_bytes.extend_from_slice(&bytes);
453 } else {
454 prior_lit = Some(bytes.to_vec());
455 }
456 }
457 kind2 => {
458 if let Some(prior_bytes) = prior_lit.take() {
459 new.push(Hir::literal(prior_bytes));
460 }
461 new.push(Hir { kind: kind2, props: props2 });
462 }
463 }
464 }
465 }
466 // We can just skip empty HIRs.
467 HirKind::Empty => {}
468 kind => {
469 if let Some(prior_bytes) = prior_lit.take() {
470 new.push(Hir::literal(prior_bytes));
471 }
472 new.push(Hir { kind, props });
473 }
474 }
475 }
476 if let Some(prior_bytes) = prior_lit.take() {
477 new.push(Hir::literal(prior_bytes));
478 }
479 if new.is_empty() {
480 return Hir::empty();
481 } else if new.len() == 1 {
482 return new.pop().unwrap();
483 }
484 let props = Properties::concat(&new);
485 Hir { kind: HirKind::Concat(new), props }
486 }
487
488 /// Returns the alternation of the given expressions.
489 ///
490 /// This flattens and simplifies the alternation as appropriate. This may
491 /// include factoring out common prefixes or even rewriting the alternation
492 /// as a character class.
493 ///
494 /// Note that an empty alternation is equivalent to `Hir::fail()`. (It
495 /// is not possible for one to write an empty alternation, or even an
496 /// alternation with a single sub-expression, in the concrete syntax of a
497 /// regex.)
498 ///
499 /// # Example
500 ///
501 /// This is a simple example showing how an alternation might get
502 /// simplified.
503 ///
504 /// ```
505 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
506 ///
507 /// let hir = Hir::alternation(vec![
508 /// Hir::literal([b'a']),
509 /// Hir::literal([b'b']),
510 /// Hir::literal([b'c']),
511 /// Hir::literal([b'd']),
512 /// Hir::literal([b'e']),
513 /// Hir::literal([b'f']),
514 /// ]);
515 /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([
516 /// ClassUnicodeRange::new('a', 'f'),
517 /// ])));
518 /// assert_eq!(expected, hir);
519 /// ```
520 ///
521 /// And another example showing how common prefixes might get factored
522 /// out.
523 ///
524 /// ```
525 /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange};
526 ///
527 /// let hir = Hir::alternation(vec![
528 /// Hir::concat(vec![
529 /// Hir::literal("abc".as_bytes()),
530 /// Hir::class(Class::Unicode(ClassUnicode::new([
531 /// ClassUnicodeRange::new('A', 'Z'),
532 /// ]))),
533 /// ]),
534 /// Hir::concat(vec![
535 /// Hir::literal("abc".as_bytes()),
536 /// Hir::class(Class::Unicode(ClassUnicode::new([
537 /// ClassUnicodeRange::new('a', 'z'),
538 /// ]))),
539 /// ]),
540 /// ]);
541 /// let expected = Hir::concat(vec![
542 /// Hir::literal("abc".as_bytes()),
543 /// Hir::alternation(vec![
544 /// Hir::class(Class::Unicode(ClassUnicode::new([
545 /// ClassUnicodeRange::new('A', 'Z'),
546 /// ]))),
547 /// Hir::class(Class::Unicode(ClassUnicode::new([
548 /// ClassUnicodeRange::new('a', 'z'),
549 /// ]))),
550 /// ]),
551 /// ]);
552 /// assert_eq!(expected, hir);
553 /// ```
554 ///
555 /// Note that these sorts of simplifications are not guaranteed.
556 pub fn alternation(subs: Vec<Hir>) -> Hir {
557 // We rebuild the alternation by simplifying it. We proceed similarly
558 // as the concatenation case. But in this case, there's no literal
559 // simplification happening. We're just flattening alternations.
560 let mut new = Vec::with_capacity(subs.len());
561 for sub in subs {
562 let (kind, props) = sub.into_parts();
563 match kind {
564 HirKind::Alternation(subs2) => {
565 new.extend(subs2);
566 }
567 kind => {
568 new.push(Hir { kind, props });
569 }
570 }
571 }
572 if new.is_empty() {
573 return Hir::fail();
574 } else if new.len() == 1 {
575 return new.pop().unwrap();
576 }
577 // Now that it's completely flattened, look for the special case of
578 // 'char1|char2|...|charN' and collapse that into a class. Note that
579 // we look for 'char' first and then bytes. The issue here is that if
580 // we find both non-ASCII codepoints and non-ASCII singleton bytes,
581 // then it isn't actually possible to smush them into a single class.
582 // (Because classes are either "all codepoints" or "all bytes." You
583 // can have a class that both matches non-ASCII but valid UTF-8 and
584 // invalid UTF-8.) So we look for all chars and then all bytes, and
585 // don't handle anything else.
586 if let Some(singletons) = singleton_chars(&new) {
587 let it = singletons
588 .into_iter()
589 .map(|ch| ClassUnicodeRange { start: ch, end: ch });
590 return Hir::class(Class::Unicode(ClassUnicode::new(it)));
591 }
592 if let Some(singletons) = singleton_bytes(&new) {
593 let it = singletons
594 .into_iter()
595 .map(|b| ClassBytesRange { start: b, end: b });
596 return Hir::class(Class::Bytes(ClassBytes::new(it)));
597 }
598 // Similar to singleton chars, we can also look for alternations of
599 // classes. Those can be smushed into a single class.
600 if let Some(cls) = class_chars(&new) {
601 return Hir::class(cls);
602 }
603 if let Some(cls) = class_bytes(&new) {
604 return Hir::class(cls);
605 }
606 // Factor out a common prefix if we can, which might potentially
607 // simplify the expression and unlock other optimizations downstream.
608 // It also might generally make NFA matching and DFA construction
609 // faster by reducing the scope of branching in the regex.
610 new = match lift_common_prefix(new) {
611 Ok(hir) => return hir,
612 Err(unchanged) => unchanged,
613 };
614 let props = Properties::alternation(&new);
615 Hir { kind: HirKind::Alternation(new), props }
616 }
617
618 /// Returns an HIR expression for `.`.
619 ///
620 /// * [`Dot::AnyChar`] maps to `(?su-R:.)`.
621 /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`.
622 /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`.
623 /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`.
624 /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`.
625 /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`.
626 ///
627 /// # Example
628 ///
629 /// Note that this is a convenience routine for constructing the correct
630 /// character class based on the value of `Dot`. There is no explicit "dot"
631 /// HIR value. It is just an abbreviation for a common character class.
632 ///
633 /// ```
634 /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange};
635 ///
636 /// let hir = Hir::dot(Dot::AnyByte);
637 /// let expected = Hir::class(Class::Bytes(ClassBytes::new([
638 /// ClassBytesRange::new(0x00, 0xFF),
639 /// ])));
640 /// assert_eq!(expected, hir);
641 /// ```
642 #[inline]
643 pub fn dot(dot: Dot) -> Hir {
644 match dot {
645 Dot::AnyChar => {
646 let mut cls = ClassUnicode::empty();
647 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
648 Hir::class(Class::Unicode(cls))
649 }
650 Dot::AnyByte => {
651 let mut cls = ClassBytes::empty();
652 cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
653 Hir::class(Class::Bytes(cls))
654 }
655 Dot::AnyCharExcept(ch) => {
656 let mut cls =
657 ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]);
658 cls.negate();
659 Hir::class(Class::Unicode(cls))
660 }
661 Dot::AnyCharExceptLF => {
662 let mut cls = ClassUnicode::empty();
663 cls.push(ClassUnicodeRange::new('\0', '\x09'));
664 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
665 Hir::class(Class::Unicode(cls))
666 }
667 Dot::AnyCharExceptCRLF => {
668 let mut cls = ClassUnicode::empty();
669 cls.push(ClassUnicodeRange::new('\0', '\x09'));
670 cls.push(ClassUnicodeRange::new('\x0B', '\x0C'));
671 cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}'));
672 Hir::class(Class::Unicode(cls))
673 }
674 Dot::AnyByteExcept(byte) => {
675 let mut cls =
676 ClassBytes::new([ClassBytesRange::new(byte, byte)]);
677 cls.negate();
678 Hir::class(Class::Bytes(cls))
679 }
680 Dot::AnyByteExceptLF => {
681 let mut cls = ClassBytes::empty();
682 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
683 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
684 Hir::class(Class::Bytes(cls))
685 }
686 Dot::AnyByteExceptCRLF => {
687 let mut cls = ClassBytes::empty();
688 cls.push(ClassBytesRange::new(b'\0', b'\x09'));
689 cls.push(ClassBytesRange::new(b'\x0B', b'\x0C'));
690 cls.push(ClassBytesRange::new(b'\x0E', b'\xFF'));
691 Hir::class(Class::Bytes(cls))
692 }
693 }
694 }
695}
696
697/// The underlying kind of an arbitrary [`Hir`] expression.
698///
699/// An `HirKind` is principally useful for doing case analysis on the type
700/// of a regular expression. If you're looking to build new `Hir` values,
701/// then you _must_ use the smart constructors defined on `Hir`, like
702/// [`Hir::repetition`], to build new `Hir` values. The API intentionally does
703/// not expose any way of building an `Hir` directly from an `HirKind`.
704#[derive(Clone, Debug, Eq, PartialEq)]
705pub enum HirKind {
706 /// The empty regular expression, which matches everything, including the
707 /// empty string.
708 Empty,
709 /// A literalstring that matches exactly these bytes.
710 Literal(Literal),
711 /// A single character class that matches any of the characters in the
712 /// class. A class can either consist of Unicode scalar values as
713 /// characters, or it can use bytes.
714 ///
715 /// A class may be empty. In which case, it matches nothing.
716 Class(Class),
717 /// A look-around assertion. A look-around match always has zero length.
718 Look(Look),
719 /// A repetition operation applied to a sub-expression.
720 Repetition(Repetition),
721 /// A capturing group, which contains a sub-expression.
722 Capture(Capture),
723 /// A concatenation of expressions.
724 ///
725 /// A concatenation matches only if each of its sub-expressions match one
726 /// after the other.
727 ///
728 /// Concatenations are guaranteed by `Hir`'s smart constructors to always
729 /// have at least two sub-expressions.
730 Concat(Vec<Hir>),
731 /// An alternation of expressions.
732 ///
733 /// An alternation matches only if at least one of its sub-expressions
734 /// match. If multiple sub-expressions match, then the leftmost is
735 /// preferred.
736 ///
737 /// Alternations are guaranteed by `Hir`'s smart constructors to always
738 /// have at least two sub-expressions.
739 Alternation(Vec<Hir>),
740}
741
742impl HirKind {
743 /// Returns a slice of this kind's sub-expressions, if any.
744 pub fn subs(&self) -> &[Hir] {
745 use core::slice::from_ref;
746
747 match *self {
748 HirKind::Empty
749 | HirKind::Literal(_)
750 | HirKind::Class(_)
751 | HirKind::Look(_) => &[],
752 HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub),
753 HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub),
754 HirKind::Concat(ref subs: &Vec) => subs,
755 HirKind::Alternation(ref subs: &Vec) => subs,
756 }
757 }
758}
759
760impl core::fmt::Debug for Hir {
761 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
762 self.kind.fmt(f)
763 }
764}
765
766/// Print a display representation of this Hir.
767///
768/// The result of this is a valid regular expression pattern string.
769///
770/// This implementation uses constant stack space and heap space proportional
771/// to the size of the `Hir`.
772impl core::fmt::Display for Hir {
773 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
774 crate::hir::print::Printer::new().print(self, wtr:f)
775 }
776}
777
778/// The high-level intermediate representation of a literal.
779///
780/// A literal corresponds to `0` or more bytes that should be matched
781/// literally. The smart constructors defined on `Hir` will automatically
782/// concatenate adjacent literals into one literal, and will even automatically
783/// replace empty literals with `Hir::empty()`.
784///
785/// Note that despite a literal being represented by a sequence of bytes, its
786/// `Debug` implementation will attempt to print it as a normal string. (That
787/// is, not a sequence of decimal numbers.)
788#[derive(Clone, Eq, PartialEq)]
789pub struct Literal(pub Box<[u8]>);
790
791impl core::fmt::Debug for Literal {
792 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
793 crate::debug::Bytes(&self.0).fmt(f)
794 }
795}
796
797/// The high-level intermediate representation of a character class.
798///
799/// A character class corresponds to a set of characters. A character is either
800/// defined by a Unicode scalar value or a byte.
801///
802/// A character class, regardless of its character type, is represented by a
803/// sequence of non-overlapping non-adjacent ranges of characters.
804///
805/// There are no guarantees about which class variant is used. Generally
806/// speaking, the Unicode variat is used whenever a class needs to contain
807/// non-ASCII Unicode scalar values. But the Unicode variant can be used even
808/// when Unicode mode is disabled. For example, at the time of writing, the
809/// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class
810/// `[a\u00A0]` due to optimizations.
811///
812/// Note that `Bytes` variant may be produced even when it exclusively matches
813/// valid UTF-8. This is because a `Bytes` variant represents an intention by
814/// the author of the regular expression to disable Unicode mode, which in turn
815/// impacts the semantics of case insensitive matching. For example, `(?i)k`
816/// and `(?i-u)k` will not match the same set of strings.
817#[derive(Clone, Eq, PartialEq)]
818pub enum Class {
819 /// A set of characters represented by Unicode scalar values.
820 Unicode(ClassUnicode),
821 /// A set of characters represented by arbitrary bytes (one byte per
822 /// character).
823 Bytes(ClassBytes),
824}
825
826impl Class {
827 /// Apply Unicode simple case folding to this character class, in place.
828 /// The character class will be expanded to include all simple case folded
829 /// character variants.
830 ///
831 /// If this is a byte oriented character class, then this will be limited
832 /// to the ASCII ranges `A-Z` and `a-z`.
833 ///
834 /// # Panics
835 ///
836 /// This routine panics when the case mapping data necessary for this
837 /// routine to complete is unavailable. This occurs when the `unicode-case`
838 /// feature is not enabled and the underlying class is Unicode oriented.
839 ///
840 /// Callers should prefer using `try_case_fold_simple` instead, which will
841 /// return an error instead of panicking.
842 pub fn case_fold_simple(&mut self) {
843 match *self {
844 Class::Unicode(ref mut x) => x.case_fold_simple(),
845 Class::Bytes(ref mut x) => x.case_fold_simple(),
846 }
847 }
848
849 /// Apply Unicode simple case folding to this character class, in place.
850 /// The character class will be expanded to include all simple case folded
851 /// character variants.
852 ///
853 /// If this is a byte oriented character class, then this will be limited
854 /// to the ASCII ranges `A-Z` and `a-z`.
855 ///
856 /// # Error
857 ///
858 /// This routine returns an error when the case mapping data necessary
859 /// for this routine to complete is unavailable. This occurs when the
860 /// `unicode-case` feature is not enabled and the underlying class is
861 /// Unicode oriented.
862 pub fn try_case_fold_simple(
863 &mut self,
864 ) -> core::result::Result<(), CaseFoldError> {
865 match *self {
866 Class::Unicode(ref mut x) => x.try_case_fold_simple()?,
867 Class::Bytes(ref mut x) => x.case_fold_simple(),
868 }
869 Ok(())
870 }
871
872 /// Negate this character class in place.
873 ///
874 /// After completion, this character class will contain precisely the
875 /// characters that weren't previously in the class.
876 pub fn negate(&mut self) {
877 match *self {
878 Class::Unicode(ref mut x) => x.negate(),
879 Class::Bytes(ref mut x) => x.negate(),
880 }
881 }
882
883 /// Returns true if and only if this character class will only ever match
884 /// valid UTF-8.
885 ///
886 /// A character class can match invalid UTF-8 only when the following
887 /// conditions are met:
888 ///
889 /// 1. The translator was configured to permit generating an expression
890 /// that can match invalid UTF-8. (By default, this is disabled.)
891 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
892 /// syntax or in the parser builder. By default, Unicode mode is
893 /// enabled.
894 pub fn is_utf8(&self) -> bool {
895 match *self {
896 Class::Unicode(_) => true,
897 Class::Bytes(ref x) => x.is_ascii(),
898 }
899 }
900
901 /// Returns the length, in bytes, of the smallest string matched by this
902 /// character class.
903 ///
904 /// For non-empty byte oriented classes, this always returns `1`. For
905 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
906 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
907 /// be returned.
908 ///
909 /// # Example
910 ///
911 /// This example shows some examples of regexes and their corresponding
912 /// minimum length, if any.
913 ///
914 /// ```
915 /// use regex_syntax::{hir::Properties, parse};
916 ///
917 /// // The empty string has a min length of 0.
918 /// let hir = parse(r"")?;
919 /// assert_eq!(Some(0), hir.properties().minimum_len());
920 /// // As do other types of regexes that only match the empty string.
921 /// let hir = parse(r"^$\b\B")?;
922 /// assert_eq!(Some(0), hir.properties().minimum_len());
923 /// // A regex that can match the empty string but match more is still 0.
924 /// let hir = parse(r"a*")?;
925 /// assert_eq!(Some(0), hir.properties().minimum_len());
926 /// // A regex that matches nothing has no minimum defined.
927 /// let hir = parse(r"[a&&b]")?;
928 /// assert_eq!(None, hir.properties().minimum_len());
929 /// // Character classes usually have a minimum length of 1.
930 /// let hir = parse(r"\w")?;
931 /// assert_eq!(Some(1), hir.properties().minimum_len());
932 /// // But sometimes Unicode classes might be bigger!
933 /// let hir = parse(r"\p{Cyrillic}")?;
934 /// assert_eq!(Some(2), hir.properties().minimum_len());
935 ///
936 /// # Ok::<(), Box<dyn std::error::Error>>(())
937 /// ```
938 pub fn minimum_len(&self) -> Option<usize> {
939 match *self {
940 Class::Unicode(ref x) => x.minimum_len(),
941 Class::Bytes(ref x) => x.minimum_len(),
942 }
943 }
944
945 /// Returns the length, in bytes, of the longest string matched by this
946 /// character class.
947 ///
948 /// For non-empty byte oriented classes, this always returns `1`. For
949 /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or
950 /// `4`. For empty classes, `None` is returned. It is impossible for `0` to
951 /// be returned.
952 ///
953 /// # Example
954 ///
955 /// This example shows some examples of regexes and their corresponding
956 /// maximum length, if any.
957 ///
958 /// ```
959 /// use regex_syntax::{hir::Properties, parse};
960 ///
961 /// // The empty string has a max length of 0.
962 /// let hir = parse(r"")?;
963 /// assert_eq!(Some(0), hir.properties().maximum_len());
964 /// // As do other types of regexes that only match the empty string.
965 /// let hir = parse(r"^$\b\B")?;
966 /// assert_eq!(Some(0), hir.properties().maximum_len());
967 /// // A regex that matches nothing has no maximum defined.
968 /// let hir = parse(r"[a&&b]")?;
969 /// assert_eq!(None, hir.properties().maximum_len());
970 /// // Bounded repeats work as you expect.
971 /// let hir = parse(r"x{2,10}")?;
972 /// assert_eq!(Some(10), hir.properties().maximum_len());
973 /// // An unbounded repeat means there is no maximum.
974 /// let hir = parse(r"x{2,}")?;
975 /// assert_eq!(None, hir.properties().maximum_len());
976 /// // With Unicode enabled, \w can match up to 4 bytes!
977 /// let hir = parse(r"\w")?;
978 /// assert_eq!(Some(4), hir.properties().maximum_len());
979 /// // Without Unicode enabled, \w matches at most 1 byte.
980 /// let hir = parse(r"(?-u)\w")?;
981 /// assert_eq!(Some(1), hir.properties().maximum_len());
982 ///
983 /// # Ok::<(), Box<dyn std::error::Error>>(())
984 /// ```
985 pub fn maximum_len(&self) -> Option<usize> {
986 match *self {
987 Class::Unicode(ref x) => x.maximum_len(),
988 Class::Bytes(ref x) => x.maximum_len(),
989 }
990 }
991
992 /// Returns true if and only if this character class is empty. That is,
993 /// it has no elements.
994 ///
995 /// An empty character can never match anything, including an empty string.
996 pub fn is_empty(&self) -> bool {
997 match *self {
998 Class::Unicode(ref x) => x.ranges().is_empty(),
999 Class::Bytes(ref x) => x.ranges().is_empty(),
1000 }
1001 }
1002
1003 /// If this class consists of exactly one element (whether a codepoint or a
1004 /// byte), then return it as a literal byte string.
1005 ///
1006 /// If this class is empty or contains more than one element, then `None`
1007 /// is returned.
1008 pub fn literal(&self) -> Option<Vec<u8>> {
1009 match *self {
1010 Class::Unicode(ref x) => x.literal(),
1011 Class::Bytes(ref x) => x.literal(),
1012 }
1013 }
1014}
1015
1016impl core::fmt::Debug for Class {
1017 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1018 use crate::debug::Byte;
1019
1020 let mut fmter: DebugSet<'_, '_> = f.debug_set();
1021 match *self {
1022 Class::Unicode(ref cls: &ClassUnicode) => {
1023 for r: &ClassUnicodeRange in cls.ranges().iter() {
1024 fmter.entry(&(r.start..=r.end));
1025 }
1026 }
1027 Class::Bytes(ref cls: &ClassBytes) => {
1028 for r: &ClassBytesRange in cls.ranges().iter() {
1029 fmter.entry(&(Byte(r.start)..=Byte(r.end)));
1030 }
1031 }
1032 }
1033 fmter.finish()
1034 }
1035}
1036
1037/// A set of characters represented by Unicode scalar values.
1038#[derive(Clone, Debug, Eq, PartialEq)]
1039pub struct ClassUnicode {
1040 set: IntervalSet<ClassUnicodeRange>,
1041}
1042
1043impl ClassUnicode {
1044 /// Create a new class from a sequence of ranges.
1045 ///
1046 /// The given ranges do not need to be in any specific order, and ranges
1047 /// may overlap. Ranges will automatically be sorted into a canonical
1048 /// non-overlapping order.
1049 pub fn new<I>(ranges: I) -> ClassUnicode
1050 where
1051 I: IntoIterator<Item = ClassUnicodeRange>,
1052 {
1053 ClassUnicode { set: IntervalSet::new(ranges) }
1054 }
1055
1056 /// Create a new class with no ranges.
1057 ///
1058 /// An empty class matches nothing. That is, it is equivalent to
1059 /// [`Hir::fail`].
1060 pub fn empty() -> ClassUnicode {
1061 ClassUnicode::new(vec![])
1062 }
1063
1064 /// Add a new range to this set.
1065 pub fn push(&mut self, range: ClassUnicodeRange) {
1066 self.set.push(range);
1067 }
1068
1069 /// Return an iterator over all ranges in this class.
1070 ///
1071 /// The iterator yields ranges in ascending order.
1072 pub fn iter(&self) -> ClassUnicodeIter<'_> {
1073 ClassUnicodeIter(self.set.iter())
1074 }
1075
1076 /// Return the underlying ranges as a slice.
1077 pub fn ranges(&self) -> &[ClassUnicodeRange] {
1078 self.set.intervals()
1079 }
1080
1081 /// Expand this character class such that it contains all case folded
1082 /// characters, according to Unicode's "simple" mapping. For example, if
1083 /// this class consists of the range `a-z`, then applying case folding will
1084 /// result in the class containing both the ranges `a-z` and `A-Z`.
1085 ///
1086 /// # Panics
1087 ///
1088 /// This routine panics when the case mapping data necessary for this
1089 /// routine to complete is unavailable. This occurs when the `unicode-case`
1090 /// feature is not enabled.
1091 ///
1092 /// Callers should prefer using `try_case_fold_simple` instead, which will
1093 /// return an error instead of panicking.
1094 pub fn case_fold_simple(&mut self) {
1095 self.set
1096 .case_fold_simple()
1097 .expect("unicode-case feature must be enabled");
1098 }
1099
1100 /// Expand this character class such that it contains all case folded
1101 /// characters, according to Unicode's "simple" mapping. For example, if
1102 /// this class consists of the range `a-z`, then applying case folding will
1103 /// result in the class containing both the ranges `a-z` and `A-Z`.
1104 ///
1105 /// # Error
1106 ///
1107 /// This routine returns an error when the case mapping data necessary
1108 /// for this routine to complete is unavailable. This occurs when the
1109 /// `unicode-case` feature is not enabled.
1110 pub fn try_case_fold_simple(
1111 &mut self,
1112 ) -> core::result::Result<(), CaseFoldError> {
1113 self.set.case_fold_simple()
1114 }
1115
1116 /// Negate this character class.
1117 ///
1118 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
1119 /// set, then it will not be in this set after negation.
1120 pub fn negate(&mut self) {
1121 self.set.negate();
1122 }
1123
1124 /// Union this character class with the given character class, in place.
1125 pub fn union(&mut self, other: &ClassUnicode) {
1126 self.set.union(&other.set);
1127 }
1128
1129 /// Intersect this character class with the given character class, in
1130 /// place.
1131 pub fn intersect(&mut self, other: &ClassUnicode) {
1132 self.set.intersect(&other.set);
1133 }
1134
1135 /// Subtract the given character class from this character class, in place.
1136 pub fn difference(&mut self, other: &ClassUnicode) {
1137 self.set.difference(&other.set);
1138 }
1139
1140 /// Compute the symmetric difference of the given character classes, in
1141 /// place.
1142 ///
1143 /// This computes the symmetric difference of two character classes. This
1144 /// removes all elements in this class that are also in the given class,
1145 /// but all adds all elements from the given class that aren't in this
1146 /// class. That is, the class will contain all elements in either class,
1147 /// but will not contain any elements that are in both classes.
1148 pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
1149 self.set.symmetric_difference(&other.set);
1150 }
1151
1152 /// Returns true if and only if this character class will either match
1153 /// nothing or only ASCII bytes. Stated differently, this returns false
1154 /// if and only if this class contains a non-ASCII codepoint.
1155 pub fn is_ascii(&self) -> bool {
1156 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
1157 }
1158
1159 /// Returns the length, in bytes, of the smallest string matched by this
1160 /// character class.
1161 ///
1162 /// Returns `None` when the class is empty.
1163 pub fn minimum_len(&self) -> Option<usize> {
1164 let first = self.ranges().get(0)?;
1165 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1166 Some(first.start.len_utf8())
1167 }
1168
1169 /// Returns the length, in bytes, of the longest string matched by this
1170 /// character class.
1171 ///
1172 /// Returns `None` when the class is empty.
1173 pub fn maximum_len(&self) -> Option<usize> {
1174 let last = self.ranges().last()?;
1175 // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8().
1176 Some(last.end.len_utf8())
1177 }
1178
1179 /// If this class consists of exactly one codepoint, then return it as
1180 /// a literal byte string.
1181 ///
1182 /// If this class is empty or contains more than one codepoint, then `None`
1183 /// is returned.
1184 pub fn literal(&self) -> Option<Vec<u8>> {
1185 let rs = self.ranges();
1186 if rs.len() == 1 && rs[0].start == rs[0].end {
1187 Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes())
1188 } else {
1189 None
1190 }
1191 }
1192
1193 /// If this class consists of only ASCII ranges, then return its
1194 /// corresponding and equivalent byte class.
1195 pub fn to_byte_class(&self) -> Option<ClassBytes> {
1196 if !self.is_ascii() {
1197 return None;
1198 }
1199 Some(ClassBytes::new(self.ranges().iter().map(|r| {
1200 // Since we are guaranteed that our codepoint range is ASCII, the
1201 // 'u8::try_from' calls below are guaranteed to be correct.
1202 ClassBytesRange {
1203 start: u8::try_from(r.start).unwrap(),
1204 end: u8::try_from(r.end).unwrap(),
1205 }
1206 })))
1207 }
1208}
1209
1210/// An iterator over all ranges in a Unicode character class.
1211///
1212/// The lifetime `'a` refers to the lifetime of the underlying class.
1213#[derive(Debug)]
1214pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
1215
1216impl<'a> Iterator for ClassUnicodeIter<'a> {
1217 type Item = &'a ClassUnicodeRange;
1218
1219 fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
1220 self.0.next()
1221 }
1222}
1223
1224/// A single range of characters represented by Unicode scalar values.
1225///
1226/// The range is closed. That is, the start and end of the range are included
1227/// in the range.
1228#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1229pub struct ClassUnicodeRange {
1230 start: char,
1231 end: char,
1232}
1233
1234impl core::fmt::Debug for ClassUnicodeRange {
1235 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1236 let start: String = if !self.start.is_whitespace() && !self.start.is_control()
1237 {
1238 self.start.to_string()
1239 } else {
1240 format!("0x{:X}", u32::from(self.start))
1241 };
1242 let end: String = if !self.end.is_whitespace() && !self.end.is_control() {
1243 self.end.to_string()
1244 } else {
1245 format!("0x{:X}", u32::from(self.end))
1246 };
1247 f&mut DebugStruct<'_, '_>.debug_struct("ClassUnicodeRange")
1248 .field("start", &start)
1249 .field(name:"end", &end)
1250 .finish()
1251 }
1252}
1253
1254impl Interval for ClassUnicodeRange {
1255 type Bound = char;
1256
1257 #[inline]
1258 fn lower(&self) -> char {
1259 self.start
1260 }
1261 #[inline]
1262 fn upper(&self) -> char {
1263 self.end
1264 }
1265 #[inline]
1266 fn set_lower(&mut self, bound: char) {
1267 self.start = bound;
1268 }
1269 #[inline]
1270 fn set_upper(&mut self, bound: char) {
1271 self.end = bound;
1272 }
1273
1274 /// Apply simple case folding to this Unicode scalar value range.
1275 ///
1276 /// Additional ranges are appended to the given vector. Canonical ordering
1277 /// is *not* maintained in the given vector.
1278 fn case_fold_simple(
1279 &self,
1280 ranges: &mut Vec<ClassUnicodeRange>,
1281 ) -> Result<(), unicode::CaseFoldError> {
1282 let mut folder = unicode::SimpleCaseFolder::new()?;
1283 if !folder.overlaps(self.start, self.end) {
1284 return Ok(());
1285 }
1286 let (start, end) = (u32::from(self.start), u32::from(self.end));
1287 for cp in (start..=end).filter_map(char::from_u32) {
1288 for &cp_folded in folder.mapping(cp) {
1289 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1290 }
1291 }
1292 Ok(())
1293 }
1294}
1295
1296impl ClassUnicodeRange {
1297 /// Create a new Unicode scalar value range for a character class.
1298 ///
1299 /// The returned range is always in a canonical form. That is, the range
1300 /// returned always satisfies the invariant that `start <= end`.
1301 pub fn new(start: char, end: char) -> ClassUnicodeRange {
1302 ClassUnicodeRange::create(start, end)
1303 }
1304
1305 /// Return the start of this range.
1306 ///
1307 /// The start of a range is always less than or equal to the end of the
1308 /// range.
1309 pub fn start(&self) -> char {
1310 self.start
1311 }
1312
1313 /// Return the end of this range.
1314 ///
1315 /// The end of a range is always greater than or equal to the start of the
1316 /// range.
1317 pub fn end(&self) -> char {
1318 self.end
1319 }
1320
1321 /// Returns the number of codepoints in this range.
1322 pub fn len(&self) -> usize {
1323 let diff = 1 + u32::from(self.end) - u32::from(self.start);
1324 // This is likely to panic in 16-bit targets since a usize can only fit
1325 // 2^16. It's not clear what to do here, other than to return an error
1326 // when building a Unicode class that contains a range whose length
1327 // overflows usize. (Which, to be honest, is probably quite common on
1328 // 16-bit targets. For example, this would imply that '.' and '\p{any}'
1329 // would be impossible to build.)
1330 usize::try_from(diff).expect("char class len fits in usize")
1331 }
1332}
1333
1334/// A set of characters represented by arbitrary bytes.
1335///
1336/// Each byte corresponds to one character.
1337#[derive(Clone, Debug, Eq, PartialEq)]
1338pub struct ClassBytes {
1339 set: IntervalSet<ClassBytesRange>,
1340}
1341
1342impl ClassBytes {
1343 /// Create a new class from a sequence of ranges.
1344 ///
1345 /// The given ranges do not need to be in any specific order, and ranges
1346 /// may overlap. Ranges will automatically be sorted into a canonical
1347 /// non-overlapping order.
1348 pub fn new<I>(ranges: I) -> ClassBytes
1349 where
1350 I: IntoIterator<Item = ClassBytesRange>,
1351 {
1352 ClassBytes { set: IntervalSet::new(ranges) }
1353 }
1354
1355 /// Create a new class with no ranges.
1356 ///
1357 /// An empty class matches nothing. That is, it is equivalent to
1358 /// [`Hir::fail`].
1359 pub fn empty() -> ClassBytes {
1360 ClassBytes::new(vec![])
1361 }
1362
1363 /// Add a new range to this set.
1364 pub fn push(&mut self, range: ClassBytesRange) {
1365 self.set.push(range);
1366 }
1367
1368 /// Return an iterator over all ranges in this class.
1369 ///
1370 /// The iterator yields ranges in ascending order.
1371 pub fn iter(&self) -> ClassBytesIter<'_> {
1372 ClassBytesIter(self.set.iter())
1373 }
1374
1375 /// Return the underlying ranges as a slice.
1376 pub fn ranges(&self) -> &[ClassBytesRange] {
1377 self.set.intervals()
1378 }
1379
1380 /// Expand this character class such that it contains all case folded
1381 /// characters. For example, if this class consists of the range `a-z`,
1382 /// then applying case folding will result in the class containing both the
1383 /// ranges `a-z` and `A-Z`.
1384 ///
1385 /// Note that this only applies ASCII case folding, which is limited to the
1386 /// characters `a-z` and `A-Z`.
1387 pub fn case_fold_simple(&mut self) {
1388 self.set.case_fold_simple().expect("ASCII case folding never fails");
1389 }
1390
1391 /// Negate this byte class.
1392 ///
1393 /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1394 /// will not be in this set after negation.
1395 pub fn negate(&mut self) {
1396 self.set.negate();
1397 }
1398
1399 /// Union this byte class with the given byte class, in place.
1400 pub fn union(&mut self, other: &ClassBytes) {
1401 self.set.union(&other.set);
1402 }
1403
1404 /// Intersect this byte class with the given byte class, in place.
1405 pub fn intersect(&mut self, other: &ClassBytes) {
1406 self.set.intersect(&other.set);
1407 }
1408
1409 /// Subtract the given byte class from this byte class, in place.
1410 pub fn difference(&mut self, other: &ClassBytes) {
1411 self.set.difference(&other.set);
1412 }
1413
1414 /// Compute the symmetric difference of the given byte classes, in place.
1415 ///
1416 /// This computes the symmetric difference of two byte classes. This
1417 /// removes all elements in this class that are also in the given class,
1418 /// but all adds all elements from the given class that aren't in this
1419 /// class. That is, the class will contain all elements in either class,
1420 /// but will not contain any elements that are in both classes.
1421 pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1422 self.set.symmetric_difference(&other.set);
1423 }
1424
1425 /// Returns true if and only if this character class will either match
1426 /// nothing or only ASCII bytes. Stated differently, this returns false
1427 /// if and only if this class contains a non-ASCII byte.
1428 pub fn is_ascii(&self) -> bool {
1429 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1430 }
1431
1432 /// Returns the length, in bytes, of the smallest string matched by this
1433 /// character class.
1434 ///
1435 /// Returns `None` when the class is empty.
1436 pub fn minimum_len(&self) -> Option<usize> {
1437 if self.ranges().is_empty() {
1438 None
1439 } else {
1440 Some(1)
1441 }
1442 }
1443
1444 /// Returns the length, in bytes, of the longest string matched by this
1445 /// character class.
1446 ///
1447 /// Returns `None` when the class is empty.
1448 pub fn maximum_len(&self) -> Option<usize> {
1449 if self.ranges().is_empty() {
1450 None
1451 } else {
1452 Some(1)
1453 }
1454 }
1455
1456 /// If this class consists of exactly one byte, then return it as
1457 /// a literal byte string.
1458 ///
1459 /// If this class is empty or contains more than one byte, then `None`
1460 /// is returned.
1461 pub fn literal(&self) -> Option<Vec<u8>> {
1462 let rs = self.ranges();
1463 if rs.len() == 1 && rs[0].start == rs[0].end {
1464 Some(vec![rs[0].start])
1465 } else {
1466 None
1467 }
1468 }
1469
1470 /// If this class consists of only ASCII ranges, then return its
1471 /// corresponding and equivalent Unicode class.
1472 pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1473 if !self.is_ascii() {
1474 return None;
1475 }
1476 Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1477 // Since we are guaranteed that our byte range is ASCII, the
1478 // 'char::from' calls below are correct and will not erroneously
1479 // convert a raw byte value into its corresponding codepoint.
1480 ClassUnicodeRange {
1481 start: char::from(r.start),
1482 end: char::from(r.end),
1483 }
1484 })))
1485 }
1486}
1487
1488/// An iterator over all ranges in a byte character class.
1489///
1490/// The lifetime `'a` refers to the lifetime of the underlying class.
1491#[derive(Debug)]
1492pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1493
1494impl<'a> Iterator for ClassBytesIter<'a> {
1495 type Item = &'a ClassBytesRange;
1496
1497 fn next(&mut self) -> Option<&'a ClassBytesRange> {
1498 self.0.next()
1499 }
1500}
1501
1502/// A single range of characters represented by arbitrary bytes.
1503///
1504/// The range is closed. That is, the start and end of the range are included
1505/// in the range.
1506#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1507pub struct ClassBytesRange {
1508 start: u8,
1509 end: u8,
1510}
1511
1512impl Interval for ClassBytesRange {
1513 type Bound = u8;
1514
1515 #[inline]
1516 fn lower(&self) -> u8 {
1517 self.start
1518 }
1519 #[inline]
1520 fn upper(&self) -> u8 {
1521 self.end
1522 }
1523 #[inline]
1524 fn set_lower(&mut self, bound: u8) {
1525 self.start = bound;
1526 }
1527 #[inline]
1528 fn set_upper(&mut self, bound: u8) {
1529 self.end = bound;
1530 }
1531
1532 /// Apply simple case folding to this byte range. Only ASCII case mappings
1533 /// (for a-z) are applied.
1534 ///
1535 /// Additional ranges are appended to the given vector. Canonical ordering
1536 /// is *not* maintained in the given vector.
1537 fn case_fold_simple(
1538 &self,
1539 ranges: &mut Vec<ClassBytesRange>,
1540 ) -> Result<(), unicode::CaseFoldError> {
1541 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1542 let lower = cmp::max(self.start, b'a');
1543 let upper = cmp::min(self.end, b'z');
1544 ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1545 }
1546 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1547 let lower = cmp::max(self.start, b'A');
1548 let upper = cmp::min(self.end, b'Z');
1549 ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1550 }
1551 Ok(())
1552 }
1553}
1554
1555impl ClassBytesRange {
1556 /// Create a new byte range for a character class.
1557 ///
1558 /// The returned range is always in a canonical form. That is, the range
1559 /// returned always satisfies the invariant that `start <= end`.
1560 pub fn new(start: u8, end: u8) -> ClassBytesRange {
1561 ClassBytesRange::create(start, end)
1562 }
1563
1564 /// Return the start of this range.
1565 ///
1566 /// The start of a range is always less than or equal to the end of the
1567 /// range.
1568 pub fn start(&self) -> u8 {
1569 self.start
1570 }
1571
1572 /// Return the end of this range.
1573 ///
1574 /// The end of a range is always greater than or equal to the start of the
1575 /// range.
1576 pub fn end(&self) -> u8 {
1577 self.end
1578 }
1579
1580 /// Returns the number of bytes in this range.
1581 pub fn len(&self) -> usize {
1582 usize::from(self.end.checked_sub(self.start).unwrap())
1583 .checked_add(1)
1584 .unwrap()
1585 }
1586}
1587
1588impl core::fmt::Debug for ClassBytesRange {
1589 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1590 f&mut DebugStruct<'_, '_>.debug_struct("ClassBytesRange")
1591 .field("start", &crate::debug::Byte(self.start))
1592 .field(name:"end", &crate::debug::Byte(self.end))
1593 .finish()
1594 }
1595}
1596
1597/// The high-level intermediate representation for a look-around assertion.
1598///
1599/// An assertion match is always zero-length. Also called an "empty match."
1600#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1601pub enum Look {
1602 /// Match the beginning of text. Specifically, this matches at the starting
1603 /// position of the input.
1604 Start = 1 << 0,
1605 /// Match the end of text. Specifically, this matches at the ending
1606 /// position of the input.
1607 End = 1 << 1,
1608 /// Match the beginning of a line or the beginning of text. Specifically,
1609 /// this matches at the starting position of the input, or at the position
1610 /// immediately following a `\n` character.
1611 StartLF = 1 << 2,
1612 /// Match the end of a line or the end of text. Specifically, this matches
1613 /// at the end position of the input, or at the position immediately
1614 /// preceding a `\n` character.
1615 EndLF = 1 << 3,
1616 /// Match the beginning of a line or the beginning of text. Specifically,
1617 /// this matches at the starting position of the input, or at the position
1618 /// immediately following either a `\r` or `\n` character, but never after
1619 /// a `\r` when a `\n` follows.
1620 StartCRLF = 1 << 4,
1621 /// Match the end of a line or the end of text. Specifically, this matches
1622 /// at the end position of the input, or at the position immediately
1623 /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
1624 /// precedes it.
1625 EndCRLF = 1 << 5,
1626 /// Match an ASCII-only word boundary. That is, this matches a position
1627 /// where the left adjacent character and right adjacent character
1628 /// correspond to a word and non-word or a non-word and word character.
1629 WordAscii = 1 << 6,
1630 /// Match an ASCII-only negation of a word boundary.
1631 WordAsciiNegate = 1 << 7,
1632 /// Match a Unicode-aware word boundary. That is, this matches a position
1633 /// where the left adjacent character and right adjacent character
1634 /// correspond to a word and non-word or a non-word and word character.
1635 WordUnicode = 1 << 8,
1636 /// Match a Unicode-aware negation of a word boundary.
1637 WordUnicodeNegate = 1 << 9,
1638 /// Match the start of an ASCII-only word boundary. That is, this matches a
1639 /// position at either the beginning of the haystack or where the previous
1640 /// character is not a word character and the following character is a word
1641 /// character.
1642 WordStartAscii = 1 << 10,
1643 /// Match the end of an ASCII-only word boundary. That is, this matches
1644 /// a position at either the end of the haystack or where the previous
1645 /// character is a word character and the following character is not a word
1646 /// character.
1647 WordEndAscii = 1 << 11,
1648 /// Match the start of a Unicode word boundary. That is, this matches a
1649 /// position at either the beginning of the haystack or where the previous
1650 /// character is not a word character and the following character is a word
1651 /// character.
1652 WordStartUnicode = 1 << 12,
1653 /// Match the end of a Unicode word boundary. That is, this matches a
1654 /// position at either the end of the haystack or where the previous
1655 /// character is a word character and the following character is not a word
1656 /// character.
1657 WordEndUnicode = 1 << 13,
1658 /// Match the start half of an ASCII-only word boundary. That is, this
1659 /// matches a position at either the beginning of the haystack or where the
1660 /// previous character is not a word character.
1661 WordStartHalfAscii = 1 << 14,
1662 /// Match the end half of an ASCII-only word boundary. That is, this
1663 /// matches a position at either the end of the haystack or where the
1664 /// following character is not a word character.
1665 WordEndHalfAscii = 1 << 15,
1666 /// Match the start half of a Unicode word boundary. That is, this matches
1667 /// a position at either the beginning of the haystack or where the
1668 /// previous character is not a word character.
1669 WordStartHalfUnicode = 1 << 16,
1670 /// Match the end half of a Unicode word boundary. That is, this matches
1671 /// a position at either the end of the haystack or where the following
1672 /// character is not a word character.
1673 WordEndHalfUnicode = 1 << 17,
1674}
1675
1676impl Look {
1677 /// Flip the look-around assertion to its equivalent for reverse searches.
1678 /// For example, `StartLF` gets translated to `EndLF`.
1679 ///
1680 /// Some assertions, such as `WordUnicode`, remain the same since they
1681 /// match the same positions regardless of the direction of the search.
1682 #[inline]
1683 pub const fn reversed(self) -> Look {
1684 match self {
1685 Look::Start => Look::End,
1686 Look::End => Look::Start,
1687 Look::StartLF => Look::EndLF,
1688 Look::EndLF => Look::StartLF,
1689 Look::StartCRLF => Look::EndCRLF,
1690 Look::EndCRLF => Look::StartCRLF,
1691 Look::WordAscii => Look::WordAscii,
1692 Look::WordAsciiNegate => Look::WordAsciiNegate,
1693 Look::WordUnicode => Look::WordUnicode,
1694 Look::WordUnicodeNegate => Look::WordUnicodeNegate,
1695 Look::WordStartAscii => Look::WordEndAscii,
1696 Look::WordEndAscii => Look::WordStartAscii,
1697 Look::WordStartUnicode => Look::WordEndUnicode,
1698 Look::WordEndUnicode => Look::WordStartUnicode,
1699 Look::WordStartHalfAscii => Look::WordEndHalfAscii,
1700 Look::WordEndHalfAscii => Look::WordStartHalfAscii,
1701 Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
1702 Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
1703 }
1704 }
1705
1706 /// Return the underlying representation of this look-around enumeration
1707 /// as an integer. Giving the return value to the [`Look::from_repr`]
1708 /// constructor is guaranteed to return the same look-around variant that
1709 /// one started with within a semver compatible release of this crate.
1710 #[inline]
1711 pub const fn as_repr(self) -> u32 {
1712 // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
1713 // actual int.
1714 self as u32
1715 }
1716
1717 /// Given the underlying representation of a `Look` value, return the
1718 /// corresponding `Look` value if the representation is valid. Otherwise
1719 /// `None` is returned.
1720 #[inline]
1721 pub const fn from_repr(repr: u32) -> Option<Look> {
1722 match repr {
1723 0b00_0000_0000_0000_0001 => Some(Look::Start),
1724 0b00_0000_0000_0000_0010 => Some(Look::End),
1725 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
1726 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
1727 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
1728 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
1729 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
1730 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
1731 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
1732 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
1733 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
1734 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
1735 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
1736 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
1737 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
1738 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
1739 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
1740 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
1741 _ => None,
1742 }
1743 }
1744
1745 /// Returns a convenient single codepoint representation of this
1746 /// look-around assertion. Each assertion is guaranteed to be represented
1747 /// by a distinct character.
1748 ///
1749 /// This is useful for succinctly representing a look-around assertion in
1750 /// human friendly but succinct output intended for a programmer working on
1751 /// regex internals.
1752 #[inline]
1753 pub const fn as_char(self) -> char {
1754 match self {
1755 Look::Start => 'A',
1756 Look::End => 'z',
1757 Look::StartLF => '^',
1758 Look::EndLF => '$',
1759 Look::StartCRLF => 'r',
1760 Look::EndCRLF => 'R',
1761 Look::WordAscii => 'b',
1762 Look::WordAsciiNegate => 'B',
1763 Look::WordUnicode => '𝛃',
1764 Look::WordUnicodeNegate => '𝚩',
1765 Look::WordStartAscii => '<',
1766 Look::WordEndAscii => '>',
1767 Look::WordStartUnicode => '〈',
1768 Look::WordEndUnicode => '〉',
1769 Look::WordStartHalfAscii => '◁',
1770 Look::WordEndHalfAscii => '▷',
1771 Look::WordStartHalfUnicode => '◀',
1772 Look::WordEndHalfUnicode => '▶',
1773 }
1774 }
1775}
1776
1777/// The high-level intermediate representation for a capturing group.
1778///
1779/// A capturing group always has an index and a child expression. It may
1780/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
1781/// necessary.
1782///
1783/// Note that there is no explicit representation of a non-capturing group
1784/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
1785/// the recursive structure of the `Hir` itself.
1786#[derive(Clone, Debug, Eq, PartialEq)]
1787pub struct Capture {
1788 /// The capture index of the capture.
1789 pub index: u32,
1790 /// The name of the capture, if it exists.
1791 pub name: Option<Box<str>>,
1792 /// The expression inside the capturing group, which may be empty.
1793 pub sub: Box<Hir>,
1794}
1795
1796/// The high-level intermediate representation of a repetition operator.
1797///
1798/// A repetition operator permits the repetition of an arbitrary
1799/// sub-expression.
1800#[derive(Clone, Debug, Eq, PartialEq)]
1801pub struct Repetition {
1802 /// The minimum range of the repetition.
1803 ///
1804 /// Note that special cases like `?`, `+` and `*` all get translated into
1805 /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
1806 ///
1807 /// When `min` is zero, this expression can match the empty string
1808 /// regardless of what its sub-expression is.
1809 pub min: u32,
1810 /// The maximum range of the repetition.
1811 ///
1812 /// Note that when `max` is `None`, `min` acts as a lower bound but where
1813 /// there is no upper bound. For something like `x{5}` where the min and
1814 /// max are equivalent, `min` will be set to `5` and `max` will be set to
1815 /// `Some(5)`.
1816 pub max: Option<u32>,
1817 /// Whether this repetition operator is greedy or not. A greedy operator
1818 /// will match as much as it can. A non-greedy operator will match as
1819 /// little as it can.
1820 ///
1821 /// Typically, operators are greedy by default and are only non-greedy when
1822 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1823 /// not. However, this can be inverted via the `U` "ungreedy" flag.
1824 pub greedy: bool,
1825 /// The expression being repeated.
1826 pub sub: Box<Hir>,
1827}
1828
1829impl Repetition {
1830 /// Returns a new repetition with the same `min`, `max` and `greedy`
1831 /// values, but with its sub-expression replaced with the one given.
1832 pub fn with(&self, sub: Hir) -> Repetition {
1833 Repetition {
1834 min: self.min,
1835 max: self.max,
1836 greedy: self.greedy,
1837 sub: Box::new(sub),
1838 }
1839 }
1840}
1841
1842/// A type describing the different flavors of `.`.
1843///
1844/// This type is meant to be used with [`Hir::dot`], which is a convenience
1845/// routine for building HIR values derived from the `.` regex.
1846#[non_exhaustive]
1847#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1848pub enum Dot {
1849 /// Matches the UTF-8 encoding of any Unicode scalar value.
1850 ///
1851 /// This is equivalent to `(?su:.)` and also `\p{any}`.
1852 AnyChar,
1853 /// Matches any byte value.
1854 ///
1855 /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`.
1856 AnyByte,
1857 /// Matches the UTF-8 encoding of any Unicode scalar value except for the
1858 /// `char` given.
1859 ///
1860 /// This is equivalent to using `(?u-s:.)` with the line terminator set
1861 /// to a particular ASCII byte. (Because of peculiarities in the regex
1862 /// engines, a line terminator must be a single byte. It follows that when
1863 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1864 /// value. That is, ti must be ASCII.)
1865 ///
1866 /// (This and `AnyCharExceptLF` both exist because of legacy reasons.
1867 /// `AnyCharExceptLF` will be dropped in the next breaking change release.)
1868 AnyCharExcept(char),
1869 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`.
1870 ///
1871 /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`.
1872 AnyCharExceptLF,
1873 /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r`
1874 /// and `\n`.
1875 ///
1876 /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`.
1877 AnyCharExceptCRLF,
1878 /// Matches any byte value except for the `u8` given.
1879 ///
1880 /// This is equivalent to using `(?-us:.)` with the line terminator set
1881 /// to a particular ASCII byte. (Because of peculiarities in the regex
1882 /// engines, a line terminator must be a single byte. It follows that when
1883 /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar
1884 /// value. That is, ti must be ASCII.)
1885 ///
1886 /// (This and `AnyByteExceptLF` both exist because of legacy reasons.
1887 /// `AnyByteExceptLF` will be dropped in the next breaking change release.)
1888 AnyByteExcept(u8),
1889 /// Matches any byte value except for `\n`.
1890 ///
1891 /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`.
1892 AnyByteExceptLF,
1893 /// Matches any byte value except for `\r` and `\n`.
1894 ///
1895 /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`.
1896 AnyByteExceptCRLF,
1897}
1898
1899/// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1900/// space but heap space proportional to the depth of the total `Hir`.
1901impl Drop for Hir {
1902 fn drop(&mut self) {
1903 use core::mem;
1904
1905 match *self.kind() {
1906 HirKind::Empty
1907 | HirKind::Literal(_)
1908 | HirKind::Class(_)
1909 | HirKind::Look(_) => return,
1910 HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
1911 HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
1912 return
1913 }
1914 HirKind::Concat(ref x) if x.is_empty() => return,
1915 HirKind::Alternation(ref x) if x.is_empty() => return,
1916 _ => {}
1917 }
1918
1919 let mut stack = vec![mem::replace(self, Hir::empty())];
1920 while let Some(mut expr) = stack.pop() {
1921 match expr.kind {
1922 HirKind::Empty
1923 | HirKind::Literal(_)
1924 | HirKind::Class(_)
1925 | HirKind::Look(_) => {}
1926 HirKind::Capture(ref mut x) => {
1927 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1928 }
1929 HirKind::Repetition(ref mut x) => {
1930 stack.push(mem::replace(&mut x.sub, Hir::empty()));
1931 }
1932 HirKind::Concat(ref mut x) => {
1933 stack.extend(x.drain(..));
1934 }
1935 HirKind::Alternation(ref mut x) => {
1936 stack.extend(x.drain(..));
1937 }
1938 }
1939 }
1940 }
1941}
1942
1943/// A type that collects various properties of an HIR value.
1944///
1945/// Properties are always scalar values and represent meta data that is
1946/// computed inductively on an HIR value. Properties are defined for all
1947/// HIR values.
1948///
1949/// All methods on a `Properties` value take constant time and are meant to
1950/// be cheap to call.
1951#[derive(Clone, Debug, Eq, PartialEq)]
1952pub struct Properties(Box<PropertiesI>);
1953
1954/// The property definition. It is split out so that we can box it, and
1955/// there by make `Properties` use less stack size. This is kind-of important
1956/// because every HIR value has a `Properties` attached to it.
1957///
1958/// This does have the unfortunate consequence that creating any HIR value
1959/// always leads to at least one alloc for properties, but this is generally
1960/// true anyway (for pretty much all HirKinds except for look-arounds).
1961#[derive(Clone, Debug, Eq, PartialEq)]
1962struct PropertiesI {
1963 minimum_len: Option<usize>,
1964 maximum_len: Option<usize>,
1965 look_set: LookSet,
1966 look_set_prefix: LookSet,
1967 look_set_suffix: LookSet,
1968 look_set_prefix_any: LookSet,
1969 look_set_suffix_any: LookSet,
1970 utf8: bool,
1971 explicit_captures_len: usize,
1972 static_explicit_captures_len: Option<usize>,
1973 literal: bool,
1974 alternation_literal: bool,
1975}
1976
1977impl Properties {
1978 /// Returns the length (in bytes) of the smallest string matched by this
1979 /// HIR.
1980 ///
1981 /// A return value of `0` is possible and occurs when the HIR can match an
1982 /// empty string.
1983 ///
1984 /// `None` is returned when there is no minimum length. This occurs in
1985 /// precisely the cases where the HIR matches nothing. i.e., The language
1986 /// the regex matches is empty. An example of such a regex is `\P{any}`.
1987 #[inline]
1988 pub fn minimum_len(&self) -> Option<usize> {
1989 self.0.minimum_len
1990 }
1991
1992 /// Returns the length (in bytes) of the longest string matched by this
1993 /// HIR.
1994 ///
1995 /// A return value of `0` is possible and occurs when nothing longer than
1996 /// the empty string is in the language described by this HIR.
1997 ///
1998 /// `None` is returned when there is no longest matching string. This
1999 /// occurs when the HIR matches nothing or when there is no upper bound on
2000 /// the length of matching strings. Example of such regexes are `\P{any}`
2001 /// (matches nothing) and `a+` (has no upper bound).
2002 #[inline]
2003 pub fn maximum_len(&self) -> Option<usize> {
2004 self.0.maximum_len
2005 }
2006
2007 /// Returns a set of all look-around assertions that appear at least once
2008 /// in this HIR value.
2009 #[inline]
2010 pub fn look_set(&self) -> LookSet {
2011 self.0.look_set
2012 }
2013
2014 /// Returns a set of all look-around assertions that appear as a prefix for
2015 /// this HIR value. That is, the set returned corresponds to the set of
2016 /// assertions that must be passed before matching any bytes in a haystack.
2017 ///
2018 /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
2019 /// if and only if the HIR is fully anchored at the start.
2020 #[inline]
2021 pub fn look_set_prefix(&self) -> LookSet {
2022 self.0.look_set_prefix
2023 }
2024
2025 /// Returns a set of all look-around assertions that appear as a _possible_
2026 /// prefix for this HIR value. That is, the set returned corresponds to the
2027 /// set of assertions that _may_ be passed before matching any bytes in a
2028 /// haystack.
2029 ///
2030 /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
2031 /// true if and only if it's possible for the regex to match through a
2032 /// anchored assertion before consuming any input.
2033 #[inline]
2034 pub fn look_set_prefix_any(&self) -> LookSet {
2035 self.0.look_set_prefix_any
2036 }
2037
2038 /// Returns a set of all look-around assertions that appear as a suffix for
2039 /// this HIR value. That is, the set returned corresponds to the set of
2040 /// assertions that must be passed in order to be considered a match after
2041 /// all other consuming HIR expressions.
2042 ///
2043 /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
2044 /// if and only if the HIR is fully anchored at the end.
2045 #[inline]
2046 pub fn look_set_suffix(&self) -> LookSet {
2047 self.0.look_set_suffix
2048 }
2049
2050 /// Returns a set of all look-around assertions that appear as a _possible_
2051 /// suffix for this HIR value. That is, the set returned corresponds to the
2052 /// set of assertions that _may_ be passed before matching any bytes in a
2053 /// haystack.
2054 ///
2055 /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
2056 /// true if and only if it's possible for the regex to match through a
2057 /// anchored assertion at the end of a match without consuming any input.
2058 #[inline]
2059 pub fn look_set_suffix_any(&self) -> LookSet {
2060 self.0.look_set_suffix_any
2061 }
2062
2063 /// Return true if and only if the corresponding HIR will always match
2064 /// valid UTF-8.
2065 ///
2066 /// When this returns false, then it is possible for this HIR expression to
2067 /// match invalid UTF-8, including by matching between the code units of
2068 /// a single UTF-8 encoded codepoint.
2069 ///
2070 /// Note that this returns true even when the corresponding HIR can match
2071 /// the empty string. Since an empty string can technically appear between
2072 /// UTF-8 code units, it is possible for a match to be reported that splits
2073 /// a codepoint which could in turn be considered matching invalid UTF-8.
2074 /// However, it is generally assumed that such empty matches are handled
2075 /// specially by the search routine if it is absolutely required that
2076 /// matches not split a codepoint.
2077 ///
2078 /// # Example
2079 ///
2080 /// This code example shows the UTF-8 property of a variety of patterns.
2081 ///
2082 /// ```
2083 /// use regex_syntax::{ParserBuilder, parse};
2084 ///
2085 /// // Examples of 'is_utf8() == true'.
2086 /// assert!(parse(r"a")?.properties().is_utf8());
2087 /// assert!(parse(r"[^a]")?.properties().is_utf8());
2088 /// assert!(parse(r".")?.properties().is_utf8());
2089 /// assert!(parse(r"\W")?.properties().is_utf8());
2090 /// assert!(parse(r"\b")?.properties().is_utf8());
2091 /// assert!(parse(r"\B")?.properties().is_utf8());
2092 /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
2093 /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
2094 /// // Unicode mode is enabled by default, and in
2095 /// // that mode, all \x hex escapes are treated as
2096 /// // codepoints. So this actually matches the UTF-8
2097 /// // encoding of U+00FF.
2098 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2099 ///
2100 /// // Now we show examples of 'is_utf8() == false'.
2101 /// // The only way to do this is to force the parser
2102 /// // to permit invalid UTF-8, otherwise all of these
2103 /// // would fail to parse!
2104 /// let parse = |pattern| {
2105 /// ParserBuilder::new().utf8(false).build().parse(pattern)
2106 /// };
2107 /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
2108 /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
2109 /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
2110 /// // Conversely to the equivalent example above,
2111 /// // when Unicode mode is disabled, \x hex escapes
2112 /// // are treated as their raw byte values.
2113 /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
2114 /// // Note that just because we disabled UTF-8 in the
2115 /// // parser doesn't mean we still can't use Unicode.
2116 /// // It is enabled by default, so \xFF is still
2117 /// // equivalent to matching the UTF-8 encoding of
2118 /// // U+00FF by default.
2119 /// assert!(parse(r"\xFF")?.properties().is_utf8());
2120 /// // Even though we use raw bytes that individually
2121 /// // are not valid UTF-8, when combined together, the
2122 /// // overall expression *does* match valid UTF-8!
2123 /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
2124 ///
2125 /// # Ok::<(), Box<dyn std::error::Error>>(())
2126 /// ```
2127 #[inline]
2128 pub fn is_utf8(&self) -> bool {
2129 self.0.utf8
2130 }
2131
2132 /// Returns the total number of explicit capturing groups in the
2133 /// corresponding HIR.
2134 ///
2135 /// Note that this does not include the implicit capturing group
2136 /// corresponding to the entire match that is typically included by regex
2137 /// engines.
2138 ///
2139 /// # Example
2140 ///
2141 /// This method will return `0` for `a` and `1` for `(a)`:
2142 ///
2143 /// ```
2144 /// use regex_syntax::parse;
2145 ///
2146 /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
2147 /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
2148 ///
2149 /// # Ok::<(), Box<dyn std::error::Error>>(())
2150 /// ```
2151 #[inline]
2152 pub fn explicit_captures_len(&self) -> usize {
2153 self.0.explicit_captures_len
2154 }
2155
2156 /// Returns the total number of explicit capturing groups that appear in
2157 /// every possible match.
2158 ///
2159 /// If the number of capture groups can vary depending on the match, then
2160 /// this returns `None`. That is, a value is only returned when the number
2161 /// of matching groups is invariant or "static."
2162 ///
2163 /// Note that this does not include the implicit capturing group
2164 /// corresponding to the entire match.
2165 ///
2166 /// # Example
2167 ///
2168 /// This shows a few cases where a static number of capture groups is
2169 /// available and a few cases where it is not.
2170 ///
2171 /// ```
2172 /// use regex_syntax::parse;
2173 ///
2174 /// let len = |pattern| {
2175 /// parse(pattern).map(|h| {
2176 /// h.properties().static_explicit_captures_len()
2177 /// })
2178 /// };
2179 ///
2180 /// assert_eq!(Some(0), len("a")?);
2181 /// assert_eq!(Some(1), len("(a)")?);
2182 /// assert_eq!(Some(1), len("(a)|(b)")?);
2183 /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
2184 /// assert_eq!(None, len("(a)|b")?);
2185 /// assert_eq!(None, len("a|(b)")?);
2186 /// assert_eq!(None, len("(b)*")?);
2187 /// assert_eq!(Some(1), len("(b)+")?);
2188 ///
2189 /// # Ok::<(), Box<dyn std::error::Error>>(())
2190 /// ```
2191 #[inline]
2192 pub fn static_explicit_captures_len(&self) -> Option<usize> {
2193 self.0.static_explicit_captures_len
2194 }
2195
2196 /// Return true if and only if this HIR is a simple literal. This is
2197 /// only true when this HIR expression is either itself a `Literal` or a
2198 /// concatenation of only `Literal`s.
2199 ///
2200 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
2201 /// the empty string are not (even though they contain sub-expressions that
2202 /// are literals).
2203 #[inline]
2204 pub fn is_literal(&self) -> bool {
2205 self.0.literal
2206 }
2207
2208 /// Return true if and only if this HIR is either a simple literal or an
2209 /// alternation of simple literals. This is only
2210 /// true when this HIR expression is either itself a `Literal` or a
2211 /// concatenation of only `Literal`s or an alternation of only `Literal`s.
2212 ///
2213 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
2214 /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
2215 /// (even though that contain sub-expressions that are literals).
2216 #[inline]
2217 pub fn is_alternation_literal(&self) -> bool {
2218 self.0.alternation_literal
2219 }
2220
2221 /// Returns the total amount of heap memory usage, in bytes, used by this
2222 /// `Properties` value.
2223 #[inline]
2224 pub fn memory_usage(&self) -> usize {
2225 core::mem::size_of::<PropertiesI>()
2226 }
2227
2228 /// Returns a new set of properties that corresponds to the union of the
2229 /// iterator of properties given.
2230 ///
2231 /// This is useful when one has multiple `Hir` expressions and wants
2232 /// to combine them into a single alternation without constructing the
2233 /// corresponding `Hir`. This routine provides a way of combining the
2234 /// properties of each `Hir` expression into one set of properties
2235 /// representing the union of those expressions.
2236 ///
2237 /// # Example: union with HIRs that never match
2238 ///
2239 /// This example shows that unioning properties together with one that
2240 /// represents a regex that never matches will "poison" certain attributes,
2241 /// like the minimum and maximum lengths.
2242 ///
2243 /// ```
2244 /// use regex_syntax::{hir::Properties, parse};
2245 ///
2246 /// let hir1 = parse("ab?c?")?;
2247 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2248 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2249 ///
2250 /// let hir2 = parse(r"[a&&b]")?;
2251 /// assert_eq!(None, hir2.properties().minimum_len());
2252 /// assert_eq!(None, hir2.properties().maximum_len());
2253 ///
2254 /// let hir3 = parse(r"wxy?z?")?;
2255 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2256 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2257 ///
2258 /// let unioned = Properties::union([
2259 /// hir1.properties(),
2260 /// hir2.properties(),
2261 /// hir3.properties(),
2262 /// ]);
2263 /// assert_eq!(None, unioned.minimum_len());
2264 /// assert_eq!(None, unioned.maximum_len());
2265 ///
2266 /// # Ok::<(), Box<dyn std::error::Error>>(())
2267 /// ```
2268 ///
2269 /// The maximum length can also be "poisoned" by a pattern that has no
2270 /// upper bound on the length of a match. The minimum length remains
2271 /// unaffected:
2272 ///
2273 /// ```
2274 /// use regex_syntax::{hir::Properties, parse};
2275 ///
2276 /// let hir1 = parse("ab?c?")?;
2277 /// assert_eq!(Some(1), hir1.properties().minimum_len());
2278 /// assert_eq!(Some(3), hir1.properties().maximum_len());
2279 ///
2280 /// let hir2 = parse(r"a+")?;
2281 /// assert_eq!(Some(1), hir2.properties().minimum_len());
2282 /// assert_eq!(None, hir2.properties().maximum_len());
2283 ///
2284 /// let hir3 = parse(r"wxy?z?")?;
2285 /// assert_eq!(Some(2), hir3.properties().minimum_len());
2286 /// assert_eq!(Some(4), hir3.properties().maximum_len());
2287 ///
2288 /// let unioned = Properties::union([
2289 /// hir1.properties(),
2290 /// hir2.properties(),
2291 /// hir3.properties(),
2292 /// ]);
2293 /// assert_eq!(Some(1), unioned.minimum_len());
2294 /// assert_eq!(None, unioned.maximum_len());
2295 ///
2296 /// # Ok::<(), Box<dyn std::error::Error>>(())
2297 /// ```
2298 pub fn union<I, P>(props: I) -> Properties
2299 where
2300 I: IntoIterator<Item = P>,
2301 P: core::borrow::Borrow<Properties>,
2302 {
2303 let mut it = props.into_iter().peekable();
2304 // While empty alternations aren't possible, we still behave as if they
2305 // are. When we have an empty alternate, then clearly the look-around
2306 // prefix and suffix is empty. Otherwise, it is the intersection of all
2307 // prefixes and suffixes (respectively) of the branches.
2308 let fix = if it.peek().is_none() {
2309 LookSet::empty()
2310 } else {
2311 LookSet::full()
2312 };
2313 // And also, an empty alternate means we have 0 static capture groups,
2314 // but we otherwise start with the number corresponding to the first
2315 // alternate. If any subsequent alternate has a different number of
2316 // static capture groups, then we overall have a variation and not a
2317 // static number of groups.
2318 let static_explicit_captures_len =
2319 it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
2320 // The base case is an empty alternation, which matches nothing.
2321 // Note though that empty alternations aren't possible, because the
2322 // Hir::alternation smart constructor rewrites those as empty character
2323 // classes.
2324 let mut props = PropertiesI {
2325 minimum_len: None,
2326 maximum_len: None,
2327 look_set: LookSet::empty(),
2328 look_set_prefix: fix,
2329 look_set_suffix: fix,
2330 look_set_prefix_any: LookSet::empty(),
2331 look_set_suffix_any: LookSet::empty(),
2332 utf8: true,
2333 explicit_captures_len: 0,
2334 static_explicit_captures_len,
2335 literal: false,
2336 alternation_literal: true,
2337 };
2338 let (mut min_poisoned, mut max_poisoned) = (false, false);
2339 // Handle properties that need to visit every child hir.
2340 for prop in it {
2341 let p = prop.borrow();
2342 props.look_set.set_union(p.look_set());
2343 props.look_set_prefix.set_intersect(p.look_set_prefix());
2344 props.look_set_suffix.set_intersect(p.look_set_suffix());
2345 props.look_set_prefix_any.set_union(p.look_set_prefix_any());
2346 props.look_set_suffix_any.set_union(p.look_set_suffix_any());
2347 props.utf8 = props.utf8 && p.is_utf8();
2348 props.explicit_captures_len = props
2349 .explicit_captures_len
2350 .saturating_add(p.explicit_captures_len());
2351 if props.static_explicit_captures_len
2352 != p.static_explicit_captures_len()
2353 {
2354 props.static_explicit_captures_len = None;
2355 }
2356 props.alternation_literal =
2357 props.alternation_literal && p.is_literal();
2358 if !min_poisoned {
2359 if let Some(xmin) = p.minimum_len() {
2360 if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
2361 props.minimum_len = Some(xmin);
2362 }
2363 } else {
2364 props.minimum_len = None;
2365 min_poisoned = true;
2366 }
2367 }
2368 if !max_poisoned {
2369 if let Some(xmax) = p.maximum_len() {
2370 if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
2371 props.maximum_len = Some(xmax);
2372 }
2373 } else {
2374 props.maximum_len = None;
2375 max_poisoned = true;
2376 }
2377 }
2378 }
2379 Properties(Box::new(props))
2380 }
2381}
2382
2383impl Properties {
2384 /// Create a new set of HIR properties for an empty regex.
2385 fn empty() -> Properties {
2386 let inner = PropertiesI {
2387 minimum_len: Some(0),
2388 maximum_len: Some(0),
2389 look_set: LookSet::empty(),
2390 look_set_prefix: LookSet::empty(),
2391 look_set_suffix: LookSet::empty(),
2392 look_set_prefix_any: LookSet::empty(),
2393 look_set_suffix_any: LookSet::empty(),
2394 // It is debatable whether an empty regex always matches at valid
2395 // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
2396 // it is clearly false. There are, for example, many empty strings
2397 // between the bytes encoding a '☃'.
2398 //
2399 // However, when Unicode mode is enabled, the fundamental atom
2400 // of matching is really a codepoint. And in that scenario, an
2401 // empty regex is defined to only match at valid UTF-8 boundaries
2402 // and to never split a codepoint. It just so happens that this
2403 // enforcement is somewhat tricky to do for regexes that match
2404 // the empty string inside regex engines themselves. It usually
2405 // requires some layer above the regex engine to filter out such
2406 // matches.
2407 //
2408 // In any case, 'true' is really the only coherent option. If it
2409 // were false, for example, then 'a*' would also need to be false
2410 // since it too can match the empty string.
2411 utf8: true,
2412 explicit_captures_len: 0,
2413 static_explicit_captures_len: Some(0),
2414 literal: false,
2415 alternation_literal: false,
2416 };
2417 Properties(Box::new(inner))
2418 }
2419
2420 /// Create a new set of HIR properties for a literal regex.
2421 fn literal(lit: &Literal) -> Properties {
2422 let inner = PropertiesI {
2423 minimum_len: Some(lit.0.len()),
2424 maximum_len: Some(lit.0.len()),
2425 look_set: LookSet::empty(),
2426 look_set_prefix: LookSet::empty(),
2427 look_set_suffix: LookSet::empty(),
2428 look_set_prefix_any: LookSet::empty(),
2429 look_set_suffix_any: LookSet::empty(),
2430 utf8: core::str::from_utf8(&lit.0).is_ok(),
2431 explicit_captures_len: 0,
2432 static_explicit_captures_len: Some(0),
2433 literal: true,
2434 alternation_literal: true,
2435 };
2436 Properties(Box::new(inner))
2437 }
2438
2439 /// Create a new set of HIR properties for a character class.
2440 fn class(class: &Class) -> Properties {
2441 let inner = PropertiesI {
2442 minimum_len: class.minimum_len(),
2443 maximum_len: class.maximum_len(),
2444 look_set: LookSet::empty(),
2445 look_set_prefix: LookSet::empty(),
2446 look_set_suffix: LookSet::empty(),
2447 look_set_prefix_any: LookSet::empty(),
2448 look_set_suffix_any: LookSet::empty(),
2449 utf8: class.is_utf8(),
2450 explicit_captures_len: 0,
2451 static_explicit_captures_len: Some(0),
2452 literal: false,
2453 alternation_literal: false,
2454 };
2455 Properties(Box::new(inner))
2456 }
2457
2458 /// Create a new set of HIR properties for a look-around assertion.
2459 fn look(look: Look) -> Properties {
2460 let inner = PropertiesI {
2461 minimum_len: Some(0),
2462 maximum_len: Some(0),
2463 look_set: LookSet::singleton(look),
2464 look_set_prefix: LookSet::singleton(look),
2465 look_set_suffix: LookSet::singleton(look),
2466 look_set_prefix_any: LookSet::singleton(look),
2467 look_set_suffix_any: LookSet::singleton(look),
2468 // This requires a little explanation. Basically, we don't consider
2469 // matching an empty string to be equivalent to matching invalid
2470 // UTF-8, even though technically matching every empty string will
2471 // split the UTF-8 encoding of a single codepoint when treating a
2472 // UTF-8 encoded string as a sequence of bytes. Our defense here is
2473 // that in such a case, a codepoint should logically be treated as
2474 // the fundamental atom for matching, and thus the only valid match
2475 // points are between codepoints and not bytes.
2476 //
2477 // More practically, this is true here because it's also true
2478 // for 'Hir::empty()', otherwise something like 'a*' would be
2479 // considered to match invalid UTF-8. That in turn makes this
2480 // property borderline useless.
2481 utf8: true,
2482 explicit_captures_len: 0,
2483 static_explicit_captures_len: Some(0),
2484 literal: false,
2485 alternation_literal: false,
2486 };
2487 Properties(Box::new(inner))
2488 }
2489
2490 /// Create a new set of HIR properties for a repetition.
2491 fn repetition(rep: &Repetition) -> Properties {
2492 let p = rep.sub.properties();
2493 let minimum_len = p.minimum_len().map(|child_min| {
2494 let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
2495 child_min.saturating_mul(rep_min)
2496 });
2497 let maximum_len = rep.max.and_then(|rep_max| {
2498 let rep_max = usize::try_from(rep_max).ok()?;
2499 let child_max = p.maximum_len()?;
2500 child_max.checked_mul(rep_max)
2501 });
2502
2503 let mut inner = PropertiesI {
2504 minimum_len,
2505 maximum_len,
2506 look_set: p.look_set(),
2507 look_set_prefix: LookSet::empty(),
2508 look_set_suffix: LookSet::empty(),
2509 look_set_prefix_any: p.look_set_prefix_any(),
2510 look_set_suffix_any: p.look_set_suffix_any(),
2511 utf8: p.is_utf8(),
2512 explicit_captures_len: p.explicit_captures_len(),
2513 static_explicit_captures_len: p.static_explicit_captures_len(),
2514 literal: false,
2515 alternation_literal: false,
2516 };
2517 // If the repetition operator can match the empty string, then its
2518 // lookset prefix and suffixes themselves remain empty since they are
2519 // no longer required to match.
2520 if rep.min > 0 {
2521 inner.look_set_prefix = p.look_set_prefix();
2522 inner.look_set_suffix = p.look_set_suffix();
2523 }
2524 // If the static captures len of the sub-expression is not known or
2525 // is greater than zero, then it automatically propagates to the
2526 // repetition, regardless of the repetition. Otherwise, it might
2527 // change, but only when the repetition can match 0 times.
2528 if rep.min == 0
2529 && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
2530 {
2531 // If we require a match 0 times, then our captures len is
2532 // guaranteed to be zero. Otherwise, if we *can* match the empty
2533 // string, then it's impossible to know how many captures will be
2534 // in the resulting match.
2535 if rep.max == Some(0) {
2536 inner.static_explicit_captures_len = Some(0);
2537 } else {
2538 inner.static_explicit_captures_len = None;
2539 }
2540 }
2541 Properties(Box::new(inner))
2542 }
2543
2544 /// Create a new set of HIR properties for a capture.
2545 fn capture(capture: &Capture) -> Properties {
2546 let p = capture.sub.properties();
2547 Properties(Box::new(PropertiesI {
2548 explicit_captures_len: p.explicit_captures_len().saturating_add(1),
2549 static_explicit_captures_len: p
2550 .static_explicit_captures_len()
2551 .map(|len| len.saturating_add(1)),
2552 literal: false,
2553 alternation_literal: false,
2554 ..*p.0.clone()
2555 }))
2556 }
2557
2558 /// Create a new set of HIR properties for a concatenation.
2559 fn concat(concat: &[Hir]) -> Properties {
2560 // The base case is an empty concatenation, which matches the empty
2561 // string. Note though that empty concatenations aren't possible,
2562 // because the Hir::concat smart constructor rewrites those as
2563 // Hir::empty.
2564 let mut props = PropertiesI {
2565 minimum_len: Some(0),
2566 maximum_len: Some(0),
2567 look_set: LookSet::empty(),
2568 look_set_prefix: LookSet::empty(),
2569 look_set_suffix: LookSet::empty(),
2570 look_set_prefix_any: LookSet::empty(),
2571 look_set_suffix_any: LookSet::empty(),
2572 utf8: true,
2573 explicit_captures_len: 0,
2574 static_explicit_captures_len: Some(0),
2575 literal: true,
2576 alternation_literal: true,
2577 };
2578 // Handle properties that need to visit every child hir.
2579 for x in concat.iter() {
2580 let p = x.properties();
2581 props.look_set.set_union(p.look_set());
2582 props.utf8 = props.utf8 && p.is_utf8();
2583 props.explicit_captures_len = props
2584 .explicit_captures_len
2585 .saturating_add(p.explicit_captures_len());
2586 props.static_explicit_captures_len = p
2587 .static_explicit_captures_len()
2588 .and_then(|len1| {
2589 Some((len1, props.static_explicit_captures_len?))
2590 })
2591 .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
2592 props.literal = props.literal && p.is_literal();
2593 props.alternation_literal =
2594 props.alternation_literal && p.is_alternation_literal();
2595 if let Some(minimum_len) = props.minimum_len {
2596 match p.minimum_len() {
2597 None => props.minimum_len = None,
2598 Some(len) => {
2599 // We use saturating arithmetic here because the
2600 // minimum is just a lower bound. We can't go any
2601 // higher than what our number types permit.
2602 props.minimum_len =
2603 Some(minimum_len.saturating_add(len));
2604 }
2605 }
2606 }
2607 if let Some(maximum_len) = props.maximum_len {
2608 match p.maximum_len() {
2609 None => props.maximum_len = None,
2610 Some(len) => {
2611 props.maximum_len = maximum_len.checked_add(len)
2612 }
2613 }
2614 }
2615 }
2616 // Handle the prefix properties, which only requires visiting
2617 // child exprs until one matches more than the empty string.
2618 let mut it = concat.iter();
2619 while let Some(x) = it.next() {
2620 props.look_set_prefix.set_union(x.properties().look_set_prefix());
2621 props
2622 .look_set_prefix_any
2623 .set_union(x.properties().look_set_prefix_any());
2624 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2625 break;
2626 }
2627 }
2628 // Same thing for the suffix properties, but in reverse.
2629 let mut it = concat.iter().rev();
2630 while let Some(x) = it.next() {
2631 props.look_set_suffix.set_union(x.properties().look_set_suffix());
2632 props
2633 .look_set_suffix_any
2634 .set_union(x.properties().look_set_suffix_any());
2635 if x.properties().maximum_len().map_or(true, |x| x > 0) {
2636 break;
2637 }
2638 }
2639 Properties(Box::new(props))
2640 }
2641
2642 /// Create a new set of HIR properties for a concatenation.
2643 fn alternation(alts: &[Hir]) -> Properties {
2644 Properties::union(alts.iter().map(|hir| hir.properties()))
2645 }
2646}
2647
2648/// A set of look-around assertions.
2649///
2650/// This is useful for efficiently tracking look-around assertions. For
2651/// example, an [`Hir`] provides properties that return `LookSet`s.
2652#[derive(Clone, Copy, Default, Eq, PartialEq)]
2653pub struct LookSet {
2654 /// The underlying representation this set is exposed to make it possible
2655 /// to store it somewhere efficiently. The representation is that
2656 /// of a bitset, where each assertion occupies bit `i` where `i =
2657 /// Look::as_repr()`.
2658 ///
2659 /// Note that users of this internal representation must permit the full
2660 /// range of `u16` values to be represented. For example, even if the
2661 /// current implementation only makes use of the 10 least significant bits,
2662 /// it may use more bits in a future semver compatible release.
2663 pub bits: u32,
2664}
2665
2666impl LookSet {
2667 /// Create an empty set of look-around assertions.
2668 #[inline]
2669 pub fn empty() -> LookSet {
2670 LookSet { bits: 0 }
2671 }
2672
2673 /// Create a full set of look-around assertions.
2674 ///
2675 /// This set contains all possible look-around assertions.
2676 #[inline]
2677 pub fn full() -> LookSet {
2678 LookSet { bits: !0 }
2679 }
2680
2681 /// Create a look-around set containing the look-around assertion given.
2682 ///
2683 /// This is a convenience routine for creating an empty set and inserting
2684 /// one look-around assertions.
2685 #[inline]
2686 pub fn singleton(look: Look) -> LookSet {
2687 LookSet::empty().insert(look)
2688 }
2689
2690 /// Returns the total number of look-around assertions in this set.
2691 #[inline]
2692 pub fn len(self) -> usize {
2693 // OK because max value always fits in a u8, which in turn always
2694 // fits in a usize, regardless of target.
2695 usize::try_from(self.bits.count_ones()).unwrap()
2696 }
2697
2698 /// Returns true if and only if this set is empty.
2699 #[inline]
2700 pub fn is_empty(self) -> bool {
2701 self.len() == 0
2702 }
2703
2704 /// Returns true if and only if the given look-around assertion is in this
2705 /// set.
2706 #[inline]
2707 pub fn contains(self, look: Look) -> bool {
2708 self.bits & look.as_repr() != 0
2709 }
2710
2711 /// Returns true if and only if this set contains any anchor assertions.
2712 /// This includes both "start/end of haystack" and "start/end of line."
2713 #[inline]
2714 pub fn contains_anchor(&self) -> bool {
2715 self.contains_anchor_haystack() || self.contains_anchor_line()
2716 }
2717
2718 /// Returns true if and only if this set contains any "start/end of
2719 /// haystack" anchors. This doesn't include "start/end of line" anchors.
2720 #[inline]
2721 pub fn contains_anchor_haystack(&self) -> bool {
2722 self.contains(Look::Start) || self.contains(Look::End)
2723 }
2724
2725 /// Returns true if and only if this set contains any "start/end of line"
2726 /// anchors. This doesn't include "start/end of haystack" anchors. This
2727 /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
2728 #[inline]
2729 pub fn contains_anchor_line(&self) -> bool {
2730 self.contains(Look::StartLF)
2731 || self.contains(Look::EndLF)
2732 || self.contains(Look::StartCRLF)
2733 || self.contains(Look::EndCRLF)
2734 }
2735
2736 /// Returns true if and only if this set contains any "start/end of line"
2737 /// anchors that only treat `\n` as line terminators. This does not include
2738 /// haystack anchors or CRLF aware line anchors.
2739 #[inline]
2740 pub fn contains_anchor_lf(&self) -> bool {
2741 self.contains(Look::StartLF) || self.contains(Look::EndLF)
2742 }
2743
2744 /// Returns true if and only if this set contains any "start/end of line"
2745 /// anchors that are CRLF-aware. This doesn't include "start/end of
2746 /// haystack" or "start/end of line-feed" anchors.
2747 #[inline]
2748 pub fn contains_anchor_crlf(&self) -> bool {
2749 self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
2750 }
2751
2752 /// Returns true if and only if this set contains any word boundary or
2753 /// negated word boundary assertions. This include both Unicode and ASCII
2754 /// word boundaries.
2755 #[inline]
2756 pub fn contains_word(self) -> bool {
2757 self.contains_word_unicode() || self.contains_word_ascii()
2758 }
2759
2760 /// Returns true if and only if this set contains any Unicode word boundary
2761 /// or negated Unicode word boundary assertions.
2762 #[inline]
2763 pub fn contains_word_unicode(self) -> bool {
2764 self.contains(Look::WordUnicode)
2765 || self.contains(Look::WordUnicodeNegate)
2766 || self.contains(Look::WordStartUnicode)
2767 || self.contains(Look::WordEndUnicode)
2768 || self.contains(Look::WordStartHalfUnicode)
2769 || self.contains(Look::WordEndHalfUnicode)
2770 }
2771
2772 /// Returns true if and only if this set contains any ASCII word boundary
2773 /// or negated ASCII word boundary assertions.
2774 #[inline]
2775 pub fn contains_word_ascii(self) -> bool {
2776 self.contains(Look::WordAscii)
2777 || self.contains(Look::WordAsciiNegate)
2778 || self.contains(Look::WordStartAscii)
2779 || self.contains(Look::WordEndAscii)
2780 || self.contains(Look::WordStartHalfAscii)
2781 || self.contains(Look::WordEndHalfAscii)
2782 }
2783
2784 /// Returns an iterator over all of the look-around assertions in this set.
2785 #[inline]
2786 pub fn iter(self) -> LookSetIter {
2787 LookSetIter { set: self }
2788 }
2789
2790 /// Return a new set that is equivalent to the original, but with the given
2791 /// assertion added to it. If the assertion is already in the set, then the
2792 /// returned set is equivalent to the original.
2793 #[inline]
2794 pub fn insert(self, look: Look) -> LookSet {
2795 LookSet { bits: self.bits | look.as_repr() }
2796 }
2797
2798 /// Updates this set in place with the result of inserting the given
2799 /// assertion into this set.
2800 #[inline]
2801 pub fn set_insert(&mut self, look: Look) {
2802 *self = self.insert(look);
2803 }
2804
2805 /// Return a new set that is equivalent to the original, but with the given
2806 /// assertion removed from it. If the assertion is not in the set, then the
2807 /// returned set is equivalent to the original.
2808 #[inline]
2809 pub fn remove(self, look: Look) -> LookSet {
2810 LookSet { bits: self.bits & !look.as_repr() }
2811 }
2812
2813 /// Updates this set in place with the result of removing the given
2814 /// assertion from this set.
2815 #[inline]
2816 pub fn set_remove(&mut self, look: Look) {
2817 *self = self.remove(look);
2818 }
2819
2820 /// Returns a new set that is the result of subtracting the given set from
2821 /// this set.
2822 #[inline]
2823 pub fn subtract(self, other: LookSet) -> LookSet {
2824 LookSet { bits: self.bits & !other.bits }
2825 }
2826
2827 /// Updates this set in place with the result of subtracting the given set
2828 /// from this set.
2829 #[inline]
2830 pub fn set_subtract(&mut self, other: LookSet) {
2831 *self = self.subtract(other);
2832 }
2833
2834 /// Returns a new set that is the union of this and the one given.
2835 #[inline]
2836 pub fn union(self, other: LookSet) -> LookSet {
2837 LookSet { bits: self.bits | other.bits }
2838 }
2839
2840 /// Updates this set in place with the result of unioning it with the one
2841 /// given.
2842 #[inline]
2843 pub fn set_union(&mut self, other: LookSet) {
2844 *self = self.union(other);
2845 }
2846
2847 /// Returns a new set that is the intersection of this and the one given.
2848 #[inline]
2849 pub fn intersect(self, other: LookSet) -> LookSet {
2850 LookSet { bits: self.bits & other.bits }
2851 }
2852
2853 /// Updates this set in place with the result of intersecting it with the
2854 /// one given.
2855 #[inline]
2856 pub fn set_intersect(&mut self, other: LookSet) {
2857 *self = self.intersect(other);
2858 }
2859
2860 /// Return a `LookSet` from the slice given as a native endian 32-bit
2861 /// integer.
2862 ///
2863 /// # Panics
2864 ///
2865 /// This panics if `slice.len() < 4`.
2866 #[inline]
2867 pub fn read_repr(slice: &[u8]) -> LookSet {
2868 let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
2869 LookSet { bits }
2870 }
2871
2872 /// Write a `LookSet` as a native endian 32-bit integer to the beginning
2873 /// of the slice given.
2874 ///
2875 /// # Panics
2876 ///
2877 /// This panics if `slice.len() < 4`.
2878 #[inline]
2879 pub fn write_repr(self, slice: &mut [u8]) {
2880 let raw = self.bits.to_ne_bytes();
2881 slice[0] = raw[0];
2882 slice[1] = raw[1];
2883 slice[2] = raw[2];
2884 slice[3] = raw[3];
2885 }
2886}
2887
2888impl core::fmt::Debug for LookSet {
2889 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
2890 if self.is_empty() {
2891 return write!(f, "∅");
2892 }
2893 for look: Look in self.iter() {
2894 write!(f, "{}", look.as_char())?;
2895 }
2896 Ok(())
2897 }
2898}
2899
2900/// An iterator over all look-around assertions in a [`LookSet`].
2901///
2902/// This iterator is created by [`LookSet::iter`].
2903#[derive(Clone, Debug)]
2904pub struct LookSetIter {
2905 set: LookSet,
2906}
2907
2908impl Iterator for LookSetIter {
2909 type Item = Look;
2910
2911 #[inline]
2912 fn next(&mut self) -> Option<Look> {
2913 if self.set.is_empty() {
2914 return None;
2915 }
2916 // We'll never have more than u8::MAX distinct look-around assertions,
2917 // so 'bit' will always fit into a u16.
2918 let bit: u16 = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
2919 let look: Look = Look::from_repr(1 << bit)?;
2920 self.set = self.set.remove(look);
2921 Some(look)
2922 }
2923}
2924
2925/// Given a sequence of HIR values where each value corresponds to a Unicode
2926/// class (or an all-ASCII byte class), return a single Unicode class
2927/// corresponding to the union of the classes found.
2928fn class_chars(hirs: &[Hir]) -> Option<Class> {
2929 let mut cls: ClassUnicode = ClassUnicode::new(ranges:vec![]);
2930 for hir: &Hir in hirs.iter() {
2931 match *hir.kind() {
2932 HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2933 cls.union(cls2);
2934 }
2935 HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2936 cls.union(&cls2.to_unicode_class()?);
2937 }
2938 _ => return None,
2939 };
2940 }
2941 Some(Class::Unicode(cls))
2942}
2943
2944/// Given a sequence of HIR values where each value corresponds to a byte class
2945/// (or an all-ASCII Unicode class), return a single byte class corresponding
2946/// to the union of the classes found.
2947fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2948 let mut cls: ClassBytes = ClassBytes::new(ranges:vec![]);
2949 for hir: &Hir in hirs.iter() {
2950 match *hir.kind() {
2951 HirKind::Class(Class::Unicode(ref cls2: &ClassUnicode)) => {
2952 cls.union(&cls2.to_byte_class()?);
2953 }
2954 HirKind::Class(Class::Bytes(ref cls2: &ClassBytes)) => {
2955 cls.union(cls2);
2956 }
2957 _ => return None,
2958 };
2959 }
2960 Some(Class::Bytes(cls))
2961}
2962
2963/// Given a sequence of HIR values where each value corresponds to a literal
2964/// that is a single `char`, return that sequence of `char`s. Otherwise return
2965/// None. No deduplication is done.
2966fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
2967 let mut singletons: Vec = vec![];
2968 for hir: &Hir in hirs.iter() {
2969 let literal: &Box<[u8]> = match *hir.kind() {
2970 HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2971 _ => return None,
2972 };
2973 let ch: char = match crate::debug::utf8_decode(bytes:literal) {
2974 None => return None,
2975 Some(Err(_)) => return None,
2976 Some(Ok(ch: char)) => ch,
2977 };
2978 if literal.len() != ch.len_utf8() {
2979 return None;
2980 }
2981 singletons.push(ch);
2982 }
2983 Some(singletons)
2984}
2985
2986/// Given a sequence of HIR values where each value corresponds to a literal
2987/// that is a single byte, return that sequence of bytes. Otherwise return
2988/// None. No deduplication is done.
2989fn singleton_bytes(hirs: &[Hir]) -> Option<Vec<u8>> {
2990 let mut singletons: Vec = vec![];
2991 for hir: &Hir in hirs.iter() {
2992 let literal: &Box<[u8]> = match *hir.kind() {
2993 HirKind::Literal(Literal(ref bytes: &Box<[u8]>)) => bytes,
2994 _ => return None,
2995 };
2996 if literal.len() != 1 {
2997 return None;
2998 }
2999 singletons.push(literal[0]);
3000 }
3001 Some(singletons)
3002}
3003
3004/// Looks for a common prefix in the list of alternation branches given. If one
3005/// is found, then an equivalent but (hopefully) simplified Hir is returned.
3006/// Otherwise, the original given list of branches is returned unmodified.
3007///
3008/// This is not quite as good as it could be. Right now, it requires that
3009/// all branches are 'Concat' expressions. It also doesn't do well with
3010/// literals. For example, given 'foofoo|foobar', it will not refactor it to
3011/// 'foo(?:foo|bar)' because literals are flattened into their own special
3012/// concatenation. (One wonders if perhaps 'Literal' should be a single atom
3013/// instead of a string of bytes because of this. Otherwise, handling the
3014/// current representation in this routine will be pretty gnarly. Sigh.)
3015fn lift_common_prefix(hirs: Vec<Hir>) -> Result<Hir, Vec<Hir>> {
3016 if hirs.len() <= 1 {
3017 return Err(hirs);
3018 }
3019 let mut prefix = match hirs[0].kind() {
3020 HirKind::Concat(ref xs) => &**xs,
3021 _ => return Err(hirs),
3022 };
3023 if prefix.is_empty() {
3024 return Err(hirs);
3025 }
3026 for h in hirs.iter().skip(1) {
3027 let concat = match h.kind() {
3028 HirKind::Concat(ref xs) => xs,
3029 _ => return Err(hirs),
3030 };
3031 let common_len = prefix
3032 .iter()
3033 .zip(concat.iter())
3034 .take_while(|(x, y)| x == y)
3035 .count();
3036 prefix = &prefix[..common_len];
3037 if prefix.is_empty() {
3038 return Err(hirs);
3039 }
3040 }
3041 let len = prefix.len();
3042 assert_ne!(0, len);
3043 let mut prefix_concat = vec![];
3044 let mut suffix_alts = vec![];
3045 for h in hirs {
3046 let mut concat = match h.into_kind() {
3047 HirKind::Concat(xs) => xs,
3048 // We required all sub-expressions to be
3049 // concats above, so we're only here if we
3050 // have a concat.
3051 _ => unreachable!(),
3052 };
3053 suffix_alts.push(Hir::concat(concat.split_off(len)));
3054 if prefix_concat.is_empty() {
3055 prefix_concat = concat;
3056 }
3057 }
3058 let mut concat = prefix_concat;
3059 concat.push(Hir::alternation(suffix_alts));
3060 Ok(Hir::concat(concat))
3061}
3062
3063#[cfg(test)]
3064mod tests {
3065 use super::*;
3066
3067 fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
3068 let ranges: Vec<ClassUnicodeRange> = ranges
3069 .iter()
3070 .map(|&(s, e)| ClassUnicodeRange::new(s, e))
3071 .collect();
3072 ClassUnicode::new(ranges)
3073 }
3074
3075 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
3076 let ranges: Vec<ClassBytesRange> =
3077 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
3078 ClassBytes::new(ranges)
3079 }
3080
3081 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
3082 cls.iter().map(|x| (x.start(), x.end())).collect()
3083 }
3084
3085 #[cfg(feature = "unicode-case")]
3086 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
3087 let mut cls_ = cls.clone();
3088 cls_.case_fold_simple();
3089 cls_
3090 }
3091
3092 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3093 let mut cls_ = cls1.clone();
3094 cls_.union(cls2);
3095 cls_
3096 }
3097
3098 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3099 let mut cls_ = cls1.clone();
3100 cls_.intersect(cls2);
3101 cls_
3102 }
3103
3104 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
3105 let mut cls_ = cls1.clone();
3106 cls_.difference(cls2);
3107 cls_
3108 }
3109
3110 fn usymdifference(
3111 cls1: &ClassUnicode,
3112 cls2: &ClassUnicode,
3113 ) -> ClassUnicode {
3114 let mut cls_ = cls1.clone();
3115 cls_.symmetric_difference(cls2);
3116 cls_
3117 }
3118
3119 fn unegate(cls: &ClassUnicode) -> ClassUnicode {
3120 let mut cls_ = cls.clone();
3121 cls_.negate();
3122 cls_
3123 }
3124
3125 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
3126 cls.iter().map(|x| (x.start(), x.end())).collect()
3127 }
3128
3129 fn bcasefold(cls: &ClassBytes) -> ClassBytes {
3130 let mut cls_ = cls.clone();
3131 cls_.case_fold_simple();
3132 cls_
3133 }
3134
3135 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3136 let mut cls_ = cls1.clone();
3137 cls_.union(cls2);
3138 cls_
3139 }
3140
3141 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3142 let mut cls_ = cls1.clone();
3143 cls_.intersect(cls2);
3144 cls_
3145 }
3146
3147 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3148 let mut cls_ = cls1.clone();
3149 cls_.difference(cls2);
3150 cls_
3151 }
3152
3153 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
3154 let mut cls_ = cls1.clone();
3155 cls_.symmetric_difference(cls2);
3156 cls_
3157 }
3158
3159 fn bnegate(cls: &ClassBytes) -> ClassBytes {
3160 let mut cls_ = cls.clone();
3161 cls_.negate();
3162 cls_
3163 }
3164
3165 #[test]
3166 fn class_range_canonical_unicode() {
3167 let range = ClassUnicodeRange::new('\u{00FF}', '\0');
3168 assert_eq!('\0', range.start());
3169 assert_eq!('\u{00FF}', range.end());
3170 }
3171
3172 #[test]
3173 fn class_range_canonical_bytes() {
3174 let range = ClassBytesRange::new(b'\xFF', b'\0');
3175 assert_eq!(b'\0', range.start());
3176 assert_eq!(b'\xFF', range.end());
3177 }
3178
3179 #[test]
3180 fn class_canonicalize_unicode() {
3181 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3182 let expected = vec![('a', 'c'), ('x', 'z')];
3183 assert_eq!(expected, uranges(&cls));
3184
3185 let cls = uclass(&[('x', 'z'), ('a', 'c')]);
3186 let expected = vec![('a', 'c'), ('x', 'z')];
3187 assert_eq!(expected, uranges(&cls));
3188
3189 let cls = uclass(&[('x', 'z'), ('w', 'y')]);
3190 let expected = vec![('w', 'z')];
3191 assert_eq!(expected, uranges(&cls));
3192
3193 let cls = uclass(&[
3194 ('c', 'f'),
3195 ('a', 'g'),
3196 ('d', 'j'),
3197 ('a', 'c'),
3198 ('m', 'p'),
3199 ('l', 's'),
3200 ]);
3201 let expected = vec![('a', 'j'), ('l', 's')];
3202 assert_eq!(expected, uranges(&cls));
3203
3204 let cls = uclass(&[('x', 'z'), ('u', 'w')]);
3205 let expected = vec![('u', 'z')];
3206 assert_eq!(expected, uranges(&cls));
3207
3208 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
3209 let expected = vec![('\x00', '\u{10FFFF}')];
3210 assert_eq!(expected, uranges(&cls));
3211
3212 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3213 let expected = vec![('a', 'b')];
3214 assert_eq!(expected, uranges(&cls));
3215 }
3216
3217 #[test]
3218 fn class_canonicalize_bytes() {
3219 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3220 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3221 assert_eq!(expected, branges(&cls));
3222
3223 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
3224 let expected = vec![(b'a', b'c'), (b'x', b'z')];
3225 assert_eq!(expected, branges(&cls));
3226
3227 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
3228 let expected = vec![(b'w', b'z')];
3229 assert_eq!(expected, branges(&cls));
3230
3231 let cls = bclass(&[
3232 (b'c', b'f'),
3233 (b'a', b'g'),
3234 (b'd', b'j'),
3235 (b'a', b'c'),
3236 (b'm', b'p'),
3237 (b'l', b's'),
3238 ]);
3239 let expected = vec![(b'a', b'j'), (b'l', b's')];
3240 assert_eq!(expected, branges(&cls));
3241
3242 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
3243 let expected = vec![(b'u', b'z')];
3244 assert_eq!(expected, branges(&cls));
3245
3246 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
3247 let expected = vec![(b'\x00', b'\xFF')];
3248 assert_eq!(expected, branges(&cls));
3249
3250 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3251 let expected = vec![(b'a', b'b')];
3252 assert_eq!(expected, branges(&cls));
3253 }
3254
3255 #[test]
3256 #[cfg(feature = "unicode-case")]
3257 fn class_case_fold_unicode() {
3258 let cls = uclass(&[
3259 ('C', 'F'),
3260 ('A', 'G'),
3261 ('D', 'J'),
3262 ('A', 'C'),
3263 ('M', 'P'),
3264 ('L', 'S'),
3265 ('c', 'f'),
3266 ]);
3267 let expected = uclass(&[
3268 ('A', 'J'),
3269 ('L', 'S'),
3270 ('a', 'j'),
3271 ('l', 's'),
3272 ('\u{17F}', '\u{17F}'),
3273 ]);
3274 assert_eq!(expected, ucasefold(&cls));
3275
3276 let cls = uclass(&[('A', 'Z')]);
3277 let expected = uclass(&[
3278 ('A', 'Z'),
3279 ('a', 'z'),
3280 ('\u{17F}', '\u{17F}'),
3281 ('\u{212A}', '\u{212A}'),
3282 ]);
3283 assert_eq!(expected, ucasefold(&cls));
3284
3285 let cls = uclass(&[('a', 'z')]);
3286 let expected = uclass(&[
3287 ('A', 'Z'),
3288 ('a', 'z'),
3289 ('\u{17F}', '\u{17F}'),
3290 ('\u{212A}', '\u{212A}'),
3291 ]);
3292 assert_eq!(expected, ucasefold(&cls));
3293
3294 let cls = uclass(&[('A', 'A'), ('_', '_')]);
3295 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
3296 assert_eq!(expected, ucasefold(&cls));
3297
3298 let cls = uclass(&[('A', 'A'), ('=', '=')]);
3299 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
3300 assert_eq!(expected, ucasefold(&cls));
3301
3302 let cls = uclass(&[('\x00', '\x10')]);
3303 assert_eq!(cls, ucasefold(&cls));
3304
3305 let cls = uclass(&[('k', 'k')]);
3306 let expected =
3307 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
3308 assert_eq!(expected, ucasefold(&cls));
3309
3310 let cls = uclass(&[('@', '@')]);
3311 assert_eq!(cls, ucasefold(&cls));
3312 }
3313
3314 #[test]
3315 #[cfg(not(feature = "unicode-case"))]
3316 fn class_case_fold_unicode_disabled() {
3317 let mut cls = uclass(&[
3318 ('C', 'F'),
3319 ('A', 'G'),
3320 ('D', 'J'),
3321 ('A', 'C'),
3322 ('M', 'P'),
3323 ('L', 'S'),
3324 ('c', 'f'),
3325 ]);
3326 assert!(cls.try_case_fold_simple().is_err());
3327 }
3328
3329 #[test]
3330 #[should_panic]
3331 #[cfg(not(feature = "unicode-case"))]
3332 fn class_case_fold_unicode_disabled_panics() {
3333 let mut cls = uclass(&[
3334 ('C', 'F'),
3335 ('A', 'G'),
3336 ('D', 'J'),
3337 ('A', 'C'),
3338 ('M', 'P'),
3339 ('L', 'S'),
3340 ('c', 'f'),
3341 ]);
3342 cls.case_fold_simple();
3343 }
3344
3345 #[test]
3346 fn class_case_fold_bytes() {
3347 let cls = bclass(&[
3348 (b'C', b'F'),
3349 (b'A', b'G'),
3350 (b'D', b'J'),
3351 (b'A', b'C'),
3352 (b'M', b'P'),
3353 (b'L', b'S'),
3354 (b'c', b'f'),
3355 ]);
3356 let expected =
3357 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
3358 assert_eq!(expected, bcasefold(&cls));
3359
3360 let cls = bclass(&[(b'A', b'Z')]);
3361 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3362 assert_eq!(expected, bcasefold(&cls));
3363
3364 let cls = bclass(&[(b'a', b'z')]);
3365 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
3366 assert_eq!(expected, bcasefold(&cls));
3367
3368 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
3369 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
3370 assert_eq!(expected, bcasefold(&cls));
3371
3372 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
3373 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
3374 assert_eq!(expected, bcasefold(&cls));
3375
3376 let cls = bclass(&[(b'\x00', b'\x10')]);
3377 assert_eq!(cls, bcasefold(&cls));
3378
3379 let cls = bclass(&[(b'k', b'k')]);
3380 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
3381 assert_eq!(expected, bcasefold(&cls));
3382
3383 let cls = bclass(&[(b'@', b'@')]);
3384 assert_eq!(cls, bcasefold(&cls));
3385 }
3386
3387 #[test]
3388 fn class_negate_unicode() {
3389 let cls = uclass(&[('a', 'a')]);
3390 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
3391 assert_eq!(expected, unegate(&cls));
3392
3393 let cls = uclass(&[('a', 'a'), ('b', 'b')]);
3394 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
3395 assert_eq!(expected, unegate(&cls));
3396
3397 let cls = uclass(&[('a', 'c'), ('x', 'z')]);
3398 let expected = uclass(&[
3399 ('\x00', '\x60'),
3400 ('\x64', '\x77'),
3401 ('\x7B', '\u{10FFFF}'),
3402 ]);
3403 assert_eq!(expected, unegate(&cls));
3404
3405 let cls = uclass(&[('\x00', 'a')]);
3406 let expected = uclass(&[('\x62', '\u{10FFFF}')]);
3407 assert_eq!(expected, unegate(&cls));
3408
3409 let cls = uclass(&[('a', '\u{10FFFF}')]);
3410 let expected = uclass(&[('\x00', '\x60')]);
3411 assert_eq!(expected, unegate(&cls));
3412
3413 let cls = uclass(&[('\x00', '\u{10FFFF}')]);
3414 let expected = uclass(&[]);
3415 assert_eq!(expected, unegate(&cls));
3416
3417 let cls = uclass(&[]);
3418 let expected = uclass(&[('\x00', '\u{10FFFF}')]);
3419 assert_eq!(expected, unegate(&cls));
3420
3421 let cls =
3422 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
3423 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
3424 assert_eq!(expected, unegate(&cls));
3425
3426 let cls = uclass(&[('\x00', '\u{D7FF}')]);
3427 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3428 assert_eq!(expected, unegate(&cls));
3429
3430 let cls = uclass(&[('\x00', '\u{D7FE}')]);
3431 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
3432 assert_eq!(expected, unegate(&cls));
3433
3434 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
3435 let expected = uclass(&[('\x00', '\u{D7FF}')]);
3436 assert_eq!(expected, unegate(&cls));
3437
3438 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
3439 let expected = uclass(&[('\x00', '\u{E000}')]);
3440 assert_eq!(expected, unegate(&cls));
3441 }
3442
3443 #[test]
3444 fn class_negate_bytes() {
3445 let cls = bclass(&[(b'a', b'a')]);
3446 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
3447 assert_eq!(expected, bnegate(&cls));
3448
3449 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
3450 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
3451 assert_eq!(expected, bnegate(&cls));
3452
3453 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
3454 let expected = bclass(&[
3455 (b'\x00', b'\x60'),
3456 (b'\x64', b'\x77'),
3457 (b'\x7B', b'\xFF'),
3458 ]);
3459 assert_eq!(expected, bnegate(&cls));
3460
3461 let cls = bclass(&[(b'\x00', b'a')]);
3462 let expected = bclass(&[(b'\x62', b'\xFF')]);
3463 assert_eq!(expected, bnegate(&cls));
3464
3465 let cls = bclass(&[(b'a', b'\xFF')]);
3466 let expected = bclass(&[(b'\x00', b'\x60')]);
3467 assert_eq!(expected, bnegate(&cls));
3468
3469 let cls = bclass(&[(b'\x00', b'\xFF')]);
3470 let expected = bclass(&[]);
3471 assert_eq!(expected, bnegate(&cls));
3472
3473 let cls = bclass(&[]);
3474 let expected = bclass(&[(b'\x00', b'\xFF')]);
3475 assert_eq!(expected, bnegate(&cls));
3476
3477 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
3478 let expected = bclass(&[(b'\xFE', b'\xFE')]);
3479 assert_eq!(expected, bnegate(&cls));
3480 }
3481
3482 #[test]
3483 fn class_union_unicode() {
3484 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
3485 let cls2 = uclass(&[('a', 'z')]);
3486 let expected = uclass(&[('a', 'z'), ('A', 'C')]);
3487 assert_eq!(expected, uunion(&cls1, &cls2));
3488 }
3489
3490 #[test]
3491 fn class_union_bytes() {
3492 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
3493 let cls2 = bclass(&[(b'a', b'z')]);
3494 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
3495 assert_eq!(expected, bunion(&cls1, &cls2));
3496 }
3497
3498 #[test]
3499 fn class_intersect_unicode() {
3500 let cls1 = uclass(&[]);
3501 let cls2 = uclass(&[('a', 'a')]);
3502 let expected = uclass(&[]);
3503 assert_eq!(expected, uintersect(&cls1, &cls2));
3504
3505 let cls1 = uclass(&[('a', 'a')]);
3506 let cls2 = uclass(&[('a', 'a')]);
3507 let expected = uclass(&[('a', 'a')]);
3508 assert_eq!(expected, uintersect(&cls1, &cls2));
3509
3510 let cls1 = uclass(&[('a', 'a')]);
3511 let cls2 = uclass(&[('b', 'b')]);
3512 let expected = uclass(&[]);
3513 assert_eq!(expected, uintersect(&cls1, &cls2));
3514
3515 let cls1 = uclass(&[('a', 'a')]);
3516 let cls2 = uclass(&[('a', 'c')]);
3517 let expected = uclass(&[('a', 'a')]);
3518 assert_eq!(expected, uintersect(&cls1, &cls2));
3519
3520 let cls1 = uclass(&[('a', 'b')]);
3521 let cls2 = uclass(&[('a', 'c')]);
3522 let expected = uclass(&[('a', 'b')]);
3523 assert_eq!(expected, uintersect(&cls1, &cls2));
3524
3525 let cls1 = uclass(&[('a', 'b')]);
3526 let cls2 = uclass(&[('b', 'c')]);
3527 let expected = uclass(&[('b', 'b')]);
3528 assert_eq!(expected, uintersect(&cls1, &cls2));
3529
3530 let cls1 = uclass(&[('a', 'b')]);
3531 let cls2 = uclass(&[('c', 'd')]);
3532 let expected = uclass(&[]);
3533 assert_eq!(expected, uintersect(&cls1, &cls2));
3534
3535 let cls1 = uclass(&[('b', 'c')]);
3536 let cls2 = uclass(&[('a', 'd')]);
3537 let expected = uclass(&[('b', 'c')]);
3538 assert_eq!(expected, uintersect(&cls1, &cls2));
3539
3540 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3541 let cls2 = uclass(&[('a', 'h')]);
3542 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3543 assert_eq!(expected, uintersect(&cls1, &cls2));
3544
3545 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3546 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3547 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3548 assert_eq!(expected, uintersect(&cls1, &cls2));
3549
3550 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
3551 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
3552 let expected = uclass(&[]);
3553 assert_eq!(expected, uintersect(&cls1, &cls2));
3554
3555 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
3556 let cls2 = uclass(&[('h', 'h')]);
3557 let expected = uclass(&[('h', 'h')]);
3558 assert_eq!(expected, uintersect(&cls1, &cls2));
3559
3560 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
3561 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
3562 let expected = uclass(&[]);
3563 assert_eq!(expected, uintersect(&cls1, &cls2));
3564
3565 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
3566 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
3567 let expected = uclass(&[('b', 'f')]);
3568 assert_eq!(expected, uintersect(&cls1, &cls2));
3569 }
3570
3571 #[test]
3572 fn class_intersect_bytes() {
3573 let cls1 = bclass(&[]);
3574 let cls2 = bclass(&[(b'a', b'a')]);
3575 let expected = bclass(&[]);
3576 assert_eq!(expected, bintersect(&cls1, &cls2));
3577
3578 let cls1 = bclass(&[(b'a', b'a')]);
3579 let cls2 = bclass(&[(b'a', b'a')]);
3580 let expected = bclass(&[(b'a', b'a')]);
3581 assert_eq!(expected, bintersect(&cls1, &cls2));
3582
3583 let cls1 = bclass(&[(b'a', b'a')]);
3584 let cls2 = bclass(&[(b'b', b'b')]);
3585 let expected = bclass(&[]);
3586 assert_eq!(expected, bintersect(&cls1, &cls2));
3587
3588 let cls1 = bclass(&[(b'a', b'a')]);
3589 let cls2 = bclass(&[(b'a', b'c')]);
3590 let expected = bclass(&[(b'a', b'a')]);
3591 assert_eq!(expected, bintersect(&cls1, &cls2));
3592
3593 let cls1 = bclass(&[(b'a', b'b')]);
3594 let cls2 = bclass(&[(b'a', b'c')]);
3595 let expected = bclass(&[(b'a', b'b')]);
3596 assert_eq!(expected, bintersect(&cls1, &cls2));
3597
3598 let cls1 = bclass(&[(b'a', b'b')]);
3599 let cls2 = bclass(&[(b'b', b'c')]);
3600 let expected = bclass(&[(b'b', b'b')]);
3601 assert_eq!(expected, bintersect(&cls1, &cls2));
3602
3603 let cls1 = bclass(&[(b'a', b'b')]);
3604 let cls2 = bclass(&[(b'c', b'd')]);
3605 let expected = bclass(&[]);
3606 assert_eq!(expected, bintersect(&cls1, &cls2));
3607
3608 let cls1 = bclass(&[(b'b', b'c')]);
3609 let cls2 = bclass(&[(b'a', b'd')]);
3610 let expected = bclass(&[(b'b', b'c')]);
3611 assert_eq!(expected, bintersect(&cls1, &cls2));
3612
3613 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3614 let cls2 = bclass(&[(b'a', b'h')]);
3615 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3616 assert_eq!(expected, bintersect(&cls1, &cls2));
3617
3618 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3619 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3620 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3621 assert_eq!(expected, bintersect(&cls1, &cls2));
3622
3623 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
3624 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
3625 let expected = bclass(&[]);
3626 assert_eq!(expected, bintersect(&cls1, &cls2));
3627
3628 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
3629 let cls2 = bclass(&[(b'h', b'h')]);
3630 let expected = bclass(&[(b'h', b'h')]);
3631 assert_eq!(expected, bintersect(&cls1, &cls2));
3632
3633 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
3634 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
3635 let expected = bclass(&[]);
3636 assert_eq!(expected, bintersect(&cls1, &cls2));
3637
3638 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
3639 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
3640 let expected = bclass(&[(b'b', b'f')]);
3641 assert_eq!(expected, bintersect(&cls1, &cls2));
3642 }
3643
3644 #[test]
3645 fn class_difference_unicode() {
3646 let cls1 = uclass(&[('a', 'a')]);
3647 let cls2 = uclass(&[('a', 'a')]);
3648 let expected = uclass(&[]);
3649 assert_eq!(expected, udifference(&cls1, &cls2));
3650
3651 let cls1 = uclass(&[('a', 'a')]);
3652 let cls2 = uclass(&[]);
3653 let expected = uclass(&[('a', 'a')]);
3654 assert_eq!(expected, udifference(&cls1, &cls2));
3655
3656 let cls1 = uclass(&[]);
3657 let cls2 = uclass(&[('a', 'a')]);
3658 let expected = uclass(&[]);
3659 assert_eq!(expected, udifference(&cls1, &cls2));
3660
3661 let cls1 = uclass(&[('a', 'z')]);
3662 let cls2 = uclass(&[('a', 'a')]);
3663 let expected = uclass(&[('b', 'z')]);
3664 assert_eq!(expected, udifference(&cls1, &cls2));
3665
3666 let cls1 = uclass(&[('a', 'z')]);
3667 let cls2 = uclass(&[('z', 'z')]);
3668 let expected = uclass(&[('a', 'y')]);
3669 assert_eq!(expected, udifference(&cls1, &cls2));
3670
3671 let cls1 = uclass(&[('a', 'z')]);
3672 let cls2 = uclass(&[('m', 'm')]);
3673 let expected = uclass(&[('a', 'l'), ('n', 'z')]);
3674 assert_eq!(expected, udifference(&cls1, &cls2));
3675
3676 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3677 let cls2 = uclass(&[('a', 'z')]);
3678 let expected = uclass(&[]);
3679 assert_eq!(expected, udifference(&cls1, &cls2));
3680
3681 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3682 let cls2 = uclass(&[('d', 'v')]);
3683 let expected = uclass(&[('a', 'c')]);
3684 assert_eq!(expected, udifference(&cls1, &cls2));
3685
3686 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3687 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
3688 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3689 assert_eq!(expected, udifference(&cls1, &cls2));
3690
3691 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
3692 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
3693 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
3694 assert_eq!(expected, udifference(&cls1, &cls2));
3695
3696 let cls1 = uclass(&[('x', 'z')]);
3697 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3698 let expected = uclass(&[('x', 'z')]);
3699 assert_eq!(expected, udifference(&cls1, &cls2));
3700
3701 let cls1 = uclass(&[('a', 'z')]);
3702 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
3703 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
3704 assert_eq!(expected, udifference(&cls1, &cls2));
3705 }
3706
3707 #[test]
3708 fn class_difference_bytes() {
3709 let cls1 = bclass(&[(b'a', b'a')]);
3710 let cls2 = bclass(&[(b'a', b'a')]);
3711 let expected = bclass(&[]);
3712 assert_eq!(expected, bdifference(&cls1, &cls2));
3713
3714 let cls1 = bclass(&[(b'a', b'a')]);
3715 let cls2 = bclass(&[]);
3716 let expected = bclass(&[(b'a', b'a')]);
3717 assert_eq!(expected, bdifference(&cls1, &cls2));
3718
3719 let cls1 = bclass(&[]);
3720 let cls2 = bclass(&[(b'a', b'a')]);
3721 let expected = bclass(&[]);
3722 assert_eq!(expected, bdifference(&cls1, &cls2));
3723
3724 let cls1 = bclass(&[(b'a', b'z')]);
3725 let cls2 = bclass(&[(b'a', b'a')]);
3726 let expected = bclass(&[(b'b', b'z')]);
3727 assert_eq!(expected, bdifference(&cls1, &cls2));
3728
3729 let cls1 = bclass(&[(b'a', b'z')]);
3730 let cls2 = bclass(&[(b'z', b'z')]);
3731 let expected = bclass(&[(b'a', b'y')]);
3732 assert_eq!(expected, bdifference(&cls1, &cls2));
3733
3734 let cls1 = bclass(&[(b'a', b'z')]);
3735 let cls2 = bclass(&[(b'm', b'm')]);
3736 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
3737 assert_eq!(expected, bdifference(&cls1, &cls2));
3738
3739 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3740 let cls2 = bclass(&[(b'a', b'z')]);
3741 let expected = bclass(&[]);
3742 assert_eq!(expected, bdifference(&cls1, &cls2));
3743
3744 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3745 let cls2 = bclass(&[(b'd', b'v')]);
3746 let expected = bclass(&[(b'a', b'c')]);
3747 assert_eq!(expected, bdifference(&cls1, &cls2));
3748
3749 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3750 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
3751 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3752 assert_eq!(expected, bdifference(&cls1, &cls2));
3753
3754 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
3755 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
3756 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
3757 assert_eq!(expected, bdifference(&cls1, &cls2));
3758
3759 let cls1 = bclass(&[(b'x', b'z')]);
3760 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3761 let expected = bclass(&[(b'x', b'z')]);
3762 assert_eq!(expected, bdifference(&cls1, &cls2));
3763
3764 let cls1 = bclass(&[(b'a', b'z')]);
3765 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
3766 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
3767 assert_eq!(expected, bdifference(&cls1, &cls2));
3768 }
3769
3770 #[test]
3771 fn class_symmetric_difference_unicode() {
3772 let cls1 = uclass(&[('a', 'm')]);
3773 let cls2 = uclass(&[('g', 't')]);
3774 let expected = uclass(&[('a', 'f'), ('n', 't')]);
3775 assert_eq!(expected, usymdifference(&cls1, &cls2));
3776 }
3777
3778 #[test]
3779 fn class_symmetric_difference_bytes() {
3780 let cls1 = bclass(&[(b'a', b'm')]);
3781 let cls2 = bclass(&[(b'g', b't')]);
3782 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
3783 assert_eq!(expected, bsymdifference(&cls1, &cls2));
3784 }
3785
3786 // We use a thread with an explicit stack size to test that our destructor
3787 // for Hir can handle arbitrarily sized expressions in constant stack
3788 // space. In case we run on a platform without threads (WASM?), we limit
3789 // this test to Windows/Unix.
3790 #[test]
3791 #[cfg(any(unix, windows))]
3792 fn no_stack_overflow_on_drop() {
3793 use std::thread;
3794
3795 let run = || {
3796 let mut expr = Hir::empty();
3797 for _ in 0..100 {
3798 expr = Hir::capture(Capture {
3799 index: 1,
3800 name: None,
3801 sub: Box::new(expr),
3802 });
3803 expr = Hir::repetition(Repetition {
3804 min: 0,
3805 max: Some(1),
3806 greedy: true,
3807 sub: Box::new(expr),
3808 });
3809
3810 expr = Hir {
3811 kind: HirKind::Concat(vec![expr]),
3812 props: Properties::empty(),
3813 };
3814 expr = Hir {
3815 kind: HirKind::Alternation(vec![expr]),
3816 props: Properties::empty(),
3817 };
3818 }
3819 assert!(!matches!(*expr.kind(), HirKind::Empty));
3820 };
3821
3822 // We run our test on a thread with a small stack size so we can
3823 // force the issue more easily.
3824 //
3825 // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests'
3826 // for context on the specific stack size chosen here.
3827 thread::Builder::new()
3828 .stack_size(16 << 10)
3829 .spawn(run)
3830 .unwrap()
3831 .join()
3832 .unwrap();
3833 }
3834
3835 #[test]
3836 fn look_set_iter() {
3837 let set = LookSet::empty();
3838 assert_eq!(0, set.iter().count());
3839
3840 let set = LookSet::full();
3841 assert_eq!(18, set.iter().count());
3842
3843 let set =
3844 LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
3845 assert_eq!(2, set.iter().count());
3846
3847 let set = LookSet::empty().insert(Look::StartLF);
3848 assert_eq!(1, set.iter().count());
3849
3850 let set = LookSet::empty().insert(Look::WordAsciiNegate);
3851 assert_eq!(1, set.iter().count());
3852 }
3853
3854 #[test]
3855 fn look_set_debug() {
3856 let res = format!("{:?}", LookSet::empty());
3857 assert_eq!("∅", res);
3858 let res = format!("{:?}", LookSet::full());
3859 assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
3860 }
3861}
3862