| 1 | use alloc::{boxed::Box, string::String, vec, vec::Vec}; |
| 2 | |
| 3 | use crate::{error::Error, utf8}; |
| 4 | |
| 5 | mod parse; |
| 6 | |
| 7 | /// Escapes all regular expression meta characters in `pattern`. |
| 8 | /// |
| 9 | /// The string returned may be safely used as a literal in a regular |
| 10 | /// expression. |
| 11 | pub fn escape(pattern: &str) -> String { |
| 12 | let mut buf: String = String::new(); |
| 13 | buf.reserve(additional:pattern.len()); |
| 14 | for ch: char in pattern.chars() { |
| 15 | if is_meta_character(ch) { |
| 16 | buf.push(ch:' \\' ); |
| 17 | } |
| 18 | buf.push(ch); |
| 19 | } |
| 20 | buf |
| 21 | } |
| 22 | |
| 23 | /// Returns true if the given character has significance in a regex. |
| 24 | /// |
| 25 | /// Generally speaking, these are the only characters which _must_ be escaped |
| 26 | /// in order to match their literal meaning. For example, to match a literal |
| 27 | /// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For |
| 28 | /// example, `-` is treated as a meta character because of its significance |
| 29 | /// for writing ranges inside of character classes, but the regex `-` will |
| 30 | /// match a literal `-` because `-` has no special meaning outside of character |
| 31 | /// classes. |
| 32 | /// |
| 33 | /// In order to determine whether a character may be escaped at all, the |
| 34 | /// [`is_escapeable_character`] routine should be used. The difference between |
| 35 | /// `is_meta_character` and `is_escapeable_character` is that the latter will |
| 36 | /// return true for some characters that are _not_ meta characters. For |
| 37 | /// example, `%` and `\%` both match a literal `%` in all contexts. In other |
| 38 | /// words, `is_escapeable_character` includes "superfluous" escapes. |
| 39 | /// |
| 40 | /// Note that the set of characters for which this function returns `true` or |
| 41 | /// `false` is fixed and won't change in a semver compatible release. (In this |
| 42 | /// case, "semver compatible release" actually refers to the `regex` crate |
| 43 | /// itself, since reducing or expanding the set of meta characters would be a |
| 44 | /// breaking change for not just `regex-syntax` but also `regex` itself.) |
| 45 | fn is_meta_character(c: char) -> bool { |
| 46 | match c { |
| 47 | ' \\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' |
| 48 | | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, |
| 49 | _ => false, |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | /// Returns true if the given character can be escaped in a regex. |
| 54 | /// |
| 55 | /// This returns true in all cases that `is_meta_character` returns true, but |
| 56 | /// also returns true in some cases where `is_meta_character` returns false. |
| 57 | /// For example, `%` is not a meta character, but it is escapeable. That is, |
| 58 | /// `%` and `\%` both match a literal `%` in all contexts. |
| 59 | /// |
| 60 | /// The purpose of this routine is to provide knowledge about what characters |
| 61 | /// may be escaped. Namely, most regex engines permit "superfluous" escapes |
| 62 | /// where characters without any special significance may be escaped even |
| 63 | /// though there is no actual _need_ to do so. |
| 64 | /// |
| 65 | /// This will return false for some characters. For example, `e` is not |
| 66 | /// escapeable. Therefore, `\e` will either result in a parse error (which is |
| 67 | /// true today), or it could backwards compatibly evolve into a new construct |
| 68 | /// with its own meaning. Indeed, that is the purpose of banning _some_ |
| 69 | /// superfluous escapes: it provides a way to evolve the syntax in a compatible |
| 70 | /// manner. |
| 71 | fn is_escapeable_character(c: char) -> bool { |
| 72 | // Certainly escapeable if it's a meta character. |
| 73 | if is_meta_character(c) { |
| 74 | return true; |
| 75 | } |
| 76 | // Any character that isn't ASCII is definitely not escapeable. There's |
| 77 | // no real need to allow things like \☃ right? |
| 78 | if !c.is_ascii() { |
| 79 | return false; |
| 80 | } |
| 81 | // Otherwise, we basically say that everything is escapeable unless it's a |
| 82 | // letter or digit. Things like \3 are either octal (when enabled) or an |
| 83 | // error, and we should keep it that way. Otherwise, letters are reserved |
| 84 | // for adding new syntax in a backwards compatible way. |
| 85 | match c { |
| 86 | '0' ..='9' | 'A' ..='Z' | 'a' ..='z' => false, |
| 87 | // While not currently supported, we keep these as not escapeable to |
| 88 | // give us some flexibility with respect to supporting the \< and |
| 89 | // \> word boundary assertions in the future. By rejecting them as |
| 90 | // escapeable, \< and \> will result in a parse error. Thus, we can |
| 91 | // turn them into something else in the future without it being a |
| 92 | // backwards incompatible change. |
| 93 | '<' | '>' => false, |
| 94 | _ => true, |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | /// The configuration for a regex parser. |
| 99 | #[derive (Clone, Copy, Debug)] |
| 100 | pub(crate) struct Config { |
| 101 | /// The maximum number of times we're allowed to recurse. |
| 102 | /// |
| 103 | /// Note that unlike the regex-syntax parser, we actually use recursion in |
| 104 | /// this parser for simplicity. My hope is that by setting a conservative |
| 105 | /// default call limit and providing a way to configure it, that we can |
| 106 | /// keep this simplification. But if we must, we can re-work the parser to |
| 107 | /// put the call stack on the heap like regex-syntax does. |
| 108 | pub(crate) nest_limit: u32, |
| 109 | /// Various flags that control how a pattern is interpreted. |
| 110 | pub(crate) flags: Flags, |
| 111 | } |
| 112 | |
| 113 | impl Default for Config { |
| 114 | fn default() -> Config { |
| 115 | Config { nest_limit: 50, flags: Flags::default() } |
| 116 | } |
| 117 | } |
| 118 | |
| 119 | /// Various flags that control the interpretation of the pattern. |
| 120 | /// |
| 121 | /// These can be set via explicit configuration in code, or change dynamically |
| 122 | /// during parsing via inline flags. For example, `foo(?i:bar)baz` will match |
| 123 | /// `foo` and `baz` case sensitiviely and `bar` case insensitively (assuming a |
| 124 | /// default configuration). |
| 125 | #[derive (Clone, Copy, Debug, Default)] |
| 126 | pub(crate) struct Flags { |
| 127 | /// Whether to match case insensitively. |
| 128 | /// |
| 129 | /// This is the `i` flag. |
| 130 | pub(crate) case_insensitive: bool, |
| 131 | /// Whether `^` and `$` should be treated as line anchors or not. |
| 132 | /// |
| 133 | /// This is the `m` flag. |
| 134 | pub(crate) multi_line: bool, |
| 135 | /// Whether `.` should match line terminators or not. |
| 136 | /// |
| 137 | /// This is the `s` flag. |
| 138 | pub(crate) dot_matches_new_line: bool, |
| 139 | /// Whether to swap the meaning of greedy and non-greedy operators. |
| 140 | /// |
| 141 | /// This is the `U` flag. |
| 142 | pub(crate) swap_greed: bool, |
| 143 | /// Whether to enable CRLF mode. |
| 144 | /// |
| 145 | /// This is the `R` flag. |
| 146 | pub(crate) crlf: bool, |
| 147 | /// Whether to ignore whitespace. i.e., verbose mode. |
| 148 | /// |
| 149 | /// This is the `x` flag. |
| 150 | pub(crate) ignore_whitespace: bool, |
| 151 | } |
| 152 | |
| 153 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 154 | pub(crate) struct Hir { |
| 155 | kind: HirKind, |
| 156 | is_start_anchored: bool, |
| 157 | is_match_empty: bool, |
| 158 | static_explicit_captures_len: Option<usize>, |
| 159 | } |
| 160 | |
| 161 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 162 | pub(crate) enum HirKind { |
| 163 | Empty, |
| 164 | Char(char), |
| 165 | Class(Class), |
| 166 | Look(Look), |
| 167 | Repetition(Repetition), |
| 168 | Capture(Capture), |
| 169 | Concat(Vec<Hir>), |
| 170 | Alternation(Vec<Hir>), |
| 171 | } |
| 172 | |
| 173 | impl Hir { |
| 174 | /// Parses the given pattern string with the given configuration into a |
| 175 | /// structured representation. If the pattern is invalid, then an error |
| 176 | /// is returned. |
| 177 | pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> { |
| 178 | self::parse::Parser::new(config, pattern).parse() |
| 179 | } |
| 180 | |
| 181 | /// Returns the underlying kind of this high-level intermediate |
| 182 | /// representation. |
| 183 | /// |
| 184 | /// Note that there is explicitly no way to build an `Hir` directly from |
| 185 | /// an `HirKind`. If you need to do that, then you must do case analysis |
| 186 | /// on the `HirKind` and call the appropriate smart constructor on `Hir`. |
| 187 | pub(crate) fn kind(&self) -> &HirKind { |
| 188 | &self.kind |
| 189 | } |
| 190 | |
| 191 | /// Returns true if and only if this Hir expression can only match at the |
| 192 | /// beginning of a haystack. |
| 193 | pub(crate) fn is_start_anchored(&self) -> bool { |
| 194 | self.is_start_anchored |
| 195 | } |
| 196 | |
| 197 | /// Returns true if and only if this Hir expression can match the empty |
| 198 | /// string. |
| 199 | pub(crate) fn is_match_empty(&self) -> bool { |
| 200 | self.is_match_empty |
| 201 | } |
| 202 | |
| 203 | /// If the pattern always reports the same number of matching capture groups |
| 204 | /// for every match, then this returns the number of those groups. This |
| 205 | /// doesn't include the implicit group found in every pattern. |
| 206 | pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> { |
| 207 | self.static_explicit_captures_len |
| 208 | } |
| 209 | |
| 210 | fn fail() -> Hir { |
| 211 | let kind = HirKind::Class(Class { ranges: vec![] }); |
| 212 | Hir { |
| 213 | kind, |
| 214 | is_start_anchored: false, |
| 215 | is_match_empty: false, |
| 216 | static_explicit_captures_len: Some(0), |
| 217 | } |
| 218 | } |
| 219 | |
| 220 | fn empty() -> Hir { |
| 221 | let kind = HirKind::Empty; |
| 222 | Hir { |
| 223 | kind, |
| 224 | is_start_anchored: false, |
| 225 | is_match_empty: true, |
| 226 | static_explicit_captures_len: Some(0), |
| 227 | } |
| 228 | } |
| 229 | |
| 230 | fn char(ch: char) -> Hir { |
| 231 | let kind = HirKind::Char(ch); |
| 232 | Hir { |
| 233 | kind, |
| 234 | is_start_anchored: false, |
| 235 | is_match_empty: false, |
| 236 | static_explicit_captures_len: Some(0), |
| 237 | } |
| 238 | } |
| 239 | |
| 240 | fn class(class: Class) -> Hir { |
| 241 | let kind = HirKind::Class(class); |
| 242 | Hir { |
| 243 | kind, |
| 244 | is_start_anchored: false, |
| 245 | is_match_empty: false, |
| 246 | static_explicit_captures_len: Some(0), |
| 247 | } |
| 248 | } |
| 249 | |
| 250 | fn look(look: Look) -> Hir { |
| 251 | let kind = HirKind::Look(look); |
| 252 | Hir { |
| 253 | kind, |
| 254 | is_start_anchored: matches!(look, Look::Start), |
| 255 | is_match_empty: true, |
| 256 | static_explicit_captures_len: Some(0), |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | fn repetition(rep: Repetition) -> Hir { |
| 261 | if rep.min == 0 && rep.max == Some(0) { |
| 262 | return Hir::empty(); |
| 263 | } else if rep.min == 1 && rep.max == Some(1) { |
| 264 | return *rep.sub; |
| 265 | } |
| 266 | let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored; |
| 267 | let is_match_empty = rep.min == 0 || rep.sub.is_match_empty; |
| 268 | let mut static_explicit_captures_len = |
| 269 | rep.sub.static_explicit_captures_len; |
| 270 | // If the static captures len of the sub-expression is not known or |
| 271 | // is greater than zero, then it automatically propagates to the |
| 272 | // repetition, regardless of the repetition. Otherwise, it might |
| 273 | // change, but only when the repetition can match 0 times. |
| 274 | if rep.min == 0 |
| 275 | && static_explicit_captures_len.map_or(false, |len| len > 0) |
| 276 | { |
| 277 | // If we require a match 0 times, then our captures len is |
| 278 | // guaranteed to be zero. Otherwise, if we *can* match the empty |
| 279 | // string, then it's impossible to know how many captures will be |
| 280 | // in the resulting match. |
| 281 | if rep.max == Some(0) { |
| 282 | static_explicit_captures_len = Some(0); |
| 283 | } else { |
| 284 | static_explicit_captures_len = None; |
| 285 | } |
| 286 | } |
| 287 | Hir { |
| 288 | kind: HirKind::Repetition(rep), |
| 289 | is_start_anchored, |
| 290 | is_match_empty, |
| 291 | static_explicit_captures_len, |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | fn capture(cap: Capture) -> Hir { |
| 296 | let is_start_anchored = cap.sub.is_start_anchored; |
| 297 | let is_match_empty = cap.sub.is_match_empty; |
| 298 | let static_explicit_captures_len = cap |
| 299 | .sub |
| 300 | .static_explicit_captures_len |
| 301 | .map(|len| len.saturating_add(1)); |
| 302 | let kind = HirKind::Capture(cap); |
| 303 | Hir { |
| 304 | kind, |
| 305 | is_start_anchored, |
| 306 | is_match_empty, |
| 307 | static_explicit_captures_len, |
| 308 | } |
| 309 | } |
| 310 | |
| 311 | fn concat(mut subs: Vec<Hir>) -> Hir { |
| 312 | if subs.is_empty() { |
| 313 | Hir::empty() |
| 314 | } else if subs.len() == 1 { |
| 315 | subs.pop().unwrap() |
| 316 | } else { |
| 317 | let is_start_anchored = subs[0].is_start_anchored; |
| 318 | let mut is_match_empty = true; |
| 319 | let mut static_explicit_captures_len = Some(0usize); |
| 320 | for sub in subs.iter() { |
| 321 | is_match_empty = is_match_empty && sub.is_match_empty; |
| 322 | static_explicit_captures_len = static_explicit_captures_len |
| 323 | .and_then(|len1| { |
| 324 | Some((len1, sub.static_explicit_captures_len?)) |
| 325 | }) |
| 326 | .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); |
| 327 | } |
| 328 | Hir { |
| 329 | kind: HirKind::Concat(subs), |
| 330 | is_start_anchored, |
| 331 | is_match_empty, |
| 332 | static_explicit_captures_len, |
| 333 | } |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | fn alternation(mut subs: Vec<Hir>) -> Hir { |
| 338 | if subs.is_empty() { |
| 339 | Hir::fail() |
| 340 | } else if subs.len() == 1 { |
| 341 | subs.pop().unwrap() |
| 342 | } else { |
| 343 | let mut it = subs.iter().peekable(); |
| 344 | let mut is_start_anchored = |
| 345 | it.peek().map_or(false, |sub| sub.is_start_anchored); |
| 346 | let mut is_match_empty = |
| 347 | it.peek().map_or(false, |sub| sub.is_match_empty); |
| 348 | let mut static_explicit_captures_len = |
| 349 | it.peek().and_then(|sub| sub.static_explicit_captures_len); |
| 350 | for sub in it { |
| 351 | is_start_anchored = is_start_anchored && sub.is_start_anchored; |
| 352 | is_match_empty = is_match_empty || sub.is_match_empty; |
| 353 | if static_explicit_captures_len |
| 354 | != sub.static_explicit_captures_len |
| 355 | { |
| 356 | static_explicit_captures_len = None; |
| 357 | } |
| 358 | } |
| 359 | Hir { |
| 360 | kind: HirKind::Alternation(subs), |
| 361 | is_start_anchored, |
| 362 | is_match_empty, |
| 363 | static_explicit_captures_len, |
| 364 | } |
| 365 | } |
| 366 | } |
| 367 | } |
| 368 | |
| 369 | impl HirKind { |
| 370 | /// Returns a slice of this kind's sub-expressions, if any. |
| 371 | fn subs(&self) -> &[Hir] { |
| 372 | use core::slice::from_ref; |
| 373 | |
| 374 | match *self { |
| 375 | HirKind::Empty |
| 376 | | HirKind::Char(_) |
| 377 | | HirKind::Class(_) |
| 378 | | HirKind::Look(_) => &[], |
| 379 | HirKind::Repetition(Repetition { ref sub: &Box, .. }) => from_ref(sub), |
| 380 | HirKind::Capture(Capture { ref sub: &Box, .. }) => from_ref(sub), |
| 381 | HirKind::Concat(ref subs: &Vec) => subs, |
| 382 | HirKind::Alternation(ref subs: &Vec) => subs, |
| 383 | } |
| 384 | } |
| 385 | } |
| 386 | |
| 387 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 388 | pub(crate) struct Class { |
| 389 | pub(crate) ranges: Vec<ClassRange>, |
| 390 | } |
| 391 | |
| 392 | impl Class { |
| 393 | /// Create a new class from the given ranges. The ranges may be provided |
| 394 | /// in any order or may even overlap. They will be automatically |
| 395 | /// canonicalized. |
| 396 | fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class { |
| 397 | let mut class = Class { ranges: ranges.into_iter().collect() }; |
| 398 | class.canonicalize(); |
| 399 | class |
| 400 | } |
| 401 | |
| 402 | /// Expand this class such that it matches the ASCII codepoints in this set |
| 403 | /// case insensitively. |
| 404 | fn ascii_case_fold(&mut self) { |
| 405 | let len = self.ranges.len(); |
| 406 | for i in 0..len { |
| 407 | if let Some(folded) = self.ranges[i].ascii_case_fold() { |
| 408 | self.ranges.push(folded); |
| 409 | } |
| 410 | } |
| 411 | self.canonicalize(); |
| 412 | } |
| 413 | |
| 414 | /// Negate this set. |
| 415 | /// |
| 416 | /// For all `x` where `x` is any element, if `x` was in this set, then it |
| 417 | /// will not be in this set after negation. |
| 418 | fn negate(&mut self) { |
| 419 | const MIN: char = ' \x00' ; |
| 420 | const MAX: char = char::MAX; |
| 421 | |
| 422 | if self.ranges.is_empty() { |
| 423 | self.ranges.push(ClassRange { start: MIN, end: MAX }); |
| 424 | return; |
| 425 | } |
| 426 | |
| 427 | // There should be a way to do this in-place with constant memory, |
| 428 | // but I couldn't figure out a simple way to do it. So just append |
| 429 | // the negation to the end of this range, and then drain it before |
| 430 | // we're done. |
| 431 | let drain_end = self.ranges.len(); |
| 432 | |
| 433 | // If our class doesn't start the minimum possible char, then negation |
| 434 | // needs to include all codepoints up to the minimum in this set. |
| 435 | if self.ranges[0].start > MIN { |
| 436 | self.ranges.push(ClassRange { |
| 437 | start: MIN, |
| 438 | // OK because we know it's bigger than MIN. |
| 439 | end: prev_char(self.ranges[0].start).unwrap(), |
| 440 | }); |
| 441 | } |
| 442 | for i in 1..drain_end { |
| 443 | // let lower = self.ranges[i - 1].upper().increment(); |
| 444 | // let upper = self.ranges[i].lower().decrement(); |
| 445 | // self.ranges.push(I::create(lower, upper)); |
| 446 | self.ranges.push(ClassRange { |
| 447 | // OK because we know i-1 is never the last range and therefore |
| 448 | // there must be a range greater than it. It therefore follows |
| 449 | // that 'end' can never be char::MAX, and thus there must be |
| 450 | // a next char. |
| 451 | start: next_char(self.ranges[i - 1].end).unwrap(), |
| 452 | // Since 'i' is guaranteed to never be the first range, it |
| 453 | // follows that there is always a range before this and thus |
| 454 | // 'start' can never be '\x00'. Thus, there must be a previous |
| 455 | // char. |
| 456 | end: prev_char(self.ranges[i].start).unwrap(), |
| 457 | }); |
| 458 | } |
| 459 | if self.ranges[drain_end - 1].end < MAX { |
| 460 | // let lower = self.ranges[drain_end - 1].upper().increment(); |
| 461 | // self.ranges.push(I::create(lower, I::Bound::max_value())); |
| 462 | self.ranges.push(ClassRange { |
| 463 | // OK because we know 'end' is less than char::MAX, and thus |
| 464 | // there is a next char. |
| 465 | start: next_char(self.ranges[drain_end - 1].end).unwrap(), |
| 466 | end: MAX, |
| 467 | }); |
| 468 | } |
| 469 | self.ranges.drain(..drain_end); |
| 470 | // We don't need to canonicalize because we processed the ranges above |
| 471 | // in canonical order and the new ranges we added based on those are |
| 472 | // also necessarily in canonical order. |
| 473 | } |
| 474 | |
| 475 | /// Converts this set into a canonical ordering. |
| 476 | fn canonicalize(&mut self) { |
| 477 | if self.is_canonical() { |
| 478 | return; |
| 479 | } |
| 480 | self.ranges.sort(); |
| 481 | assert!(!self.ranges.is_empty()); |
| 482 | |
| 483 | // Is there a way to do this in-place with constant memory? I couldn't |
| 484 | // figure out a way to do it. So just append the canonicalization to |
| 485 | // the end of this range, and then drain it before we're done. |
| 486 | let drain_end = self.ranges.len(); |
| 487 | for oldi in 0..drain_end { |
| 488 | // If we've added at least one new range, then check if we can |
| 489 | // merge this range in the previously added range. |
| 490 | if self.ranges.len() > drain_end { |
| 491 | let (last, rest) = self.ranges.split_last_mut().unwrap(); |
| 492 | if let Some(union) = last.union(&rest[oldi]) { |
| 493 | *last = union; |
| 494 | continue; |
| 495 | } |
| 496 | } |
| 497 | self.ranges.push(self.ranges[oldi]); |
| 498 | } |
| 499 | self.ranges.drain(..drain_end); |
| 500 | } |
| 501 | |
| 502 | /// Returns true if and only if this class is in a canonical ordering. |
| 503 | fn is_canonical(&self) -> bool { |
| 504 | for pair in self.ranges.windows(2) { |
| 505 | if pair[0] >= pair[1] { |
| 506 | return false; |
| 507 | } |
| 508 | if pair[0].is_contiguous(&pair[1]) { |
| 509 | return false; |
| 510 | } |
| 511 | } |
| 512 | true |
| 513 | } |
| 514 | } |
| 515 | |
| 516 | #[derive (Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)] |
| 517 | pub(crate) struct ClassRange { |
| 518 | pub(crate) start: char, |
| 519 | pub(crate) end: char, |
| 520 | } |
| 521 | |
| 522 | impl ClassRange { |
| 523 | /// Apply simple case folding to this byte range. Only ASCII case mappings |
| 524 | /// (for A-Za-z) are applied. |
| 525 | /// |
| 526 | /// Additional ranges are appended to the given vector. Canonical ordering |
| 527 | /// is *not* maintained in the given vector. |
| 528 | fn ascii_case_fold(&self) -> Option<ClassRange> { |
| 529 | if !(ClassRange { start: 'a' , end: 'z' }).is_intersection_empty(self) { |
| 530 | let start = core::cmp::max(self.start, 'a' ); |
| 531 | let end = core::cmp::min(self.end, 'z' ); |
| 532 | return Some(ClassRange { |
| 533 | start: char::try_from(u32::from(start) - 32).unwrap(), |
| 534 | end: char::try_from(u32::from(end) - 32).unwrap(), |
| 535 | }); |
| 536 | } |
| 537 | if !(ClassRange { start: 'A' , end: 'Z' }).is_intersection_empty(self) { |
| 538 | let start = core::cmp::max(self.start, 'A' ); |
| 539 | let end = core::cmp::min(self.end, 'Z' ); |
| 540 | return Some(ClassRange { |
| 541 | start: char::try_from(u32::from(start) + 32).unwrap(), |
| 542 | end: char::try_from(u32::from(end) + 32).unwrap(), |
| 543 | }); |
| 544 | } |
| 545 | None |
| 546 | } |
| 547 | |
| 548 | /// Union the given overlapping range into this range. |
| 549 | /// |
| 550 | /// If the two ranges aren't contiguous, then this returns `None`. |
| 551 | fn union(&self, other: &ClassRange) -> Option<ClassRange> { |
| 552 | if !self.is_contiguous(other) { |
| 553 | return None; |
| 554 | } |
| 555 | let start = core::cmp::min(self.start, other.start); |
| 556 | let end = core::cmp::max(self.end, other.end); |
| 557 | Some(ClassRange { start, end }) |
| 558 | } |
| 559 | |
| 560 | /// Returns true if and only if the two ranges are contiguous. Two ranges |
| 561 | /// are contiguous if and only if the ranges are either overlapping or |
| 562 | /// adjacent. |
| 563 | fn is_contiguous(&self, other: &ClassRange) -> bool { |
| 564 | let (s1, e1) = (u32::from(self.start), u32::from(self.end)); |
| 565 | let (s2, e2) = (u32::from(other.start), u32::from(other.end)); |
| 566 | core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1) |
| 567 | } |
| 568 | |
| 569 | /// Returns true if and only if the intersection of this range and the |
| 570 | /// other range is empty. |
| 571 | fn is_intersection_empty(&self, other: &ClassRange) -> bool { |
| 572 | let (s1, e1) = (self.start, self.end); |
| 573 | let (s2, e2) = (other.start, other.end); |
| 574 | core::cmp::max(s1, s2) > core::cmp::min(e1, e2) |
| 575 | } |
| 576 | } |
| 577 | |
| 578 | /// The high-level intermediate representation for a look-around assertion. |
| 579 | /// |
| 580 | /// An assertion match is always zero-length. Also called an "empty match." |
| 581 | #[derive (Clone, Copy, Debug, Eq, PartialEq)] |
| 582 | pub(crate) enum Look { |
| 583 | /// Match the beginning of text. Specifically, this matches at the starting |
| 584 | /// position of the input. |
| 585 | Start = 1 << 0, |
| 586 | /// Match the end of text. Specifically, this matches at the ending |
| 587 | /// position of the input. |
| 588 | End = 1 << 1, |
| 589 | /// Match the beginning of a line or the beginning of text. Specifically, |
| 590 | /// this matches at the starting position of the input, or at the position |
| 591 | /// immediately following a `\n` character. |
| 592 | StartLF = 1 << 2, |
| 593 | /// Match the end of a line or the end of text. Specifically, this matches |
| 594 | /// at the end position of the input, or at the position immediately |
| 595 | /// preceding a `\n` character. |
| 596 | EndLF = 1 << 3, |
| 597 | /// Match the beginning of a line or the beginning of text. Specifically, |
| 598 | /// this matches at the starting position of the input, or at the position |
| 599 | /// immediately following either a `\r` or `\n` character, but never after |
| 600 | /// a `\r` when a `\n` follows. |
| 601 | StartCRLF = 1 << 4, |
| 602 | /// Match the end of a line or the end of text. Specifically, this matches |
| 603 | /// at the end position of the input, or at the position immediately |
| 604 | /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` |
| 605 | /// precedes it. |
| 606 | EndCRLF = 1 << 5, |
| 607 | /// Match an ASCII-only word boundary. That is, this matches a position |
| 608 | /// where the left adjacent character and right adjacent character |
| 609 | /// correspond to a word and non-word or a non-word and word character. |
| 610 | Word = 1 << 6, |
| 611 | /// Match an ASCII-only negation of a word boundary. |
| 612 | WordNegate = 1 << 7, |
| 613 | /// Match the start of an ASCII-only word boundary. That is, this matches a |
| 614 | /// position at either the beginning of the haystack or where the previous |
| 615 | /// character is not a word character and the following character is a word |
| 616 | /// character. |
| 617 | WordStart = 1 << 8, |
| 618 | /// Match the end of an ASCII-only word boundary. That is, this matches |
| 619 | /// a position at either the end of the haystack or where the previous |
| 620 | /// character is a word character and the following character is not a word |
| 621 | /// character. |
| 622 | WordEnd = 1 << 9, |
| 623 | /// Match the start half of an ASCII-only word boundary. That is, this |
| 624 | /// matches a position at either the beginning of the haystack or where the |
| 625 | /// previous character is not a word character. |
| 626 | WordStartHalf = 1 << 10, |
| 627 | /// Match the end half of an ASCII-only word boundary. That is, this |
| 628 | /// matches a position at either the end of the haystack or where the |
| 629 | /// following character is not a word character. |
| 630 | WordEndHalf = 1 << 11, |
| 631 | } |
| 632 | |
| 633 | impl Look { |
| 634 | /// Returns true if the given position in the given haystack matches this |
| 635 | /// look-around assertion. |
| 636 | pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool { |
| 637 | use self::Look::*; |
| 638 | |
| 639 | match *self { |
| 640 | Start => at == 0, |
| 641 | End => at == haystack.len(), |
| 642 | StartLF => at == 0 || haystack[at - 1] == b' \n' , |
| 643 | EndLF => at == haystack.len() || haystack[at] == b' \n' , |
| 644 | StartCRLF => { |
| 645 | at == 0 |
| 646 | || haystack[at - 1] == b' \n' |
| 647 | || (haystack[at - 1] == b' \r' |
| 648 | && (at >= haystack.len() || haystack[at] != b' \n' )) |
| 649 | } |
| 650 | EndCRLF => { |
| 651 | at == haystack.len() |
| 652 | || haystack[at] == b' \r' |
| 653 | || (haystack[at] == b' \n' |
| 654 | && (at == 0 || haystack[at - 1] != b' \r' )) |
| 655 | } |
| 656 | Word => { |
| 657 | let word_before = |
| 658 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 659 | let word_after = |
| 660 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 661 | word_before != word_after |
| 662 | } |
| 663 | WordNegate => { |
| 664 | let word_before = |
| 665 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 666 | let word_after = |
| 667 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 668 | word_before == word_after |
| 669 | } |
| 670 | WordStart => { |
| 671 | let word_before = |
| 672 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 673 | let word_after = |
| 674 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 675 | !word_before && word_after |
| 676 | } |
| 677 | WordEnd => { |
| 678 | let word_before = |
| 679 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 680 | let word_after = |
| 681 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 682 | word_before && !word_after |
| 683 | } |
| 684 | WordStartHalf => { |
| 685 | let word_before = |
| 686 | at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 687 | !word_before |
| 688 | } |
| 689 | WordEndHalf => { |
| 690 | let word_after = |
| 691 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 692 | !word_after |
| 693 | } |
| 694 | } |
| 695 | } |
| 696 | } |
| 697 | |
| 698 | /// The high-level intermediate representation of a repetition operator. |
| 699 | /// |
| 700 | /// A repetition operator permits the repetition of an arbitrary |
| 701 | /// sub-expression. |
| 702 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 703 | pub(crate) struct Repetition { |
| 704 | /// The minimum range of the repetition. |
| 705 | /// |
| 706 | /// Note that special cases like `?`, `+` and `*` all get translated into |
| 707 | /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. |
| 708 | /// |
| 709 | /// When `min` is zero, this expression can match the empty string |
| 710 | /// regardless of what its sub-expression is. |
| 711 | pub(crate) min: u32, |
| 712 | /// The maximum range of the repetition. |
| 713 | /// |
| 714 | /// Note that when `max` is `None`, `min` acts as a lower bound but where |
| 715 | /// there is no upper bound. For something like `x{5}` where the min and |
| 716 | /// max are equivalent, `min` will be set to `5` and `max` will be set to |
| 717 | /// `Some(5)`. |
| 718 | pub(crate) max: Option<u32>, |
| 719 | /// Whether this repetition operator is greedy or not. A greedy operator |
| 720 | /// will match as much as it can. A non-greedy operator will match as |
| 721 | /// little as it can. |
| 722 | /// |
| 723 | /// Typically, operators are greedy by default and are only non-greedy when |
| 724 | /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is |
| 725 | /// not. However, this can be inverted via the `U` "ungreedy" flag. |
| 726 | pub(crate) greedy: bool, |
| 727 | /// The expression being repeated. |
| 728 | pub(crate) sub: Box<Hir>, |
| 729 | } |
| 730 | |
| 731 | /// The high-level intermediate representation for a capturing group. |
| 732 | /// |
| 733 | /// A capturing group always has an index and a child expression. It may |
| 734 | /// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not |
| 735 | /// necessary. |
| 736 | /// |
| 737 | /// Note that there is no explicit representation of a non-capturing group |
| 738 | /// in a `Hir`. Instead, non-capturing grouping is handled automatically by |
| 739 | /// the recursive structure of the `Hir` itself. |
| 740 | #[derive (Clone, Debug, Eq, PartialEq)] |
| 741 | pub(crate) struct Capture { |
| 742 | /// The capture index of the capture. |
| 743 | pub(crate) index: u32, |
| 744 | /// The name of the capture, if it exists. |
| 745 | pub(crate) name: Option<Box<str>>, |
| 746 | /// The expression inside the capturing group, which may be empty. |
| 747 | pub(crate) sub: Box<Hir>, |
| 748 | } |
| 749 | |
| 750 | fn next_char(ch: char) -> Option<char> { |
| 751 | // Skip over the surrogate range. |
| 752 | if ch == ' \u{D7FF}' { |
| 753 | return Some(' \u{E000}' ); |
| 754 | } |
| 755 | // OK because char::MAX < u32::MAX and we handle U+D7FF above. |
| 756 | char::from_u32(u32::from(ch).checked_add(1).unwrap()) |
| 757 | } |
| 758 | |
| 759 | fn prev_char(ch: char) -> Option<char> { |
| 760 | // Skip over the surrogate range. |
| 761 | if ch == ' \u{E000}' { |
| 762 | return Some(' \u{D7FF}' ); |
| 763 | } |
| 764 | // OK because subtracting 1 from any valid scalar value other than 0 |
| 765 | // and U+E000 yields a valid scalar value. |
| 766 | Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap()) |
| 767 | } |
| 768 | |
| 769 | impl Drop for Hir { |
| 770 | fn drop(&mut self) { |
| 771 | use core::mem; |
| 772 | |
| 773 | match *self.kind() { |
| 774 | HirKind::Empty |
| 775 | | HirKind::Char(_) |
| 776 | | HirKind::Class(_) |
| 777 | | HirKind::Look(_) => return, |
| 778 | HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, |
| 779 | HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { |
| 780 | return |
| 781 | } |
| 782 | HirKind::Concat(ref x) if x.is_empty() => return, |
| 783 | HirKind::Alternation(ref x) if x.is_empty() => return, |
| 784 | _ => {} |
| 785 | } |
| 786 | |
| 787 | let mut stack = vec![mem::replace(self, Hir::empty())]; |
| 788 | while let Some(mut expr) = stack.pop() { |
| 789 | match expr.kind { |
| 790 | HirKind::Empty |
| 791 | | HirKind::Char(_) |
| 792 | | HirKind::Class(_) |
| 793 | | HirKind::Look(_) => {} |
| 794 | HirKind::Capture(ref mut x) => { |
| 795 | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
| 796 | } |
| 797 | HirKind::Repetition(ref mut x) => { |
| 798 | stack.push(mem::replace(&mut x.sub, Hir::empty())); |
| 799 | } |
| 800 | HirKind::Concat(ref mut x) => { |
| 801 | stack.extend(x.drain(..)); |
| 802 | } |
| 803 | HirKind::Alternation(ref mut x) => { |
| 804 | stack.extend(x.drain(..)); |
| 805 | } |
| 806 | } |
| 807 | } |
| 808 | } |
| 809 | } |
| 810 | |