| 1 | use core::{cell::RefCell, mem::size_of}; |
| 2 | |
| 3 | use alloc::{string::String, sync::Arc, vec, vec::Vec}; |
| 4 | |
| 5 | use crate::{ |
| 6 | error::Error, |
| 7 | hir::{self, Hir, HirKind}, |
| 8 | int::U32, |
| 9 | }; |
| 10 | |
| 11 | pub(crate) type StateID = u32; |
| 12 | |
| 13 | #[derive (Clone, Copy, Debug)] |
| 14 | pub(crate) struct Config { |
| 15 | pub(crate) size_limit: Option<usize>, |
| 16 | } |
| 17 | |
| 18 | impl Default for Config { |
| 19 | fn default() -> Config { |
| 20 | Config { size_limit: Some(10 * (1 << 20)) } |
| 21 | } |
| 22 | } |
| 23 | |
| 24 | #[derive (Clone)] |
| 25 | pub(crate) struct NFA { |
| 26 | /// The pattern string this NFA was generated from. |
| 27 | /// |
| 28 | /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯ |
| 29 | pattern: String, |
| 30 | /// The states that make up this NFA. |
| 31 | states: Vec<State>, |
| 32 | /// The ID of the start state. |
| 33 | start: StateID, |
| 34 | /// Whether this NFA can only match at the beginning of a haystack. |
| 35 | is_start_anchored: bool, |
| 36 | /// Whether this NFA can match the empty string. |
| 37 | is_match_empty: bool, |
| 38 | /// If every match has the same number of matching capture groups, then |
| 39 | /// this corresponds to the number of groups. |
| 40 | static_explicit_captures_len: Option<usize>, |
| 41 | /// A map from capture group name to its corresponding index. |
| 42 | cap_name_to_index: CaptureNameMap, |
| 43 | /// A map from capture group index to the corresponding name, if one |
| 44 | /// exists. |
| 45 | cap_index_to_name: Vec<Option<Arc<str>>>, |
| 46 | /// Heap memory used indirectly by NFA states and other things (like the |
| 47 | /// various capturing group representations above). Since each state |
| 48 | /// might use a different amount of heap, we need to keep track of this |
| 49 | /// incrementally. |
| 50 | memory_extra: usize, |
| 51 | } |
| 52 | |
| 53 | impl NFA { |
| 54 | /// Creates a new NFA from the given configuration and HIR. |
| 55 | pub(crate) fn new( |
| 56 | config: Config, |
| 57 | pattern: String, |
| 58 | hir: &Hir, |
| 59 | ) -> Result<NFA, Error> { |
| 60 | Compiler::new(config, pattern).compile(hir) |
| 61 | } |
| 62 | |
| 63 | /// Returns the pattern string used to construct this NFA. |
| 64 | pub(crate) fn pattern(&self) -> &str { |
| 65 | &self.pattern |
| 66 | } |
| 67 | |
| 68 | /// Returns the state corresponding to the given ID. |
| 69 | /// |
| 70 | /// # Panics |
| 71 | /// |
| 72 | /// If the ID does not refer to a valid state, then this panics. |
| 73 | pub(crate) fn state(&self, id: StateID) -> &State { |
| 74 | &self.states[id.as_usize()] |
| 75 | } |
| 76 | |
| 77 | /// Returns the total number of states in this NFA. |
| 78 | pub(crate) fn len(&self) -> usize { |
| 79 | self.states.len() |
| 80 | } |
| 81 | |
| 82 | /// Returns the ID of the starting state for this NFA. |
| 83 | pub(crate) fn start(&self) -> StateID { |
| 84 | self.start |
| 85 | } |
| 86 | |
| 87 | /// Returns the capture group index for the corresponding named group. |
| 88 | /// If no such group with the given name exists, then `None` is returned. |
| 89 | pub(crate) fn to_index(&self, name: &str) -> Option<usize> { |
| 90 | self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) |
| 91 | } |
| 92 | |
| 93 | /* |
| 94 | /// Returns the capture group name for the corresponding index. |
| 95 | /// If no such group with the given index, then `None` is returned. |
| 96 | pub(crate) fn to_name(&self, index: usize) -> Option<&str> { |
| 97 | self.cap_index_to_name.get(index)?.as_deref() |
| 98 | } |
| 99 | */ |
| 100 | |
| 101 | /// Returns an iterator over all of the capture groups, along with their |
| 102 | /// names if they exist, in this NFA. |
| 103 | pub(crate) fn capture_names(&self) -> CaptureNames<'_> { |
| 104 | CaptureNames { it: self.cap_index_to_name.iter() } |
| 105 | } |
| 106 | |
| 107 | /// Returns the total number of capture groups, including the first and |
| 108 | /// implicit group, in this NFA. |
| 109 | pub(crate) fn group_len(&self) -> usize { |
| 110 | self.cap_index_to_name.len() |
| 111 | } |
| 112 | |
| 113 | /// Returns true if and only if this NFA can only match at the beginning of |
| 114 | /// a haystack. |
| 115 | pub(crate) fn is_start_anchored(&self) -> bool { |
| 116 | self.is_start_anchored |
| 117 | } |
| 118 | |
| 119 | /// If the pattern always reports the same number of matching capture groups |
| 120 | /// for every match, then this returns the number of those groups. This |
| 121 | /// doesn't include the implicit group found in every pattern. |
| 122 | pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> { |
| 123 | self.static_explicit_captures_len |
| 124 | } |
| 125 | |
| 126 | /// Returns the heap memory usage, in bytes, used by this NFA. |
| 127 | fn memory_usage(&self) -> usize { |
| 128 | (self.states.len() * size_of::<State>()) |
| 129 | + (self.cap_index_to_name.len() * size_of::<Option<Arc<str>>>()) |
| 130 | + self.memory_extra |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | impl core::fmt::Debug for NFA { |
| 135 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| 136 | writeln!(f, "NFA(" )?; |
| 137 | writeln!(f, "pattern: {}" , self.pattern)?; |
| 138 | for (sid: usize, state: &State) in self.states.iter().enumerate() { |
| 139 | writeln!(f, " {:07?}: {:?}" , sid, state)?; |
| 140 | } |
| 141 | writeln!(f, ")" )?; |
| 142 | Ok(()) |
| 143 | } |
| 144 | } |
| 145 | |
| 146 | /// An iterator over all capture groups in an NFA. |
| 147 | /// |
| 148 | /// If a particular group has a name, then it is yielded. Otherwise, `None` |
| 149 | /// is yielded. |
| 150 | #[derive (Clone, Debug)] |
| 151 | pub(crate) struct CaptureNames<'a> { |
| 152 | it: core::slice::Iter<'a, Option<Arc<str>>>, |
| 153 | } |
| 154 | |
| 155 | impl<'a> Iterator for CaptureNames<'a> { |
| 156 | type Item = Option<&'a str>; |
| 157 | |
| 158 | fn next(&mut self) -> Option<Option<&'a str>> { |
| 159 | self.it.next().map(|n: &'a Option>| n.as_deref()) |
| 160 | } |
| 161 | } |
| 162 | |
| 163 | #[derive (Clone, Eq, PartialEq)] |
| 164 | pub(crate) enum State { |
| 165 | Char { target: StateID, ch: char }, |
| 166 | Ranges { target: StateID, ranges: Vec<(char, char)> }, |
| 167 | Splits { targets: Vec<StateID>, reverse: bool }, |
| 168 | Goto { target: StateID, look: Option<hir::Look> }, |
| 169 | Capture { target: StateID, slot: u32 }, |
| 170 | Fail, |
| 171 | Match, |
| 172 | } |
| 173 | |
| 174 | impl State { |
| 175 | /// Returns the heap memory usage of this NFA state in bytes. |
| 176 | fn memory_usage(&self) -> usize { |
| 177 | match *self { |
| 178 | State::Char { .. } |
| 179 | | State::Goto { .. } |
| 180 | | State::Capture { .. } |
| 181 | | State::Fail { .. } |
| 182 | | State::Match => 0, |
| 183 | State::Splits { ref targets, .. } => { |
| 184 | targets.len() * size_of::<StateID>() |
| 185 | } |
| 186 | State::Ranges { ref ranges, .. } => { |
| 187 | ranges.len() * size_of::<(char, char)>() |
| 188 | } |
| 189 | } |
| 190 | } |
| 191 | |
| 192 | /// Returns an iterator over the given split targets. The order of the |
| 193 | /// iterator yields elements in reverse when `reverse` is true. |
| 194 | pub(crate) fn iter_splits<'a>( |
| 195 | splits: &'a [StateID], |
| 196 | reverse: bool, |
| 197 | ) -> impl Iterator<Item = StateID> + 'a { |
| 198 | let mut it = splits.iter(); |
| 199 | core::iter::from_fn(move || { |
| 200 | if reverse { it.next_back() } else { it.next() }.copied() |
| 201 | }) |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | impl core::fmt::Debug for State { |
| 206 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| 207 | match *self { |
| 208 | State::Char { target, ch } => { |
| 209 | write!(f, " {:?} => {:?}" , ch, target) |
| 210 | } |
| 211 | State::Ranges { target, ref ranges } => { |
| 212 | for (i, &(start, end)) in ranges.iter().enumerate() { |
| 213 | if i > 0 { |
| 214 | write!(f, ", " )?; |
| 215 | } |
| 216 | write!(f, " {:?}- {:?} => {:?}" , start, end, target)?; |
| 217 | } |
| 218 | Ok(()) |
| 219 | } |
| 220 | State::Splits { ref targets, reverse } => { |
| 221 | write!(f, "splits(" )?; |
| 222 | for (i, sid) in |
| 223 | State::iter_splits(targets, reverse).enumerate() |
| 224 | { |
| 225 | if i > 0 { |
| 226 | write!(f, ", " )?; |
| 227 | } |
| 228 | write!(f, " {:?}" , sid)?; |
| 229 | } |
| 230 | write!(f, ")" ) |
| 231 | } |
| 232 | State::Goto { target, look: None } => { |
| 233 | write!(f, "goto( {:?})" , target) |
| 234 | } |
| 235 | State::Goto { target, look: Some(look) } => { |
| 236 | write!(f, " {:?} => {:?}" , look, target) |
| 237 | } |
| 238 | State::Capture { target, slot } => { |
| 239 | write!(f, "capture(slot= {:?}) => {:?}" , slot, target,) |
| 240 | } |
| 241 | State::Fail => write!(f, "FAIL" ), |
| 242 | State::Match => { |
| 243 | write!(f, "MATCH" ) |
| 244 | } |
| 245 | } |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | /// A map from capture group name to its corresponding capture group index. |
| 250 | /// |
| 251 | /// We define a type alias here so that we can transparently use a `HashMap` |
| 252 | /// whenever it's available. We do so presumably because it's faster, although |
| 253 | /// there are no benchmarks verifying this. |
| 254 | #[cfg (feature = "std" )] |
| 255 | type CaptureNameMap = std::collections::HashMap<Arc<str>, u32>; |
| 256 | #[cfg (not(feature = "std" ))] |
| 257 | type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, u32>; |
| 258 | |
| 259 | #[derive (Debug)] |
| 260 | struct Compiler { |
| 261 | config: Config, |
| 262 | nfa: RefCell<NFA>, |
| 263 | } |
| 264 | |
| 265 | impl Compiler { |
| 266 | fn new(config: Config, pattern: String) -> Compiler { |
| 267 | let nfa = RefCell::new(NFA { |
| 268 | pattern, |
| 269 | states: vec![], |
| 270 | start: 0, |
| 271 | is_start_anchored: false, |
| 272 | is_match_empty: false, |
| 273 | static_explicit_captures_len: None, |
| 274 | cap_name_to_index: CaptureNameMap::default(), |
| 275 | cap_index_to_name: vec![], |
| 276 | memory_extra: 0, |
| 277 | }); |
| 278 | Compiler { config, nfa } |
| 279 | } |
| 280 | |
| 281 | fn compile(self, hir: &Hir) -> Result<NFA, Error> { |
| 282 | self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); |
| 283 | self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); |
| 284 | self.nfa.borrow_mut().static_explicit_captures_len = |
| 285 | hir.static_explicit_captures_len(); |
| 286 | let compiled = self.c_capture(0, None, hir)?; |
| 287 | let mat = self.add(State::Match)?; |
| 288 | self.patch(compiled.end, mat)?; |
| 289 | self.nfa.borrow_mut().start = compiled.start; |
| 290 | Ok(self.nfa.into_inner()) |
| 291 | } |
| 292 | |
| 293 | fn c(&self, hir: &Hir) -> Result<ThompsonRef, Error> { |
| 294 | match *hir.kind() { |
| 295 | HirKind::Empty => self.c_empty(), |
| 296 | HirKind::Char(ch) => self.c_char(ch), |
| 297 | HirKind::Class(ref class) => self.c_class(class), |
| 298 | HirKind::Look(ref look) => self.c_look(look), |
| 299 | HirKind::Repetition(ref rep) => self.c_repetition(rep), |
| 300 | HirKind::Capture(ref cap) => { |
| 301 | self.c_capture(cap.index, cap.name.as_deref(), &cap.sub) |
| 302 | } |
| 303 | HirKind::Concat(ref subs) => { |
| 304 | self.c_concat(subs.iter().map(|s| self.c(s))) |
| 305 | } |
| 306 | HirKind::Alternation(ref subs) => { |
| 307 | self.c_alternation(subs.iter().map(|s| self.c(s))) |
| 308 | } |
| 309 | } |
| 310 | } |
| 311 | |
| 312 | /// Compile a "fail" state that can never be transitioned out of. |
| 313 | fn c_fail(&self) -> Result<ThompsonRef, Error> { |
| 314 | let id = self.add(State::Fail)?; |
| 315 | Ok(ThompsonRef { start: id, end: id }) |
| 316 | } |
| 317 | |
| 318 | /// Compile an "empty" state with one unconditional epsilon transition. |
| 319 | /// |
| 320 | /// Both the `start` and `end` locations point to the state created. |
| 321 | /// Callers will likely want to keep the `start`, but patch the `end` to |
| 322 | /// point to some other state. |
| 323 | fn c_empty(&self) -> Result<ThompsonRef, Error> { |
| 324 | let id = self.add_empty()?; |
| 325 | Ok(ThompsonRef { start: id, end: id }) |
| 326 | } |
| 327 | |
| 328 | /// Compile the given literal char to an NFA. |
| 329 | fn c_char(&self, ch: char) -> Result<ThompsonRef, Error> { |
| 330 | let id = self.add(State::Char { target: 0, ch })?; |
| 331 | Ok(ThompsonRef { start: id, end: id }) |
| 332 | } |
| 333 | |
| 334 | /// Compile the given character class into an NFA. |
| 335 | /// |
| 336 | /// If the class is empty, then this compiles to a `Fail` state. |
| 337 | fn c_class(&self, class: &hir::Class) -> Result<ThompsonRef, Error> { |
| 338 | let id = if class.ranges.is_empty() { |
| 339 | // Technically using an explicit fail state probably isn't |
| 340 | // necessary. Because if you try to match against an empty Ranges, |
| 341 | // then it should turn up with nothing regardless of input, and |
| 342 | // thus "acts" like a Fail state. But it's better to be more |
| 343 | // explicit, and there's no real cost to doing so. |
| 344 | self.add(State::Fail) |
| 345 | } else { |
| 346 | let ranges = |
| 347 | class.ranges.iter().map(|r| (r.start, r.end)).collect(); |
| 348 | self.add(State::Ranges { target: 0, ranges }) |
| 349 | }?; |
| 350 | Ok(ThompsonRef { start: id, end: id }) |
| 351 | } |
| 352 | |
| 353 | /// Compile the given HIR look-around assertion to an NFA look-around |
| 354 | /// assertion. |
| 355 | fn c_look(&self, look: &hir::Look) -> Result<ThompsonRef, Error> { |
| 356 | let id = self.add(State::Goto { target: 0, look: Some(*look) })?; |
| 357 | Ok(ThompsonRef { start: id, end: id }) |
| 358 | } |
| 359 | |
| 360 | /// Compile the given repetition expression. This handles all types of |
| 361 | /// repetitions and greediness. |
| 362 | fn c_repetition( |
| 363 | &self, |
| 364 | rep: &hir::Repetition, |
| 365 | ) -> Result<ThompsonRef, Error> { |
| 366 | match (rep.min, rep.max) { |
| 367 | (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), |
| 368 | (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), |
| 369 | (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), |
| 370 | (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), |
| 371 | } |
| 372 | } |
| 373 | |
| 374 | /// Compile the given expression such that it matches at least `min` times, |
| 375 | /// but no more than `max` times. |
| 376 | /// |
| 377 | /// When `greedy` is true, then the preference is for the expression to |
| 378 | /// match as much as possible. Otherwise, it will match as little as |
| 379 | /// possible. |
| 380 | fn c_bounded( |
| 381 | &self, |
| 382 | hir: &Hir, |
| 383 | greedy: bool, |
| 384 | min: u32, |
| 385 | max: u32, |
| 386 | ) -> Result<ThompsonRef, Error> { |
| 387 | let prefix = self.c_exactly(hir, min)?; |
| 388 | if min == max { |
| 389 | return Ok(prefix); |
| 390 | } |
| 391 | |
| 392 | // It is tempting here to compile the rest here as a concatenation |
| 393 | // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it |
| 394 | // were `aaa?a?a?`. The problem here is that it leads to this program: |
| 395 | // |
| 396 | // >000000: 61 => 01 |
| 397 | // 000001: 61 => 02 |
| 398 | // 000002: union(03, 04) |
| 399 | // 000003: 61 => 04 |
| 400 | // 000004: union(05, 06) |
| 401 | // 000005: 61 => 06 |
| 402 | // 000006: union(07, 08) |
| 403 | // 000007: 61 => 08 |
| 404 | // 000008: MATCH |
| 405 | // |
| 406 | // And effectively, once you hit state 2, the epsilon closure will |
| 407 | // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better |
| 408 | // to instead compile it like so: |
| 409 | // |
| 410 | // >000000: 61 => 01 |
| 411 | // 000001: 61 => 02 |
| 412 | // 000002: union(03, 08) |
| 413 | // 000003: 61 => 04 |
| 414 | // 000004: union(05, 08) |
| 415 | // 000005: 61 => 06 |
| 416 | // 000006: union(07, 08) |
| 417 | // 000007: 61 => 08 |
| 418 | // 000008: MATCH |
| 419 | // |
| 420 | // So that the epsilon closure of state 2 is now just 3 and 8. |
| 421 | let empty = self.add_empty()?; |
| 422 | let mut prev_end = prefix.end; |
| 423 | for _ in min..max { |
| 424 | let splits = |
| 425 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| 426 | let compiled = self.c(hir)?; |
| 427 | self.patch(prev_end, splits)?; |
| 428 | self.patch(splits, compiled.start)?; |
| 429 | self.patch(splits, empty)?; |
| 430 | prev_end = compiled.end; |
| 431 | } |
| 432 | self.patch(prev_end, empty)?; |
| 433 | Ok(ThompsonRef { start: prefix.start, end: empty }) |
| 434 | } |
| 435 | |
| 436 | /// Compile the given expression such that it may be matched `n` or more |
| 437 | /// times, where `n` can be any integer. (Although a particularly large |
| 438 | /// integer is likely to run afoul of any configured size limits.) |
| 439 | /// |
| 440 | /// When `greedy` is true, then the preference is for the expression to |
| 441 | /// match as much as possible. Otherwise, it will match as little as |
| 442 | /// possible. |
| 443 | fn c_at_least( |
| 444 | &self, |
| 445 | hir: &Hir, |
| 446 | greedy: bool, |
| 447 | n: u32, |
| 448 | ) -> Result<ThompsonRef, Error> { |
| 449 | if n == 0 { |
| 450 | // When the expression cannot match the empty string, then we |
| 451 | // can get away with something much simpler: just one 'alt' |
| 452 | // instruction that optionally repeats itself. But if the expr |
| 453 | // can match the empty string... see below. |
| 454 | if !hir.is_match_empty() { |
| 455 | let splits = self.add(State::Splits { |
| 456 | targets: vec![], |
| 457 | reverse: !greedy, |
| 458 | })?; |
| 459 | let compiled = self.c(hir)?; |
| 460 | self.patch(splits, compiled.start)?; |
| 461 | self.patch(compiled.end, splits)?; |
| 462 | return Ok(ThompsonRef { start: splits, end: splits }); |
| 463 | } |
| 464 | |
| 465 | // What's going on here? Shouldn't x* be simpler than this? It |
| 466 | // turns out that when implementing leftmost-first (Perl-like) |
| 467 | // match semantics, x* results in an incorrect preference order |
| 468 | // when computing the transitive closure of states if and only if |
| 469 | // 'x' can match the empty string. So instead, we compile x* as |
| 470 | // (x+)?, which preserves the correct preference order. |
| 471 | // |
| 472 | // See: https://github.com/rust-lang/regex/issues/779 |
| 473 | let compiled = self.c(hir)?; |
| 474 | let plus = |
| 475 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| 476 | self.patch(compiled.end, plus)?; |
| 477 | self.patch(plus, compiled.start)?; |
| 478 | |
| 479 | let question = |
| 480 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| 481 | let empty = self.add_empty()?; |
| 482 | self.patch(question, compiled.start)?; |
| 483 | self.patch(question, empty)?; |
| 484 | self.patch(plus, empty)?; |
| 485 | Ok(ThompsonRef { start: question, end: empty }) |
| 486 | } else if n == 1 { |
| 487 | let compiled = self.c(hir)?; |
| 488 | let splits = |
| 489 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| 490 | self.patch(compiled.end, splits)?; |
| 491 | self.patch(splits, compiled.start)?; |
| 492 | Ok(ThompsonRef { start: compiled.start, end: splits }) |
| 493 | } else { |
| 494 | let prefix = self.c_exactly(hir, n - 1)?; |
| 495 | let last = self.c(hir)?; |
| 496 | let splits = |
| 497 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| 498 | self.patch(prefix.end, last.start)?; |
| 499 | self.patch(last.end, splits)?; |
| 500 | self.patch(splits, last.start)?; |
| 501 | Ok(ThompsonRef { start: prefix.start, end: splits }) |
| 502 | } |
| 503 | } |
| 504 | |
| 505 | /// Compile the given expression such that it may be matched zero or one |
| 506 | /// times. |
| 507 | /// |
| 508 | /// When `greedy` is true, then the preference is for the expression to |
| 509 | /// match as much as possible. Otherwise, it will match as little as |
| 510 | /// possible. |
| 511 | fn c_zero_or_one( |
| 512 | &self, |
| 513 | hir: &Hir, |
| 514 | greedy: bool, |
| 515 | ) -> Result<ThompsonRef, Error> { |
| 516 | let splits = |
| 517 | self.add(State::Splits { targets: vec![], reverse: !greedy })?; |
| 518 | let compiled = self.c(hir)?; |
| 519 | let empty = self.add_empty()?; |
| 520 | self.patch(splits, compiled.start)?; |
| 521 | self.patch(splits, empty)?; |
| 522 | self.patch(compiled.end, empty)?; |
| 523 | Ok(ThompsonRef { start: splits, end: empty }) |
| 524 | } |
| 525 | |
| 526 | /// Compile the given HIR expression exactly `n` times. |
| 527 | fn c_exactly(&self, hir: &Hir, n: u32) -> Result<ThompsonRef, Error> { |
| 528 | self.c_concat((0..n).map(|_| self.c(hir))) |
| 529 | } |
| 530 | |
| 531 | /// Compile the given expression and insert capturing states at the |
| 532 | /// beginning and end of it. The slot for the capture states is computed |
| 533 | /// from the index. |
| 534 | fn c_capture( |
| 535 | &self, |
| 536 | index: u32, |
| 537 | name: Option<&str>, |
| 538 | hir: &Hir, |
| 539 | ) -> Result<ThompsonRef, Error> { |
| 540 | // For discontiguous indices, push placeholders for earlier capture |
| 541 | // groups that weren't explicitly added. This can happen, for example, |
| 542 | // with patterns like '(a){0}(a)' where '(a){0}' is completely removed |
| 543 | // from the pattern. |
| 544 | let existing_groups_len = self.nfa.borrow().cap_index_to_name.len(); |
| 545 | for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) { |
| 546 | self.nfa.borrow_mut().cap_index_to_name.push(None); |
| 547 | } |
| 548 | if index.as_usize() >= existing_groups_len { |
| 549 | if let Some(name) = name { |
| 550 | let name = Arc::from(name); |
| 551 | let mut nfa = self.nfa.borrow_mut(); |
| 552 | nfa.cap_name_to_index.insert(Arc::clone(&name), index); |
| 553 | nfa.cap_index_to_name.push(Some(Arc::clone(&name))); |
| 554 | // This is an approximation. |
| 555 | nfa.memory_extra += name.len() + size_of::<u32>(); |
| 556 | } else { |
| 557 | self.nfa.borrow_mut().cap_index_to_name.push(None); |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | let Some(slot) = index.checked_mul(2) else { |
| 562 | return Err(Error::new("capture group slots exhausted" )); |
| 563 | }; |
| 564 | let start = self.add(State::Capture { target: 0, slot })?; |
| 565 | let inner = self.c(hir)?; |
| 566 | let Some(slot) = slot.checked_add(1) else { |
| 567 | return Err(Error::new("capture group slots exhausted" )); |
| 568 | }; |
| 569 | let end = self.add(State::Capture { target: 0, slot })?; |
| 570 | self.patch(start, inner.start)?; |
| 571 | self.patch(inner.end, end)?; |
| 572 | |
| 573 | Ok(ThompsonRef { start, end }) |
| 574 | } |
| 575 | |
| 576 | /// Compile a concatenation of the sub-expressions yielded by the given |
| 577 | /// iterator. If the iterator yields no elements, then this compiles down |
| 578 | /// to an "empty" state that always matches. |
| 579 | fn c_concat<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
| 580 | where |
| 581 | I: Iterator<Item = Result<ThompsonRef, Error>>, |
| 582 | { |
| 583 | let ThompsonRef { start, mut end } = match it.next() { |
| 584 | Some(result) => result?, |
| 585 | None => return self.c_empty(), |
| 586 | }; |
| 587 | for result in it { |
| 588 | let compiled = result?; |
| 589 | self.patch(end, compiled.start)?; |
| 590 | end = compiled.end; |
| 591 | } |
| 592 | Ok(ThompsonRef { start, end }) |
| 593 | } |
| 594 | |
| 595 | /// Compile an alternation, where each element yielded by the given |
| 596 | /// iterator represents an item in the alternation. If the iterator yields |
| 597 | /// no elements, then this compiles down to a "fail" state. |
| 598 | /// |
| 599 | /// In an alternation, expressions appearing earlier are "preferred" at |
| 600 | /// match time over expressions appearing later. (This is currently always |
| 601 | /// true, as this crate only supports leftmost-first semantics.) |
| 602 | fn c_alternation<I>(&self, mut it: I) -> Result<ThompsonRef, Error> |
| 603 | where |
| 604 | I: Iterator<Item = Result<ThompsonRef, Error>>, |
| 605 | { |
| 606 | let first = match it.next() { |
| 607 | None => return self.c_fail(), |
| 608 | Some(result) => result?, |
| 609 | }; |
| 610 | let second = match it.next() { |
| 611 | None => return Ok(first), |
| 612 | Some(result) => result?, |
| 613 | }; |
| 614 | |
| 615 | let splits = |
| 616 | self.add(State::Splits { targets: vec![], reverse: false })?; |
| 617 | let end = self.add_empty()?; |
| 618 | self.patch(splits, first.start)?; |
| 619 | self.patch(first.end, end)?; |
| 620 | self.patch(splits, second.start)?; |
| 621 | self.patch(second.end, end)?; |
| 622 | for result in it { |
| 623 | let compiled = result?; |
| 624 | self.patch(splits, compiled.start)?; |
| 625 | self.patch(compiled.end, end)?; |
| 626 | } |
| 627 | Ok(ThompsonRef { start: splits, end }) |
| 628 | } |
| 629 | |
| 630 | /// A convenience routine for adding an empty state, also known as an |
| 631 | /// unconditional epsilon transition. These are quite useful for making |
| 632 | /// NFA construction simpler. |
| 633 | /// |
| 634 | /// (In the regex crate, we do a second pass to remove these, but don't |
| 635 | /// bother with that here.) |
| 636 | fn add_empty(&self) -> Result<StateID, Error> { |
| 637 | self.add(State::Goto { target: 0, look: None }) |
| 638 | } |
| 639 | |
| 640 | /// The common implementation of "add a state." It handles the common |
| 641 | /// error cases of state ID exhausting (by owning state ID allocation) and |
| 642 | /// whether the size limit has been exceeded. |
| 643 | fn add(&self, state: State) -> Result<StateID, Error> { |
| 644 | let id = u32::try_from(self.nfa.borrow().states.len()) |
| 645 | .map_err(|_| Error::new("exhausted state IDs, too many states" ))?; |
| 646 | self.nfa.borrow_mut().memory_extra += state.memory_usage(); |
| 647 | self.nfa.borrow_mut().states.push(state); |
| 648 | self.check_size_limit()?; |
| 649 | Ok(id) |
| 650 | } |
| 651 | |
| 652 | /// Add a transition from one state to another. |
| 653 | /// |
| 654 | /// This routine is called "patch" since it is very common to add the |
| 655 | /// states you want, typically with "dummy" state ID transitions, and then |
| 656 | /// "patch" in the real state IDs later. This is because you don't always |
| 657 | /// know all of the necessary state IDs to add because they might not |
| 658 | /// exist yet. |
| 659 | /// |
| 660 | /// # Errors |
| 661 | /// |
| 662 | /// This may error if patching leads to an increase in heap usage beyond |
| 663 | /// the configured size limit. Heap usage only grows when patching adds a |
| 664 | /// new transition (as in the case of a "splits" state). |
| 665 | fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { |
| 666 | let mut new_memory_extra = self.nfa.borrow().memory_extra; |
| 667 | match self.nfa.borrow_mut().states[from.as_usize()] { |
| 668 | State::Char { ref mut target, .. } => { |
| 669 | *target = to; |
| 670 | } |
| 671 | State::Ranges { ref mut target, .. } => { |
| 672 | *target = to; |
| 673 | } |
| 674 | State::Splits { ref mut targets, .. } => { |
| 675 | targets.push(to); |
| 676 | new_memory_extra += size_of::<StateID>(); |
| 677 | } |
| 678 | State::Goto { ref mut target, .. } => { |
| 679 | *target = to; |
| 680 | } |
| 681 | State::Capture { ref mut target, .. } => { |
| 682 | *target = to; |
| 683 | } |
| 684 | State::Fail | State::Match => {} |
| 685 | } |
| 686 | if new_memory_extra != self.nfa.borrow().memory_extra { |
| 687 | self.nfa.borrow_mut().memory_extra = new_memory_extra; |
| 688 | self.check_size_limit()?; |
| 689 | } |
| 690 | Ok(()) |
| 691 | } |
| 692 | |
| 693 | /// Checks that the current heap memory usage of the NFA being compiled |
| 694 | /// doesn't exceed the configured size limit. If it does, an error is |
| 695 | /// returned. |
| 696 | fn check_size_limit(&self) -> Result<(), Error> { |
| 697 | if let Some(limit) = self.config.size_limit { |
| 698 | if self.nfa.borrow().memory_usage() > limit { |
| 699 | return Err(Error::new("compiled regex exceeded size limit" )); |
| 700 | } |
| 701 | } |
| 702 | Ok(()) |
| 703 | } |
| 704 | } |
| 705 | |
| 706 | /// A value that represents the result of compiling a sub-expression of a |
| 707 | /// regex's HIR. Specifically, this represents a sub-graph of the NFA that |
| 708 | /// has an initial state at `start` and a final state at `end`. |
| 709 | #[derive (Clone, Copy, Debug)] |
| 710 | struct ThompsonRef { |
| 711 | start: StateID, |
| 712 | end: StateID, |
| 713 | } |
| 714 | |