| 1 | /*! |
| 2 | An NFA backed Pike VM for executing regex searches with capturing groups. |
| 3 | |
| 4 | This module provides a [`PikeVM`] that works by simulating an NFA and |
| 5 | resolving all spans of capturing groups that participate in a match. |
| 6 | */ |
| 7 | |
#[cfg(feature = "internal-instrument-pikevm")]
| 9 | use core::cell::RefCell; |
| 10 | |
| 11 | use alloc::{vec, vec::Vec}; |
| 12 | |
| 13 | use crate::{ |
| 14 | nfa::thompson::{self, BuildError, State, NFA}, |
| 15 | util::{ |
| 16 | captures::Captures, |
| 17 | empty, iter, |
| 18 | prefilter::Prefilter, |
| 19 | primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, |
| 20 | search::{ |
| 21 | Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span, |
| 22 | }, |
| 23 | sparse_set::SparseSet, |
| 24 | }, |
| 25 | }; |
| 26 | |
| 27 | /// A simple macro for conditionally executing instrumentation logic when |
| 28 | /// the 'trace' log level is enabled. This is a compile-time no-op when the |
| 29 | /// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that |
| 30 | /// this makes it easier to avoid doing extra work when instrumentation isn't |
| 31 | /// enabled. |
| 32 | /// |
| 33 | /// This macro accepts a closure of type `|&mut Counters|`. The closure can |
| 34 | /// then increment counters (or whatever) in accordance with what one wants |
| 35 | /// to track. |
| 36 | macro_rules! instrument { |
| 37 | ($fun:expr) => { |
#[cfg(feature = "internal-instrument-pikevm")]
| 39 | { |
| 40 | let fun: &mut dyn FnMut(&mut Counters) = &mut $fun; |
| 41 | COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut())); |
| 42 | } |
| 43 | }; |
| 44 | } |
| 45 | |
#[cfg(feature = "internal-instrument-pikevm")]
| 47 | std::thread_local! { |
| 48 | /// Effectively global state used to keep track of instrumentation |
| 49 | /// counters. The "proper" way to do this is to thread it through the |
| 50 | /// PikeVM, but it makes the code quite icky. Since this is just a |
| 51 | /// debugging feature, we're content to relegate it to thread local |
| 52 | /// state. When instrumentation is enabled, the counters are reset at the |
| 53 | /// beginning of every search and printed (with the 'trace' log level) at |
| 54 | /// the end of every search. |
| 55 | static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty()); |
| 56 | } |
| 57 | |
| 58 | /// The configuration used for building a [`PikeVM`]. |
| 59 | /// |
| 60 | /// A PikeVM configuration is a simple data object that is typically used with |
| 61 | /// [`Builder::configure`]. It can be cheaply cloned. |
| 62 | /// |
| 63 | /// A default configuration can be created either with `Config::new`, or |
| 64 | /// perhaps more conveniently, with [`PikeVM::config`]. |
#[derive(Clone, Debug, Default)]
| 66 | pub struct Config { |
| 67 | match_kind: Option<MatchKind>, |
| 68 | pre: Option<Option<Prefilter>>, |
| 69 | } |
| 70 | |
| 71 | impl Config { |
| 72 | /// Return a new default PikeVM configuration. |
| 73 | pub fn new() -> Config { |
| 74 | Config::default() |
| 75 | } |
| 76 | |
| 77 | /// Set the desired match semantics. |
| 78 | /// |
| 79 | /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the |
| 80 | /// match semantics of Perl-like regex engines. That is, when multiple |
| 81 | /// patterns would match at the same leftmost position, the pattern that |
| 82 | /// appears first in the concrete syntax is chosen. |
| 83 | /// |
| 84 | /// Currently, the only other kind of match semantics supported is |
| 85 | /// [`MatchKind::All`]. This corresponds to "classical DFA" construction |
| 86 | /// where all possible matches are visited in the NFA by the `PikeVM`. |
| 87 | /// |
| 88 | /// Typically, `All` is used when one wants to execute an overlapping |
| 89 | /// search and `LeftmostFirst` otherwise. In particular, it rarely makes |
| 90 | /// sense to use `All` with the various "leftmost" find routines, since the |
| 91 | /// leftmost routines depend on the `LeftmostFirst` automata construction |
| 92 | /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM` |
| 93 | /// simulating dead states as a way to terminate the search and report a |
/// match. `LeftmostFirst` also supports non-greedy matches using this
/// strategy, whereas `All` does not.
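///
/// # Example
///
/// A minimal sketch of pairing `All` with an overlapping search. (See
/// [`PikeVM::which_overlapping_matches`] for a more complete example.)
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// Input, MatchKind, PatternSet,
/// };
///
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().match_kind(MatchKind::All))
/// .build_many(&[r"foo", r"foobar"])?;
/// let mut cache = re.create_cache();
///
/// let input = Input::new("foobar");
/// let mut patset = PatternSet::new(re.pattern_len());
/// re.which_overlapping_matches(&mut cache, &input, &mut patset);
/// // With 'All' semantics, both patterns are reported as matching.
/// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
/// assert_eq!(vec![0, 1], got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```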
| 96 | pub fn match_kind(mut self, kind: MatchKind) -> Config { |
| 97 | self.match_kind = Some(kind); |
| 98 | self |
| 99 | } |
| 100 | |
| 101 | /// Set a prefilter to be used whenever a start state is entered. |
| 102 | /// |
| 103 | /// A [`Prefilter`] in this context is meant to accelerate searches by |
| 104 | /// looking for literal prefixes that every match for the corresponding |
| 105 | /// pattern (or patterns) must start with. Once a prefilter produces a |
| 106 | /// match, the underlying search routine continues on to try and confirm |
| 107 | /// the match. |
| 108 | /// |
| 109 | /// Be warned that setting a prefilter does not guarantee that the search |
| 110 | /// will be faster. While it's usually a good bet, if the prefilter |
| 111 | /// produces a lot of false positive candidates (i.e., positions matched |
| 112 | /// by the prefilter but not by the regex), then the overall result can |
| 113 | /// be slower than if you had just executed the regex engine without any |
| 114 | /// prefilters. |
| 115 | /// |
| 116 | /// By default no prefilter is set. |
| 117 | /// |
| 118 | /// # Example |
| 119 | /// |
| 120 | /// ``` |
| 121 | /// use regex_automata::{ |
| 122 | /// nfa::thompson::pikevm::PikeVM, |
| 123 | /// util::prefilter::Prefilter, |
| 124 | /// Input, Match, MatchKind, |
| 125 | /// }; |
| 126 | /// |
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().prefilter(pre))
/// .build(r"(foo|bar)[a-z]+")?;
/// let mut cache = re.create_cache();
/// let input = Input::new("foo1 barfox bar");
| 133 | /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); |
| 134 | /// |
| 135 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 136 | /// ``` |
| 137 | /// |
| 138 | /// Be warned though that an incorrect prefilter can lead to incorrect |
| 139 | /// results! |
| 140 | /// |
| 141 | /// ``` |
| 142 | /// use regex_automata::{ |
| 143 | /// nfa::thompson::pikevm::PikeVM, |
| 144 | /// util::prefilter::Prefilter, |
| 145 | /// Input, HalfMatch, MatchKind, |
| 146 | /// }; |
| 147 | /// |
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
/// let re = PikeVM::builder()
/// .configure(PikeVM::config().prefilter(pre))
/// .build(r"(foo|bar)[a-z]+")?;
/// let mut cache = re.create_cache();
/// let input = Input::new("foo1 barfox bar");
| 154 | /// // No match reported even though there clearly is one! |
| 155 | /// assert_eq!(None, re.find(&mut cache, input)); |
| 156 | /// |
| 157 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 158 | /// ``` |
| 159 | pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config { |
| 160 | self.pre = Some(pre); |
| 161 | self |
| 162 | } |
| 163 | |
| 164 | /// Returns the match semantics set in this configuration. |
| 165 | pub fn get_match_kind(&self) -> MatchKind { |
| 166 | self.match_kind.unwrap_or(MatchKind::LeftmostFirst) |
| 167 | } |
| 168 | |
/// Returns the prefilter set in this configuration, if one is set at all.
| 170 | pub fn get_prefilter(&self) -> Option<&Prefilter> { |
| 171 | self.pre.as_ref().unwrap_or(&None).as_ref() |
| 172 | } |
| 173 | |
| 174 | /// Overwrite the default configuration such that the options in `o` are |
| 175 | /// always used. If an option in `o` is not set, then the corresponding |
| 176 | /// option in `self` is used. If it's not set in `self` either, then it |
| 177 | /// remains not set. |
| 178 | pub(crate) fn overwrite(&self, o: Config) -> Config { |
| 179 | Config { |
| 180 | match_kind: o.match_kind.or(self.match_kind), |
| 181 | pre: o.pre.or_else(|| self.pre.clone()), |
| 182 | } |
| 183 | } |
| 184 | } |
| 185 | |
| 186 | /// A builder for a `PikeVM`. |
| 187 | /// |
| 188 | /// This builder permits configuring options for the syntax of a pattern, |
| 189 | /// the NFA construction and the `PikeVM` construction. This builder is |
| 190 | /// different from a general purpose regex builder in that it permits fine |
| 191 | /// grain configuration of the construction process. The trade off for this is |
| 192 | /// complexity, and the possibility of setting a configuration that might not |
| 193 | /// make sense. For example, there are two different UTF-8 modes: |
| 194 | /// |
| 195 | /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) |
| 196 | /// controls whether the pattern itself can contain sub-expressions that match |
| 197 | /// invalid UTF-8. |
| 198 | /// * [`thompson::Config::utf8`] controls whether empty matches that split a |
| 199 | /// Unicode codepoint are reported or not. |
| 200 | /// |
| 201 | /// Generally speaking, callers will want to either enable all of these or |
| 202 | /// disable all of these. |
| 203 | /// |
| 204 | /// # Example |
| 205 | /// |
| 206 | /// This example shows how to disable UTF-8 mode in the syntax and the regex |
| 207 | /// itself. This is generally what you want for matching on arbitrary bytes. |
| 208 | /// |
| 209 | /// ``` |
| 210 | /// use regex_automata::{ |
| 211 | /// nfa::thompson::{self, pikevm::PikeVM}, |
| 212 | /// util::syntax, |
| 213 | /// Match, |
| 214 | /// }; |
| 215 | /// |
| 216 | /// let re = PikeVM::builder() |
| 217 | /// .syntax(syntax::Config::new().utf8(false)) |
| 218 | /// .thompson(thompson::Config::new().utf8(false)) |
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
| 223 | /// let expected = Some(Match::must(0, 1..9)); |
| 224 | /// let got = re.find_iter(&mut cache, haystack).next(); |
| 225 | /// assert_eq!(expected, got); |
| 226 | /// // Notice that `(?-u:[^b])` matches invalid UTF-8, |
| 227 | /// // but the subsequent `.*` does not! Disabling UTF-8 |
| 228 | /// // on the syntax permits this. |
| 229 | /// // |
| 230 | /// // N.B. This example does not show the impact of |
| 231 | /// // disabling UTF-8 mode on a PikeVM Config, since that |
| 232 | /// // only impacts regexes that can produce matches of |
| 233 | /// // length 0. |
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
| 235 | /// |
| 236 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 237 | /// ``` |
#[derive(Clone, Debug)]
| 239 | pub struct Builder { |
| 240 | config: Config, |
#[cfg(feature = "syntax")]
| 242 | thompson: thompson::Compiler, |
| 243 | } |
| 244 | |
| 245 | impl Builder { |
| 246 | /// Create a new PikeVM builder with its default configuration. |
| 247 | pub fn new() -> Builder { |
| 248 | Builder { |
| 249 | config: Config::default(), |
#[cfg(feature = "syntax")]
| 251 | thompson: thompson::Compiler::new(), |
| 252 | } |
| 253 | } |
| 254 | |
| 255 | /// Build a `PikeVM` from the given pattern. |
| 256 | /// |
| 257 | /// If there was a problem parsing or compiling the pattern, then an error |
| 258 | /// is returned. |
#[cfg(feature = "syntax")]
| 260 | pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> { |
| 261 | self.build_many(&[pattern]) |
| 262 | } |
| 263 | |
| 264 | /// Build a `PikeVM` from the given patterns. |
#[cfg(feature = "syntax")]
| 266 | pub fn build_many<P: AsRef<str>>( |
| 267 | &self, |
| 268 | patterns: &[P], |
| 269 | ) -> Result<PikeVM, BuildError> { |
| 270 | let nfa = self.thompson.build_many(patterns)?; |
| 271 | self.build_from_nfa(nfa) |
| 272 | } |
| 273 | |
| 274 | /// Build a `PikeVM` directly from its NFA. |
| 275 | /// |
| 276 | /// Note that when using this method, any configuration that applies to the |
| 277 | /// construction of the NFA itself will of course be ignored, since the NFA |
| 278 | /// given here is already built. |
| 279 | pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> { |
| 280 | nfa.look_set_any().available().map_err(BuildError::word)?; |
| 281 | Ok(PikeVM { config: self.config.clone(), nfa }) |
| 282 | } |
| 283 | |
| 284 | /// Apply the given `PikeVM` configuration options to this builder. |
| 285 | pub fn configure(&mut self, config: Config) -> &mut Builder { |
| 286 | self.config = self.config.overwrite(config); |
| 287 | self |
| 288 | } |
| 289 | |
| 290 | /// Set the syntax configuration for this builder using |
| 291 | /// [`syntax::Config`](crate::util::syntax::Config). |
| 292 | /// |
| 293 | /// This permits setting things like case insensitivity, Unicode and multi |
| 294 | /// line mode. |
| 295 | /// |
| 296 | /// These settings only apply when constructing a PikeVM directly from a |
| 297 | /// pattern. |
#[cfg(feature = "syntax")]
| 299 | pub fn syntax( |
| 300 | &mut self, |
| 301 | config: crate::util::syntax::Config, |
| 302 | ) -> &mut Builder { |
| 303 | self.thompson.syntax(config); |
| 304 | self |
| 305 | } |
| 306 | |
| 307 | /// Set the Thompson NFA configuration for this builder using |
| 308 | /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). |
| 309 | /// |
| 310 | /// This permits setting things like if additional time should be spent |
| 311 | /// shrinking the size of the NFA. |
| 312 | /// |
| 313 | /// These settings only apply when constructing a PikeVM directly from a |
| 314 | /// pattern. |
#[cfg(feature = "syntax")]
| 316 | pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { |
| 317 | self.thompson.configure(config); |
| 318 | self |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | /// A virtual machine for executing regex searches with capturing groups. |
| 323 | /// |
| 324 | /// # Infallible APIs |
| 325 | /// |
| 326 | /// Unlike most other regex engines in this crate, a `PikeVM` never returns an |
| 327 | /// error at search time. It supports all [`Anchored`] configurations, never |
| 328 | /// quits and works on haystacks of arbitrary length. |
| 329 | /// |
| 330 | /// There are two caveats to mention though: |
| 331 | /// |
| 332 | /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], |
| 333 | /// then the PikeVM will report "no match." This is consistent with all other |
/// regex engines in this crate. (See the sketch after this list.)
| 335 | /// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`] |
| 336 | /// that has insufficient capacity to store all valid pattern IDs, then if a |
| 337 | /// match occurs for a `PatternID` that cannot be inserted, it is silently |
| 338 | /// dropped as if it did not match. |
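///
/// A minimal sketch of the first caveat, where an invalid pattern ID simply
/// reports no match:
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// Anchored, Input, PatternID,
/// };
///
/// let re = PikeVM::new("a")?;
/// let mut cache = re.create_cache();
/// // This PikeVM has only one pattern (ID 0), so pattern ID 5 is invalid.
/// let input = Input::new("a").anchored(Anchored::Pattern(PatternID::must(5)));
/// assert_eq!(None, re.find(&mut cache, input));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```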
| 339 | /// |
| 340 | /// # Advice |
| 341 | /// |
| 342 | /// The `PikeVM` is generally the most "powerful" regex engine in this crate. |
| 343 | /// "Powerful" in this context means that it can handle any regular expression |
/// that is parseable by `regex-syntax` and any size haystack. Regrettably,
| 345 | /// the `PikeVM` is also simultaneously often the _slowest_ regex engine in |
| 346 | /// practice. This results in an annoying situation where one generally tries |
| 347 | /// to pick any other regex engine (or perhaps none at all) before being |
| 348 | /// forced to fall back to a `PikeVM`. |
| 349 | /// |
| 350 | /// For example, a common strategy for dealing with capturing groups is to |
| 351 | /// actually look for the overall match of the regex using a faster regex |
| 352 | /// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall |
| 353 | /// match is found, one can then run the `PikeVM` on just the match span to |
| 354 | /// find the spans of the capturing groups. In this way, the faster regex |
| 355 | /// engine does the majority of the work, while the `PikeVM` only lends its |
| 356 | /// power in a more limited role. |
| 357 | /// |
| 358 | /// Unfortunately, this isn't always possible because the faster regex engines |
| 359 | /// don't support all of the regex features in `regex-syntax`. This notably |
| 360 | /// includes (and is currently limited to) Unicode word boundaries. So if |
| 361 | /// your pattern has Unicode word boundaries, you typically can't use a |
| 362 | /// DFA-based regex engine at all (unless you [enable heuristic support for |
| 363 | /// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass |
| 364 | /// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for |
| 365 | /// anchored searches only, but in a cruel sort of joke, many Unicode features |
| 366 | /// tend to result in making the regex _not_ one-pass.) |
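///
/// As a rough sketch of that two-pass strategy (using the `PikeVM` itself
/// as a stand-in for the faster engine that would find the overall match):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input, Span};
///
/// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "the date is 2010-03-14, actually";
///
/// // Pass 1: find the overall match span. (In practice, this is where a
/// // faster engine such as the lazy DFA would be used.)
/// let m = re.find(&mut cache, haystack).unwrap();
/// // Pass 2: resolve capturing groups only within that span.
/// let input = Input::new(haystack).range(m.range());
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(Some(Span::from(12..16)), caps.get_group(1));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```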
| 367 | /// |
| 368 | /// # Example |
| 369 | /// |
| 370 | /// This example shows that the `PikeVM` implements Unicode word boundaries |
| 371 | /// correctly by default. |
| 372 | /// |
| 373 | /// ``` |
| 374 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 375 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 376 | /// |
/// let re = PikeVM::new(r"\b\w+\b")?;
/// let mut cache = re.create_cache();
///
/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс");
| 381 | /// assert_eq!(Some(Match::must(0, 0..12)), it.next()); |
| 382 | /// assert_eq!(Some(Match::must(0, 13..23)), it.next()); |
| 383 | /// assert_eq!(None, it.next()); |
| 384 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 385 | /// ``` |
#[derive(Clone, Debug)]
| 387 | pub struct PikeVM { |
| 388 | config: Config, |
| 389 | nfa: NFA, |
| 390 | } |
| 391 | |
| 392 | impl PikeVM { |
| 393 | /// Parse the given regular expression using the default configuration and |
| 394 | /// return the corresponding `PikeVM`. |
| 395 | /// |
| 396 | /// If you want a non-default configuration, then use the [`Builder`] to |
| 397 | /// set your own configuration. |
| 398 | /// |
| 399 | /// # Example |
| 400 | /// |
| 401 | /// ``` |
| 402 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 403 | /// |
/// let re = PikeVM::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 3..14)),
/// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(),
| 409 | /// ); |
| 410 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 411 | /// ``` |
#[cfg(feature = "syntax")]
| 413 | pub fn new(pattern: &str) -> Result<PikeVM, BuildError> { |
| 414 | PikeVM::builder().build(pattern) |
| 415 | } |
| 416 | |
| 417 | /// Like `new`, but parses multiple patterns into a single "multi regex." |
| 418 | /// This similarly uses the default regex configuration. |
| 419 | /// |
| 420 | /// # Example |
| 421 | /// |
| 422 | /// ``` |
| 423 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 424 | /// |
/// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?;
/// let mut cache = re.create_cache();
///
/// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
| 429 | /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); |
| 430 | /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); |
| 431 | /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); |
| 432 | /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); |
| 433 | /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); |
| 434 | /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); |
| 435 | /// assert_eq!(None, it.next()); |
| 436 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 437 | /// ``` |
#[cfg(feature = "syntax")]
| 439 | pub fn new_many<P: AsRef<str>>( |
| 440 | patterns: &[P], |
| 441 | ) -> Result<PikeVM, BuildError> { |
| 442 | PikeVM::builder().build_many(patterns) |
| 443 | } |
| 444 | |
| 445 | /// Like `new`, but builds a PikeVM directly from an NFA. This is useful |
| 446 | /// if you already have an NFA, or even if you hand-assembled the NFA. |
| 447 | /// |
| 448 | /// # Example |
| 449 | /// |
| 450 | /// This shows how to hand assemble a regular expression via its HIR, |
| 451 | /// compile an NFA from it and build a PikeVM from the NFA. |
| 452 | /// |
| 453 | /// ``` |
| 454 | /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; |
| 455 | /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; |
| 456 | /// |
| 457 | /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ |
/// ClassBytesRange::new(b'0', b'9'),
/// ClassBytesRange::new(b'A', b'Z'),
/// ClassBytesRange::new(b'_', b'_'),
/// ClassBytesRange::new(b'a', b'z'),
| 462 | /// ]))); |
| 463 | /// |
| 464 | /// let config = NFA::config().nfa_size_limit(Some(1_000)); |
| 465 | /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; |
| 466 | /// |
| 467 | /// let re = PikeVM::new_from_nfa(nfa)?; |
| 468 | /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); |
| 469 | /// let expected = Some(Match::must(0, 3..4)); |
/// re.captures(&mut cache, "!@#A#@!", &mut caps);
| 471 | /// assert_eq!(expected, caps.get_match()); |
| 472 | /// |
| 473 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 474 | /// ``` |
| 475 | pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> { |
| 476 | PikeVM::builder().build_from_nfa(nfa) |
| 477 | } |
| 478 | |
| 479 | /// Create a new `PikeVM` that matches every input. |
| 480 | /// |
| 481 | /// # Example |
| 482 | /// |
| 483 | /// ``` |
| 484 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 485 | /// |
| 486 | /// let re = PikeVM::always_match()?; |
| 487 | /// let mut cache = re.create_cache(); |
| 488 | /// |
| 489 | /// let expected = Match::must(0, 0..0); |
/// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next());
/// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next());
| 492 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 493 | /// ``` |
| 494 | pub fn always_match() -> Result<PikeVM, BuildError> { |
| 495 | let nfa = thompson::NFA::always_match(); |
| 496 | PikeVM::new_from_nfa(nfa) |
| 497 | } |
| 498 | |
| 499 | /// Create a new `PikeVM` that never matches any input. |
| 500 | /// |
| 501 | /// # Example |
| 502 | /// |
| 503 | /// ``` |
| 504 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 505 | /// |
| 506 | /// let re = PikeVM::never_match()?; |
| 507 | /// let mut cache = re.create_cache(); |
| 508 | /// |
/// assert_eq!(None, re.find_iter(&mut cache, "").next());
/// assert_eq!(None, re.find_iter(&mut cache, "foo").next());
| 511 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 512 | /// ``` |
| 513 | pub fn never_match() -> Result<PikeVM, BuildError> { |
| 514 | let nfa = thompson::NFA::never_match(); |
| 515 | PikeVM::new_from_nfa(nfa) |
| 516 | } |
| 517 | |
| 518 | /// Return a default configuration for a `PikeVM`. |
| 519 | /// |
| 520 | /// This is a convenience routine to avoid needing to import the `Config` |
| 521 | /// type when customizing the construction of a `PikeVM`. |
| 522 | /// |
| 523 | /// # Example |
| 524 | /// |
| 525 | /// This example shows how to disable UTF-8 mode. When UTF-8 mode is |
| 526 | /// disabled, zero-width matches that split a codepoint are allowed. |
| 527 | /// Otherwise they are never reported. |
| 528 | /// |
| 529 | /// In the code below, notice that `""` is permitted to match positions |
| 530 | /// that split the encoding of a codepoint. |
| 531 | /// |
| 532 | /// ``` |
| 533 | /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match}; |
| 534 | /// |
| 535 | /// let re = PikeVM::builder() |
| 536 | /// .thompson(thompson::Config::new().utf8(false)) |
/// .build(r"")?;
/// let mut cache = re.create_cache();
///
/// let haystack = "a☃z";
| 541 | /// let mut it = re.find_iter(&mut cache, haystack); |
| 542 | /// assert_eq!(Some(Match::must(0, 0..0)), it.next()); |
| 543 | /// assert_eq!(Some(Match::must(0, 1..1)), it.next()); |
| 544 | /// assert_eq!(Some(Match::must(0, 2..2)), it.next()); |
| 545 | /// assert_eq!(Some(Match::must(0, 3..3)), it.next()); |
| 546 | /// assert_eq!(Some(Match::must(0, 4..4)), it.next()); |
| 547 | /// assert_eq!(Some(Match::must(0, 5..5)), it.next()); |
| 548 | /// assert_eq!(None, it.next()); |
| 549 | /// |
| 550 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 551 | /// ``` |
| 552 | pub fn config() -> Config { |
| 553 | Config::new() |
| 554 | } |
| 555 | |
| 556 | /// Return a builder for configuring the construction of a `PikeVM`. |
| 557 | /// |
| 558 | /// This is a convenience routine to avoid needing to import the |
| 559 | /// [`Builder`] type in common cases. |
| 560 | /// |
| 561 | /// # Example |
| 562 | /// |
| 563 | /// This example shows how to use the builder to disable UTF-8 mode |
| 564 | /// everywhere. |
| 565 | /// |
| 566 | /// ``` |
| 567 | /// use regex_automata::{ |
| 568 | /// nfa::thompson::{self, pikevm::PikeVM}, |
| 569 | /// util::syntax, |
| 570 | /// Match, |
| 571 | /// }; |
| 572 | /// |
| 573 | /// let re = PikeVM::builder() |
| 574 | /// .syntax(syntax::Config::new().utf8(false)) |
| 575 | /// .thompson(thompson::Config::new().utf8(false)) |
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
| 580 | /// let expected = Some(Match::must(0, 1..9)); |
| 581 | /// re.captures(&mut cache, haystack, &mut caps); |
| 582 | /// assert_eq!(expected, caps.get_match()); |
| 583 | /// |
| 584 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 585 | /// ``` |
| 586 | pub fn builder() -> Builder { |
| 587 | Builder::new() |
| 588 | } |
| 589 | |
| 590 | /// Create a new empty set of capturing groups that is guaranteed to be |
| 591 | /// valid for the search APIs on this `PikeVM`. |
| 592 | /// |
| 593 | /// A `Captures` value created for a specific `PikeVM` cannot be used with |
| 594 | /// any other `PikeVM`. |
| 595 | /// |
| 596 | /// This is a convenience function for [`Captures::all`]. See the |
| 597 | /// [`Captures`] documentation for an explanation of its alternative |
| 598 | /// constructors that permit the `PikeVM` to do less work during a search, |
| 599 | /// and thus might make it faster. |
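///
/// # Example
///
/// A minimal sketch of one such alternative, [`Captures::matches`], which
/// only tracks the overall match span (and which pattern matched) rather
/// than every capturing group:
///
/// ```
/// use regex_automata::{
/// nfa::thompson::pikevm::PikeVM,
/// util::captures::Captures,
/// Match,
/// };
///
/// let re = PikeVM::new(r"foo([0-9]+)")?;
/// let mut cache = re.create_cache();
/// let mut caps = Captures::matches(re.get_nfa().group_info().clone());
/// re.captures(&mut cache, "foo123", &mut caps);
/// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```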
| 600 | pub fn create_captures(&self) -> Captures { |
| 601 | Captures::all(self.get_nfa().group_info().clone()) |
| 602 | } |
| 603 | |
| 604 | /// Create a new cache for this `PikeVM`. |
| 605 | /// |
| 606 | /// The cache returned should only be used for searches for this |
| 607 | /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then |
| 608 | /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently, |
| 609 | /// [`PikeVM::reset_cache`]). |
| 610 | pub fn create_cache(&self) -> Cache { |
| 611 | Cache::new(self) |
| 612 | } |
| 613 | |
/// Reset the given cache such that it can be used for searching with this
/// `PikeVM` (and only this `PikeVM`).
| 616 | /// |
| 617 | /// A cache reset permits reusing memory already allocated in this cache |
| 618 | /// with a different `PikeVM`. |
| 619 | /// |
| 620 | /// # Example |
| 621 | /// |
| 622 | /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
| 623 | /// |
| 624 | /// ``` |
| 625 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 626 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 627 | /// |
/// let re1 = PikeVM::new(r"\w")?;
/// let re2 = PikeVM::new(r"\W")?;
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
/// Some(Match::must(0, 0..2)),
/// re1.find_iter(&mut cache, "Δ").next(),
| 635 | /// ); |
| 636 | /// |
| 637 | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
| 638 | /// // incorrect results. In order to re-purpose the cache, we must reset |
| 639 | /// // it with the PikeVM we'd like to use it with. |
| 640 | /// // |
| 641 | /// // Similarly, after this reset, using the cache with 're1' is also not |
| 642 | /// // allowed. |
| 643 | /// re2.reset_cache(&mut cache); |
| 644 | /// assert_eq!( |
| 645 | /// Some(Match::must(0, 0..3)), |
/// re2.find_iter(&mut cache, "☃").next(),
| 647 | /// ); |
| 648 | /// |
| 649 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 650 | /// ``` |
| 651 | pub fn reset_cache(&self, cache: &mut Cache) { |
| 652 | cache.reset(self); |
| 653 | } |
| 654 | |
| 655 | /// Returns the total number of patterns compiled into this `PikeVM`. |
| 656 | /// |
| 657 | /// In the case of a `PikeVM` that contains no patterns, this returns `0`. |
| 658 | /// |
| 659 | /// # Example |
| 660 | /// |
| 661 | /// This example shows the pattern length for a `PikeVM` that never |
| 662 | /// matches: |
| 663 | /// |
| 664 | /// ``` |
| 665 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 666 | /// |
| 667 | /// let re = PikeVM::never_match()?; |
| 668 | /// assert_eq!(re.pattern_len(), 0); |
| 669 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 670 | /// ``` |
| 671 | /// |
| 672 | /// And another example for a `PikeVM` that matches at every position: |
| 673 | /// |
| 674 | /// ``` |
| 675 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 676 | /// |
| 677 | /// let re = PikeVM::always_match()?; |
| 678 | /// assert_eq!(re.pattern_len(), 1); |
| 679 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 680 | /// ``` |
| 681 | /// |
| 682 | /// And finally, a `PikeVM` that was constructed from multiple patterns: |
| 683 | /// |
| 684 | /// ``` |
| 685 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 686 | /// |
/// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
| 688 | /// assert_eq!(re.pattern_len(), 3); |
| 689 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 690 | /// ``` |
| 691 | pub fn pattern_len(&self) -> usize { |
| 692 | self.nfa.pattern_len() |
| 693 | } |
| 694 | |
| 695 | /// Return the config for this `PikeVM`. |
#[inline]
| 697 | pub fn get_config(&self) -> &Config { |
| 698 | &self.config |
| 699 | } |
| 700 | |
| 701 | /// Returns a reference to the underlying NFA. |
#[inline]
| 703 | pub fn get_nfa(&self) -> &NFA { |
| 704 | &self.nfa |
| 705 | } |
| 706 | } |
| 707 | |
| 708 | impl PikeVM { |
| 709 | /// Returns true if and only if this `PikeVM` matches the given haystack. |
| 710 | /// |
| 711 | /// This routine may short circuit if it knows that scanning future |
| 712 | /// input will never lead to a different result. In particular, if the |
| 713 | /// underlying NFA enters a match state, then this routine will return |
| 714 | /// `true` immediately without inspecting any future input. (Consider how |
| 715 | /// this might make a difference given the regex `a+` on the haystack |
| 716 | /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, |
| 717 | /// but routines like `find` need to continue searching because `+` is |
| 718 | /// greedy by default.) |
| 719 | /// |
| 720 | /// # Example |
| 721 | /// |
| 722 | /// This shows basic usage: |
| 723 | /// |
| 724 | /// ``` |
| 725 | /// use regex_automata::nfa::thompson::pikevm::PikeVM; |
| 726 | /// |
/// let re = PikeVM::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
///
/// assert!(re.is_match(&mut cache, "foo12345bar"));
/// assert!(!re.is_match(&mut cache, "foobar"));
| 732 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 733 | /// ``` |
| 734 | /// |
| 735 | /// # Example: consistency with search APIs |
| 736 | /// |
| 737 | /// `is_match` is guaranteed to return `true` whenever `find` returns a |
| 738 | /// match. This includes searches that are executed entirely within a |
| 739 | /// codepoint: |
| 740 | /// |
| 741 | /// ``` |
| 742 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; |
| 743 | /// |
/// let re = PikeVM::new("a*")?;
/// let mut cache = re.create_cache();
///
/// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
| 748 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 749 | /// ``` |
| 750 | /// |
| 751 | /// Notice that when UTF-8 mode is disabled, then the above reports a |
| 752 | /// match because the restriction against zero-width matches that split a |
| 753 | /// codepoint has been lifted: |
| 754 | /// |
| 755 | /// ``` |
| 756 | /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; |
| 757 | /// |
| 758 | /// let re = PikeVM::builder() |
| 759 | /// .thompson(NFA::config().utf8(false)) |
/// .build("a*")?;
/// let mut cache = re.create_cache();
///
/// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
| 764 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 765 | /// ``` |
#[inline]
| 767 | pub fn is_match<'h, I: Into<Input<'h>>>( |
| 768 | &self, |
| 769 | cache: &mut Cache, |
| 770 | input: I, |
| 771 | ) -> bool { |
| 772 | let input = input.into().earliest(true); |
| 773 | self.search_slots(cache, &input, &mut []).is_some() |
| 774 | } |
| 775 | |
| 776 | /// Executes a leftmost forward search and returns a `Match` if one exists. |
| 777 | /// |
| 778 | /// This routine only includes the overall match span. To get access to the |
| 779 | /// individual spans of each capturing group, use [`PikeVM::captures`]. |
| 780 | /// |
| 781 | /// # Example |
| 782 | /// |
| 783 | /// Leftmost first match semantics corresponds to the match with the |
| 784 | /// smallest starting offset, but where the end offset is determined by |
| 785 | /// preferring earlier branches in the original regular expression. For |
| 786 | /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` |
| 787 | /// will match `Samwise` in `Samwise`. |
| 788 | /// |
| 789 | /// Generally speaking, the "leftmost first" match is how most backtracking |
| 790 | /// regular expressions tend to work. This is in contrast to POSIX-style |
| 791 | /// regular expressions that yield "leftmost longest" matches. Namely, |
| 792 | /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using |
| 793 | /// leftmost longest semantics. (This crate does not currently support |
| 794 | /// leftmost longest semantics.) |
| 795 | /// |
| 796 | /// ``` |
| 797 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 798 | /// |
/// let re = PikeVM::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// let expected = Match::must(0, 0..8);
/// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the leftmost first match semantics demand that we find the earliest
/// // match that prefers earlier parts of the pattern over later parts.
/// let re = PikeVM::new("abc|a")?;
/// let mut cache = re.create_cache();
/// let expected = Match::must(0, 0..3);
/// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
| 811 | /// |
| 812 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 813 | /// ``` |
#[inline]
| 815 | pub fn find<'h, I: Into<Input<'h>>>( |
| 816 | &self, |
| 817 | cache: &mut Cache, |
| 818 | input: I, |
| 819 | ) -> Option<Match> { |
| 820 | let input = input.into(); |
| 821 | if self.get_nfa().pattern_len() == 1 { |
| 822 | let mut slots = [None, None]; |
| 823 | let pid = self.search_slots(cache, &input, &mut slots)?; |
| 824 | let start = slots[0]?.get(); |
| 825 | let end = slots[1]?.get(); |
| 826 | return Some(Match::new(pid, Span { start, end })); |
| 827 | } |
| 828 | let ginfo = self.get_nfa().group_info(); |
| 829 | let slots_len = ginfo.implicit_slot_len(); |
| 830 | let mut slots = vec![None; slots_len]; |
| 831 | let pid = self.search_slots(cache, &input, &mut slots)?; |
| 832 | let start = slots[pid.as_usize() * 2]?.get(); |
| 833 | let end = slots[pid.as_usize() * 2 + 1]?.get(); |
| 834 | Some(Match::new(pid, Span { start, end })) |
| 835 | } |
| 836 | |
| 837 | /// Executes a leftmost forward search and writes the spans of capturing |
| 838 | /// groups that participated in a match into the provided [`Captures`] |
| 839 | /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
| 840 | /// to return `false`. |
| 841 | /// |
| 842 | /// # Example |
| 843 | /// |
| 844 | /// ``` |
| 845 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
| 846 | /// |
/// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "2010-03-14", &mut caps);
| 851 | /// assert!(caps.is_match()); |
| 852 | /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); |
| 853 | /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); |
| 854 | /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); |
| 855 | /// |
| 856 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 857 | /// ``` |
#[inline]
| 859 | pub fn captures<'h, I: Into<Input<'h>>>( |
| 860 | &self, |
| 861 | cache: &mut Cache, |
| 862 | input: I, |
| 863 | caps: &mut Captures, |
| 864 | ) { |
| 865 | self.search(cache, &input.into(), caps) |
| 866 | } |
| 867 | |
| 868 | /// Returns an iterator over all non-overlapping leftmost matches in the |
| 869 | /// given bytes. If no match exists, then the iterator yields no elements. |
| 870 | /// |
| 871 | /// # Example |
| 872 | /// |
| 873 | /// ``` |
| 874 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 875 | /// |
/// let re = PikeVM::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
| 880 | /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); |
| 881 | /// assert_eq!(matches, vec![ |
| 882 | /// Match::must(0, 0..4), |
| 883 | /// Match::must(0, 5..10), |
| 884 | /// Match::must(0, 11..17), |
| 885 | /// ]); |
| 886 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 887 | /// ``` |
#[inline]
| 889 | pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
| 890 | &'r self, |
| 891 | cache: &'c mut Cache, |
| 892 | input: I, |
| 893 | ) -> FindMatches<'r, 'c, 'h> { |
| 894 | let caps = Captures::matches(self.get_nfa().group_info().clone()); |
| 895 | let it = iter::Searcher::new(input.into()); |
| 896 | FindMatches { re: self, cache, caps, it } |
| 897 | } |
| 898 | |
| 899 | /// Returns an iterator over all non-overlapping `Captures` values. If no |
| 900 | /// match exists, then the iterator yields no elements. |
| 901 | /// |
| 902 | /// This yields the same matches as [`PikeVM::find_iter`], but it includes |
| 903 | /// the spans of all capturing groups that participate in each match. |
| 904 | /// |
| 905 | /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for |
| 906 | /// how to correctly iterate over all matches in a haystack while avoiding |
| 907 | /// the creation of a new `Captures` value for every match. (Which you are |
| 908 | /// forced to do with an `Iterator`.) |
| 909 | /// |
| 910 | /// # Example |
| 911 | /// |
| 912 | /// ``` |
| 913 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; |
| 914 | /// |
/// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
/// let matches: Vec<Span> = re
/// .captures_iter(&mut cache, text)
/// // The unwrap is OK since 'numbers' matches if the pattern matches.
/// .map(|caps| caps.get_group_by_name("numbers").unwrap())
| 923 | /// .collect(); |
| 924 | /// assert_eq!(matches, vec![ |
| 925 | /// Span::from(3..4), |
| 926 | /// Span::from(8..10), |
| 927 | /// Span::from(14..17), |
| 928 | /// ]); |
| 929 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 930 | /// ``` |
#[inline]
| 932 | pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>( |
| 933 | &'r self, |
| 934 | cache: &'c mut Cache, |
| 935 | input: I, |
| 936 | ) -> CapturesMatches<'r, 'c, 'h> { |
| 937 | let caps = self.create_captures(); |
| 938 | let it = iter::Searcher::new(input.into()); |
| 939 | CapturesMatches { re: self, cache, caps, it } |
| 940 | } |
| 941 | } |
| 942 | |
| 943 | impl PikeVM { |
| 944 | /// Executes a leftmost forward search and writes the spans of capturing |
| 945 | /// groups that participated in a match into the provided [`Captures`] |
| 946 | /// value. If no match was found, then [`Captures::is_match`] is guaranteed |
| 947 | /// to return `false`. |
| 948 | /// |
| 949 | /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` |
| 950 | /// instead of an `Into<Input>`. |
| 951 | /// |
| 952 | /// # Example: specific pattern search |
| 953 | /// |
| 954 | /// This example shows how to build a multi-PikeVM that permits searching |
| 955 | /// for specific patterns. |
| 956 | /// |
| 957 | /// ``` |
| 958 | /// use regex_automata::{ |
| 959 | /// nfa::thompson::pikevm::PikeVM, |
| 960 | /// Anchored, Match, PatternID, Input, |
| 961 | /// }; |
| 962 | /// |
/// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "foo123";
| 966 | /// |
| 967 | /// // Since we are using the default leftmost-first match and both |
| 968 | /// // patterns match at the same starting position, only the first pattern |
| 969 | /// // will be returned in this case when doing a search for any of the |
| 970 | /// // patterns. |
| 971 | /// let expected = Some(Match::must(0, 0..6)); |
| 972 | /// re.search(&mut cache, &Input::new(haystack), &mut caps); |
| 973 | /// assert_eq!(expected, caps.get_match()); |
| 974 | /// |
| 975 | /// // But if we want to check whether some other pattern matches, then we |
| 976 | /// // can provide its pattern ID. |
| 977 | /// let expected = Some(Match::must(1, 0..6)); |
| 978 | /// let input = Input::new(haystack) |
| 979 | /// .anchored(Anchored::Pattern(PatternID::must(1))); |
| 980 | /// re.search(&mut cache, &input, &mut caps); |
| 981 | /// assert_eq!(expected, caps.get_match()); |
| 982 | /// |
| 983 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 984 | /// ``` |
| 985 | /// |
| 986 | /// # Example: specifying the bounds of a search |
| 987 | /// |
| 988 | /// This example shows how providing the bounds of a search can produce |
| 989 | /// different results than simply sub-slicing the haystack. |
| 990 | /// |
| 991 | /// ``` |
| 992 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 993 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; |
| 994 | /// |
/// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "foo123bar";
| 998 | /// |
| 999 | /// // Since we sub-slice the haystack, the search doesn't know about |
| 1000 | /// // the larger context and assumes that `123` is surrounded by word |
| 1001 | /// // boundaries. And of course, the match position is reported relative |
| 1002 | /// // to the sub-slice as well, which means we get `0..3` instead of |
| 1003 | /// // `3..6`. |
| 1004 | /// let expected = Some(Match::must(0, 0..3)); |
| 1005 | /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); |
| 1006 | /// assert_eq!(expected, caps.get_match()); |
| 1007 | /// |
| 1008 | /// // But if we provide the bounds of the search within the context of the |
| 1009 | /// // entire haystack, then the search can take the surrounding context |
| 1010 | /// // into account. (And if we did find a match, it would be reported |
| 1011 | /// // as a valid offset into `haystack` instead of its sub-slice.) |
| 1012 | /// let expected = None; |
| 1013 | /// let input = Input::new(haystack).range(3..6); |
| 1014 | /// re.search(&mut cache, &input, &mut caps); |
| 1015 | /// assert_eq!(expected, caps.get_match()); |
| 1016 | /// |
| 1017 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1018 | /// ``` |
#[inline]
| 1020 | pub fn search( |
| 1021 | &self, |
| 1022 | cache: &mut Cache, |
| 1023 | input: &Input<'_>, |
| 1024 | caps: &mut Captures, |
| 1025 | ) { |
| 1026 | caps.set_pattern(None); |
| 1027 | let pid = self.search_slots(cache, input, caps.slots_mut()); |
| 1028 | caps.set_pattern(pid); |
| 1029 | } |
| 1030 | |
| 1031 | /// Executes a leftmost forward search and writes the spans of capturing |
| 1032 | /// groups that participated in a match into the provided `slots`, and |
| 1033 | /// returns the matching pattern ID. The contents of the slots for patterns |
| 1034 | /// other than the matching pattern are unspecified. If no match was found, |
/// then `None` is returned and the contents of `slots` are unspecified.
| 1036 | /// |
| 1037 | /// This is like [`PikeVM::search`], but it accepts a raw slots slice |
| 1038 | /// instead of a `Captures` value. This is useful in contexts where you |
| 1039 | /// don't want or need to allocate a `Captures`. |
| 1040 | /// |
| 1041 | /// It is legal to pass _any_ number of slots to this routine. If the regex |
| 1042 | /// engine would otherwise write a slot offset that doesn't fit in the |
| 1043 | /// provided slice, then it is simply skipped. In general though, there are |
| 1044 | /// usually three slice lengths you might want to use: |
| 1045 | /// |
| 1046 | /// * An empty slice, if you only care about which pattern matched. |
| 1047 | /// * A slice with |
| 1048 | /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) |
| 1049 | /// slots, if you only care about the overall match spans for each matching |
| 1050 | /// pattern. |
| 1051 | /// * A slice with |
| 1052 | /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which |
| 1053 | /// permits recording match offsets for every capturing group in every |
| 1054 | /// pattern. |
| 1055 | /// |
| 1056 | /// # Example |
| 1057 | /// |
| 1058 | /// This example shows how to find the overall match offsets in a |
| 1059 | /// multi-pattern search without allocating a `Captures` value. Indeed, we |
| 1060 | /// can put our slots right on the stack. |
| 1061 | /// |
| 1062 | /// ``` |
| 1063 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1064 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; |
| 1065 | /// |
| 1066 | /// let re = PikeVM::new_many(&[ |
/// r"\pL+",
/// r"\d+",
/// ])?;
/// let mut cache = re.create_cache();
/// let input = Input::new("!@#123");
| 1072 | /// |
| 1073 | /// // We only care about the overall match offsets here, so we just |
| 1074 | /// // allocate two slots for each pattern. Each slot records the start |
| 1075 | /// // and end of the match. |
| 1076 | /// let mut slots = [None; 4]; |
| 1077 | /// let pid = re.search_slots(&mut cache, &input, &mut slots); |
| 1078 | /// assert_eq!(Some(PatternID::must(1)), pid); |
| 1079 | /// |
| 1080 | /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. |
| 1081 | /// // See 'GroupInfo' for more details on the mapping between groups and |
| 1082 | /// // slot indices. |
| 1083 | /// let slot_start = pid.unwrap().as_usize() * 2; |
| 1084 | /// let slot_end = slot_start + 1; |
| 1085 | /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); |
| 1086 | /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); |
| 1087 | /// |
| 1088 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1089 | /// ``` |
#[inline]
| 1091 | pub fn search_slots( |
| 1092 | &self, |
| 1093 | cache: &mut Cache, |
| 1094 | input: &Input<'_>, |
| 1095 | slots: &mut [Option<NonMaxUsize>], |
| 1096 | ) -> Option<PatternID> { |
| 1097 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 1098 | if !utf8empty { |
| 1099 | let hm = self.search_slots_imp(cache, input, slots)?; |
| 1100 | return Some(hm.pattern()); |
| 1101 | } |
| 1102 | // There is an unfortunate special case where if the regex can |
| 1103 | // match the empty string and UTF-8 mode is enabled, the search |
// implementation requires that the slots have at least enough space
| 1105 | // to report the bounds of any match. This is so zero-width matches |
| 1106 | // that split a codepoint can be filtered out. |
| 1107 | // |
| 1108 | // Note that if utf8empty is true, we specialize the case for when |
| 1109 | // the number of patterns is 1. In that case, we can just use a stack |
| 1110 | // allocation. Otherwise we resort to a heap allocation, which we |
| 1111 | // convince ourselves we're fine with due to the pathological nature of |
| 1112 | // this case. |
| 1113 | let min = self.get_nfa().group_info().implicit_slot_len(); |
| 1114 | if slots.len() >= min { |
| 1115 | let hm = self.search_slots_imp(cache, input, slots)?; |
| 1116 | return Some(hm.pattern()); |
| 1117 | } |
| 1118 | if self.get_nfa().pattern_len() == 1 { |
| 1119 | let mut enough = [None, None]; |
| 1120 | let got = self.search_slots_imp(cache, input, &mut enough); |
| 1121 | // This is OK because we know `enough` is strictly bigger than |
| 1122 | // `slots`, otherwise this special case isn't reached. |
| 1123 | slots.copy_from_slice(&enough[..slots.len()]); |
| 1124 | return got.map(|hm| hm.pattern()); |
| 1125 | } |
| 1126 | let mut enough = vec![None; min]; |
| 1127 | let got = self.search_slots_imp(cache, input, &mut enough); |
| 1128 | // This is OK because we know `enough` is strictly bigger than `slots`, |
| 1129 | // otherwise this special case isn't reached. |
| 1130 | slots.copy_from_slice(&enough[..slots.len()]); |
| 1131 | got.map(|hm| hm.pattern()) |
| 1132 | } |
| 1133 | |
/// This is the actual implementation of `search_slots` that
| 1135 | /// doesn't account for the special case when 1) the NFA has UTF-8 mode |
| 1136 | /// enabled, 2) the NFA can match the empty string and 3) the caller has |
| 1137 | /// provided an insufficient number of slots to record match offsets. |
#[inline(never)]
| 1139 | fn search_slots_imp( |
| 1140 | &self, |
| 1141 | cache: &mut Cache, |
| 1142 | input: &Input<'_>, |
| 1143 | slots: &mut [Option<NonMaxUsize>], |
| 1144 | ) -> Option<HalfMatch> { |
| 1145 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 1146 | let hm = match self.search_imp(cache, input, slots) { |
| 1147 | None => return None, |
| 1148 | Some(hm) if !utf8empty => return Some(hm), |
| 1149 | Some(hm) => hm, |
| 1150 | }; |
| 1151 | empty::skip_splits_fwd(input, hm, hm.offset(), |input| { |
| 1152 | Ok(self |
| 1153 | .search_imp(cache, input, slots) |
| 1154 | .map(|hm| (hm, hm.offset()))) |
| 1155 | }) |
| 1156 | // OK because the PikeVM never errors. |
| 1157 | .unwrap() |
| 1158 | } |
| 1159 | |
| 1160 | /// Writes the set of patterns that match anywhere in the given search |
| 1161 | /// configuration to `patset`. If multiple patterns match at the same |
| 1162 | /// position and this `PikeVM` was configured with [`MatchKind::All`] |
| 1163 | /// semantics, then all matching patterns are written to the given set. |
| 1164 | /// |
| 1165 | /// Unless all of the patterns in this `PikeVM` are anchored, then |
| 1166 | /// generally speaking, this will visit every byte in the haystack. |
| 1167 | /// |
| 1168 | /// This search routine *does not* clear the pattern set. This gives some |
| 1169 | /// flexibility to the caller (e.g., running multiple searches with the |
| 1170 | /// same pattern set), but does make the API bug-prone if you're reusing |
| 1171 | /// the same pattern set for multiple searches but intended them to be |
| 1172 | /// independent. |
| 1173 | /// |
| 1174 | /// If a pattern ID matched but the given `PatternSet` does not have |
| 1175 | /// sufficient capacity to store it, then it is not inserted and silently |
| 1176 | /// dropped. |
| 1177 | /// |
| 1178 | /// # Example |
| 1179 | /// |
| 1180 | /// This example shows how to find all matching patterns in a haystack, |
| 1181 | /// even when some patterns match at the same position as other patterns. |
| 1182 | /// |
| 1183 | /// ``` |
| 1184 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1185 | /// use regex_automata::{ |
| 1186 | /// nfa::thompson::pikevm::PikeVM, |
| 1187 | /// Input, MatchKind, PatternSet, |
| 1188 | /// }; |
| 1189 | /// |
| 1190 | /// let patterns = &[ |
/// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
| 1192 | /// ]; |
| 1193 | /// let re = PikeVM::builder() |
| 1194 | /// .configure(PikeVM::config().match_kind(MatchKind::All)) |
| 1195 | /// .build_many(patterns)?; |
| 1196 | /// let mut cache = re.create_cache(); |
| 1197 | /// |
/// let input = Input::new("foobar");
| 1199 | /// let mut patset = PatternSet::new(re.pattern_len()); |
| 1200 | /// re.which_overlapping_matches(&mut cache, &input, &mut patset); |
| 1201 | /// let expected = vec![0, 2, 3, 4, 6]; |
| 1202 | /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect(); |
| 1203 | /// assert_eq!(expected, got); |
| 1204 | /// |
| 1205 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1206 | /// ``` |
#[inline]
| 1208 | pub fn which_overlapping_matches( |
| 1209 | &self, |
| 1210 | cache: &mut Cache, |
| 1211 | input: &Input<'_>, |
| 1212 | patset: &mut PatternSet, |
| 1213 | ) { |
| 1214 | self.which_overlapping_imp(cache, input, patset) |
| 1215 | } |
| 1216 | } |
| 1217 | |
| 1218 | impl PikeVM { |
| 1219 | /// The implementation of standard leftmost search. |
| 1220 | /// |
| 1221 | /// Capturing group spans are written to `slots`, but only if requested. |
| 1222 | /// `slots` can be any length. Any slot in the NFA that is activated but |
| 1223 | /// which is out of bounds for the given `slots` is ignored. |
| 1224 | fn search_imp( |
| 1225 | &self, |
| 1226 | cache: &mut Cache, |
| 1227 | input: &Input<'_>, |
| 1228 | slots: &mut [Option<NonMaxUsize>], |
| 1229 | ) -> Option<HalfMatch> { |
| 1230 | cache.setup_search(slots.len()); |
| 1231 | if input.is_done() { |
| 1232 | return None; |
| 1233 | } |
| 1234 | // Why do we even care about this? Well, in our 'Captures' |
| 1235 | // representation, we use usize::MAX as a sentinel to indicate "no |
| 1236 | // match." This isn't problematic so long as our haystack doesn't have |
| 1237 | // a maximal length. Byte slices are guaranteed by Rust to have a |
| 1238 | // length that fits into isize, and so this assert should always pass. |
| 1239 | // But we put it here to make our assumption explicit. |
assert!(
input.haystack().len() < core::usize::MAX,
"byte slice lengths must be less than usize MAX",
);
| 1244 | instrument!(|c| c.reset(&self.nfa)); |
| 1245 | |
| 1246 | // Whether we want to visit all match states instead of emulating the |
| 1247 | // 'leftmost' semantics of typical backtracking regex engines. |
| 1248 | let allmatches = |
| 1249 | self.config.get_match_kind().continue_past_first_match(); |
| 1250 | let (anchored, start_id) = match self.start_config(input) { |
| 1251 | None => return None, |
| 1252 | Some(config) => config, |
| 1253 | }; |
| 1254 | |
| 1255 | let pre = |
| 1256 | if anchored { None } else { self.get_config().get_prefilter() }; |
| 1257 | let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
| 1258 | let mut hm = None; |
| 1259 | // Yes, our search doesn't end at input.end(), but includes it. This |
| 1260 | // is necessary because matches are delayed by one byte, just like |
| 1261 | // how the DFA engines work. The delay is used to handle look-behind |
| 1262 | // assertions. In the case of the PikeVM, the delay is implemented |
| 1263 | // by not considering a match to exist until it is visited in |
| 1264 | // 'steps'. Technically, we know a match exists in the previous |
| 1265 | // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA |
| 1266 | // determinization. We don't mark a DFA state as a match state if it |
| 1267 | // contains an NFA match state, but rather, whether the DFA state was |
| 1268 | // generated by a transition from a DFA state that contains an NFA |
| 1269 | // match state.) |
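// For example, when searching the haystack "ab" for the regex 'a', the
// NFA's match state is added to 'next' while processing the byte at
// offset 0, but the corresponding HalfMatch (with end offset 1) is only
// recorded on the following iteration, once 'at == 1' is visited.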
| 1270 | let mut at = input.start(); |
| 1271 | while at <= input.end() { |
| 1272 | // If we have no states left to visit, then there are some cases |
| 1273 | // where we know we can quit early or even skip ahead. |
| 1274 | if curr.set.is_empty() { |
| 1275 | // We have a match and we haven't been instructed to continue |
| 1276 | // on even after finding a match, so we can quit. |
| 1277 | if hm.is_some() && !allmatches { |
| 1278 | break; |
| 1279 | } |
| 1280 | // If we're running an anchored search and we've advanced |
| 1281 | // beyond the start position with no other states to try, then |
| 1282 | // we will never observe a match and thus can stop. |
| 1283 | if anchored && at > input.start() { |
| 1284 | break; |
| 1285 | } |
// If there are no states left to explore at this position and we
| 1287 | // know we can't terminate early, then we are effectively at |
| 1288 | // the starting state of the NFA. If we fell through here, |
| 1289 | // we'd end up adding our '(?s-u:.)*?' prefix and it would be |
| 1290 | // the only thing in 'curr'. So we might as well just skip |
| 1291 | // ahead until we find something that we know might advance us |
| 1292 | // forward. |
| 1293 | if let Some(ref pre) = pre { |
| 1294 | let span = Span::from(at..input.end()); |
| 1295 | match pre.find(input.haystack(), span) { |
| 1296 | None => break, |
| 1297 | Some(ref span) => at = span.start, |
| 1298 | } |
| 1299 | } |
| 1300 | } |
| 1301 | // Instead of using the NFA's unanchored start state, we actually |
| 1302 | // always use its anchored starting state. As a result, when doing |
| 1303 | // an unanchored search, we need to simulate our own '(?s-u:.)*?' |
| 1304 | // prefix, to permit a match to appear anywhere. |
| 1305 | // |
| 1306 | // Now, we don't *have* to do things this way. We could use the |
| 1307 | // NFA's unanchored starting state and do one 'epsilon_closure' |
| 1308 | // call from that starting state before the main loop here. And |
| 1309 | // that is just as correct. However, it turns out to be slower |
| 1310 | // than our approach here because it slightly increases the cost |
| 1311 | // of processing each byte by requiring us to visit more NFA |
| 1312 | // states to deal with the additional NFA states in the unanchored |
| 1313 | // prefix. By simulating it explicitly here, we lower those costs |
| 1314 | // substantially. The cost is itself small, but it adds up for |
| 1315 | // large haystacks. |
| 1316 | // |
| 1317 | // In order to simulate the '(?s-u:.)*?' prefix---which is not |
| 1318 | // greedy---we are careful not to perform an epsilon closure on |
| 1319 | // the start state if we already have a match. Namely, if we |
| 1320 | // did otherwise, we would never reach a terminating condition |
| 1321 | // because there would always be additional states to process. |
| 1322 | // In effect, the exclusion of running 'epsilon_closure' when |
| 1323 | // we have a match corresponds to the "dead" states we have in |
| 1324 | // our DFA regex engines. Namely, in a DFA, match states merely |
| 1325 | // instruct the search execution to record the current offset as |
| 1326 | // the most recently seen match. It is the dead state that actually |
| 1327 | // indicates when to stop the search (other than EOF or quit |
| 1328 | // states). |
| 1329 | // |
| 1330 | // However, when 'allmatches' is true, the caller has asked us to |
| 1331 | // leave in every possible match state. This tends not to make a |
| 1332 | // whole lot of sense in unanchored searches, because it means the |
| 1333 | // search really cannot terminate until EOF. And often, in that |
| 1334 | // case, you wind up skipping over a bunch of matches and are left |
| 1335 | // with the "last" match. Arguably, it just doesn't make a lot of |
| 1336 | // sense to run a 'leftmost' search (which is what this routine is) |
| 1337 | // with 'allmatches' set to true. But the DFAs support it and this |
| 1338 | // matches their behavior. (Generally, 'allmatches' is useful for |
| 1339 | // overlapping searches or leftmost anchored searches to find the |
| 1340 | // longest possible match by ignoring match priority.) |
| 1341 | // |
| 1342 | // Additionally, when we're running an anchored search, this |
| 1343 | // epsilon closure should only be computed at the beginning of the |
| 1344 | // search. If we re-computed it at every position, we would be |
| 1345 | // simulating an unanchored search when we were tasked to perform |
| 1346 | // an anchored search. |
| 1347 | if (!hm.is_some() || allmatches) |
| 1348 | && (!anchored || at == input.start()) |
| 1349 | { |
| 1350 | // Since we are adding to the 'curr' active states and since |
| 1351 | // this is for the start ID, we use a slots slice that is |
| 1352 | // guaranteed to have the right length but where every element |
| 1353 | // is absent. This is exactly what we want, because this |
| 1354 | // epsilon closure is responsible for simulating an unanchored |
// '(?s-u:.)*?' prefix. It is specifically outside of any
| 1356 | // capturing groups, and thus, using slots that are always |
| 1357 | // absent is correct. |
| 1358 | // |
| 1359 | // Note though that we can't just use '&mut []' here, since |
| 1360 | // this epsilon closure may traverse through 'Captures' epsilon |
| 1361 | // transitions, and thus must be able to write offsets to the |
| 1362 | // slots given which are later copied to slot values in 'curr'. |
| 1363 | let slots = next.slot_table.all_absent(); |
| 1364 | self.epsilon_closure(stack, slots, curr, input, at, start_id); |
| 1365 | } |
| 1366 | if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) |
| 1367 | { |
| 1368 | hm = Some(HalfMatch::new(pid, at)); |
| 1369 | } |
| 1370 | // Unless the caller asked us to return early, we need to mush on |
| 1371 | // to see if we can extend our match. (But note that 'nexts' will |
| 1372 | // quit right after seeing a match when match_kind==LeftmostFirst, |
| 1373 | // as is consistent with leftmost-first match priority.) |
| 1374 | if input.get_earliest() && hm.is_some() { |
| 1375 | break; |
| 1376 | } |
| 1377 | core::mem::swap(curr, next); |
| 1378 | next.set.clear(); |
| 1379 | at += 1; |
| 1380 | } |
| 1381 | instrument!(|c| c.eprint(&self.nfa)); |
| 1382 | hm |
| 1383 | } |
| 1384 | |
| 1385 | /// The implementation for the 'which_overlapping_matches' API. Basically, |
| 1386 | /// we do a single scan through the entire haystack (unless our regex |
| 1387 | /// or search is anchored) and record every pattern that matched. In |
| 1388 | /// particular, when MatchKind::All is used, this supports overlapping |
| 1389 | /// matches. So if we have the regexes 'sam' and 'samwise', they will |
| 1390 | /// *both* be reported in the pattern set when searching the haystack |
| 1391 | /// 'samwise'. |
| 1392 | fn which_overlapping_imp( |
| 1393 | &self, |
| 1394 | cache: &mut Cache, |
| 1395 | input: &Input<'_>, |
| 1396 | patset: &mut PatternSet, |
| 1397 | ) { |
| 1398 | // NOTE: This is effectively a copy of 'search_imp' above, but with no |
| 1399 | // captures support and instead writes patterns that matched directly |
| 1400 | // to 'patset'. See that routine for better commentary about what's |
| 1401 | // going on in this routine. We probably could unify the routines using |
| 1402 | // generics or more helper routines, but I'm not sure it's worth it. |
| 1403 | // |
| 1404 | // NOTE: We somewhat go out of our way here to support things like |
| 1405 | // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither |
| 1406 | // of those seem particularly relevant to this routine, but they are |
| 1407 | // both supported by the DFA analogs of this routine by construction |
| 1408 | // and composition, so it seems like good sense to have the PikeVM |
| 1409 | // match that behavior. |
| 1410 | |
| 1411 | cache.setup_search(0); |
| 1412 | if input.is_done() { |
| 1413 | return; |
| 1414 | } |
| 1415 | assert!( |
| 1416 | input.haystack().len() < core::usize::MAX, |
| 1417 | "byte slice lengths must be less than usize MAX" , |
| 1418 | ); |
| 1419 | instrument!(|c| c.reset(&self.nfa)); |
| 1420 | |
| 1421 | let allmatches = |
| 1422 | self.config.get_match_kind().continue_past_first_match(); |
| 1423 | let (anchored, start_id) = match self.start_config(input) { |
| 1424 | None => return, |
| 1425 | Some(config) => config, |
| 1426 | }; |
| 1427 | |
| 1428 | let Cache { ref mut stack, ref mut curr, ref mut next } = cache; |
| 1429 | for at in input.start()..=input.end() { |
| 1430 | let any_matches = !patset.is_empty(); |
| 1431 | if curr.set.is_empty() { |
| 1432 | if any_matches && !allmatches { |
| 1433 | break; |
| 1434 | } |
| 1435 | if anchored && at > input.start() { |
| 1436 | break; |
| 1437 | } |
| 1438 | } |
| 1439 | if !any_matches || allmatches { |
| 1440 | let slots = &mut []; |
| 1441 | self.epsilon_closure(stack, slots, curr, input, at, start_id); |
| 1442 | } |
| 1443 | self.nexts_overlapping(stack, curr, next, input, at, patset); |
| 1444 | // If we found a match and filled our set, then there is no more |
| 1445 | // additional info that we can provide. Thus, we can quit. We also |
| 1446 | // quit if the caller asked us to stop at the earliest point that |
| 1447 | // we know a match exists. |
| 1448 | if patset.is_full() || input.get_earliest() { |
| 1449 | break; |
| 1450 | } |
| 1451 | core::mem::swap(curr, next); |
| 1452 | next.set.clear(); |
| 1453 | } |
| 1454 | instrument!(|c| c.eprint(&self.nfa)); |
| 1455 | } |
| 1456 | |
| 1457 | /// Process the active states in 'curr' to find the states (written to |
| 1458 | /// 'next') we should process for the next byte in the haystack. |
| 1459 | /// |
| 1460 | /// 'stack' is used to perform a depth first traversal of the NFA when |
| 1461 | /// computing an epsilon closure. |
| 1462 | /// |
/// When a match is found, the slots for that match state (in 'curr') are
/// copied to the caller-provided 'slots'. Moreover, once a match is seen,
/// processing for 'curr' stops (unless the PikeVM was configured with
/// MatchKind::All semantics).
| 1466 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1467 | fn nexts( |
| 1468 | &self, |
| 1469 | stack: &mut Vec<FollowEpsilon>, |
| 1470 | curr: &mut ActiveStates, |
| 1471 | next: &mut ActiveStates, |
| 1472 | input: &Input<'_>, |
| 1473 | at: usize, |
| 1474 | slots: &mut [Option<NonMaxUsize>], |
| 1475 | ) -> Option<PatternID> { |
| 1476 | instrument!(|c| c.record_state_set(&curr.set)); |
| 1477 | let mut pid = None; |
| 1478 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
| 1479 | for sid in set.iter() { |
| 1480 | pid = match self.next(stack, slot_table, next, input, at, sid) { |
| 1481 | None => continue, |
| 1482 | Some(pid) => Some(pid), |
| 1483 | }; |
| 1484 | slots.copy_from_slice(slot_table.for_state(sid)); |
| 1485 | if !self.config.get_match_kind().continue_past_first_match() { |
| 1486 | break; |
| 1487 | } |
| 1488 | } |
| 1489 | pid |
| 1490 | } |
| 1491 | |
| 1492 | /// Like 'nexts', but for the overlapping case. This doesn't write any |
| 1493 | /// slots, and instead just writes which pattern matched in 'patset'. |
| 1494 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1495 | fn nexts_overlapping( |
| 1496 | &self, |
| 1497 | stack: &mut Vec<FollowEpsilon>, |
| 1498 | curr: &mut ActiveStates, |
| 1499 | next: &mut ActiveStates, |
| 1500 | input: &Input<'_>, |
| 1501 | at: usize, |
| 1502 | patset: &mut PatternSet, |
| 1503 | ) { |
| 1504 | instrument!(|c| c.record_state_set(&curr.set)); |
| 1505 | let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); |
| 1506 | let ActiveStates { ref set, ref mut slot_table } = *curr; |
| 1507 | for sid in set.iter() { |
| 1508 | let pid = match self.next(stack, slot_table, next, input, at, sid) |
| 1509 | { |
| 1510 | None => continue, |
| 1511 | Some(pid) => pid, |
| 1512 | }; |
| 1513 | // This handles the case of finding a zero-width match that splits |
| 1514 | // a codepoint. Namely, if we're in UTF-8 mode AND we know we can |
| 1515 | // match the empty string, then the only valid way of getting to |
| 1516 | // this point with an offset that splits a codepoint is when we |
| 1517 | // have an empty match. Such matches, in UTF-8 mode, must not be |
| 1518 | // reported. So we just skip them here and pretend as if we did |
| 1519 | // not see a match. |
| 1520 | if utf8empty && !input.is_char_boundary(at) { |
| 1521 | continue; |
| 1522 | } |
| 1523 | let _ = patset.try_insert(pid); |
| 1524 | if !self.config.get_match_kind().continue_past_first_match() { |
| 1525 | break; |
| 1526 | } |
| 1527 | } |
| 1528 | } |
| 1529 | |
| 1530 | /// Starting from 'sid', if the position 'at' in the 'input' haystack has a |
| 1531 | /// transition defined out of 'sid', then add the state transitioned to and |
| 1532 | /// its epsilon closure to the 'next' set of states to explore. |
| 1533 | /// |
| 1534 | /// 'stack' is used by the epsilon closure computation to perform a depth |
| 1535 | /// first traversal of the NFA. |
| 1536 | /// |
| 1537 | /// 'curr_slot_table' should be the table of slots for the current set of |
| 1538 | /// states being explored. If there is a transition out of 'sid', then |
| 1539 | /// sid's row in the slot table is used to perform the epsilon closure. |
| 1540 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1541 | fn next( |
| 1542 | &self, |
| 1543 | stack: &mut Vec<FollowEpsilon>, |
| 1544 | curr_slot_table: &mut SlotTable, |
| 1545 | next: &mut ActiveStates, |
| 1546 | input: &Input<'_>, |
| 1547 | at: usize, |
| 1548 | sid: StateID, |
| 1549 | ) -> Option<PatternID> { |
| 1550 | instrument!(|c| c.record_step(sid)); |
| 1551 | match *self.nfa.state(sid) { |
| 1552 | State::Fail |
| 1553 | | State::Look { .. } |
| 1554 | | State::Union { .. } |
| 1555 | | State::BinaryUnion { .. } |
| 1556 | | State::Capture { .. } => None, |
| 1557 | State::ByteRange { ref trans } => { |
| 1558 | if trans.matches(input.haystack(), at) { |
| 1559 | let slots = curr_slot_table.for_state(sid); |
| 1560 | // OK because 'at <= haystack.len() < usize::MAX', so |
| 1561 | // adding 1 will never wrap. |
| 1562 | let at = at.wrapping_add(1); |
| 1563 | self.epsilon_closure( |
| 1564 | stack, slots, next, input, at, trans.next, |
| 1565 | ); |
| 1566 | } |
| 1567 | None |
| 1568 | } |
| 1569 | State::Sparse(ref sparse) => { |
| 1570 | if let Some(next_sid) = sparse.matches(input.haystack(), at) { |
| 1571 | let slots = curr_slot_table.for_state(sid); |
| 1572 | // OK because 'at <= haystack.len() < usize::MAX', so |
| 1573 | // adding 1 will never wrap. |
| 1574 | let at = at.wrapping_add(1); |
| 1575 | self.epsilon_closure( |
| 1576 | stack, slots, next, input, at, next_sid, |
| 1577 | ); |
| 1578 | } |
| 1579 | None |
| 1580 | } |
| 1581 | State::Dense(ref dense) => { |
| 1582 | if let Some(next_sid) = dense.matches(input.haystack(), at) { |
| 1583 | let slots = curr_slot_table.for_state(sid); |
| 1584 | // OK because 'at <= haystack.len() < usize::MAX', so |
| 1585 | // adding 1 will never wrap. |
| 1586 | let at = at.wrapping_add(1); |
| 1587 | self.epsilon_closure( |
| 1588 | stack, slots, next, input, at, next_sid, |
| 1589 | ); |
| 1590 | } |
| 1591 | None |
| 1592 | } |
| 1593 | State::Match { pattern_id } => Some(pattern_id), |
| 1594 | } |
| 1595 | } |
| 1596 | |
| 1597 | /// Compute the epsilon closure of 'sid', writing the closure into 'next' |
| 1598 | /// while copying slot values from 'curr_slots' into corresponding states |
| 1599 | /// in 'next'. 'curr_slots' should be the slot values corresponding to |
| 1600 | /// 'sid'. |
| 1601 | /// |
| 1602 | /// The given 'stack' is used to perform a depth first traversal of the |
| 1603 | /// NFA by recursively following all epsilon transitions out of 'sid'. |
| 1604 | /// Conditional epsilon transitions are followed if and only if they are |
| 1605 | /// satisfied for the position 'at' in the 'input' haystack. |
| 1606 | /// |
| 1607 | /// While this routine may write to 'curr_slots', once it returns, any |
| 1608 | /// writes are undone and the original values (even if absent) are |
| 1609 | /// restored. |
| 1610 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1611 | fn epsilon_closure( |
| 1612 | &self, |
| 1613 | stack: &mut Vec<FollowEpsilon>, |
| 1614 | curr_slots: &mut [Option<NonMaxUsize>], |
| 1615 | next: &mut ActiveStates, |
| 1616 | input: &Input<'_>, |
| 1617 | at: usize, |
| 1618 | sid: StateID, |
| 1619 | ) { |
| 1620 | instrument!(|c| { |
| 1621 | c.record_closure(sid); |
| 1622 | c.record_stack_push(sid); |
| 1623 | }); |
| 1624 | stack.push(FollowEpsilon::Explore(sid)); |
| 1625 | while let Some(frame) = stack.pop() { |
| 1626 | match frame { |
| 1627 | FollowEpsilon::RestoreCapture { slot, offset: pos } => { |
| 1628 | curr_slots[slot] = pos; |
| 1629 | } |
| 1630 | FollowEpsilon::Explore(sid) => { |
| 1631 | self.epsilon_closure_explore( |
| 1632 | stack, curr_slots, next, input, at, sid, |
| 1633 | ); |
| 1634 | } |
| 1635 | } |
| 1636 | } |
| 1637 | } |
| 1638 | |
| 1639 | /// Explore all of the epsilon transitions out of 'sid'. This is mostly |
| 1640 | /// split out from 'epsilon_closure' in order to clearly delineate |
| 1641 | /// the actual work of computing an epsilon closure from the stack |
| 1642 | /// book-keeping. |
| 1643 | /// |
| 1644 | /// This will push any additional explorations needed on to 'stack'. |
| 1645 | /// |
| 1646 | /// 'curr_slots' should refer to the slots for the currently active NFA |
| 1647 | /// state. That is, the current state we are stepping through. These |
| 1648 | /// slots are mutated in place as new 'Captures' states are traversed |
| 1649 | /// during epsilon closure, but the slots are restored to their original |
| 1650 | /// values once the full epsilon closure is completed. The ultimate use of |
| 1651 | /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that |
| 1652 | /// the capturing group spans are forwarded from the currently active state |
| 1653 | /// to the next. |
| 1654 | /// |
| 1655 | /// 'next' refers to the next set of active states. Computing an epsilon |
| 1656 | /// closure may increase the next set of active states. |
| 1657 | /// |
| 1658 | /// 'input' refers to the caller's input configuration and 'at' refers to |
| 1659 | /// the current position in the haystack. These are used to check whether |
| 1660 | /// conditional epsilon transitions (like look-around) are satisfied at |
| 1661 | /// the current position. If they aren't, then the epsilon closure won't |
| 1662 | /// include them. |
| 1663 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1664 | fn epsilon_closure_explore( |
| 1665 | &self, |
| 1666 | stack: &mut Vec<FollowEpsilon>, |
| 1667 | curr_slots: &mut [Option<NonMaxUsize>], |
| 1668 | next: &mut ActiveStates, |
| 1669 | input: &Input<'_>, |
| 1670 | at: usize, |
| 1671 | mut sid: StateID, |
| 1672 | ) { |
| 1673 | // We can avoid pushing some state IDs on to our stack in precisely |
| 1674 | // the cases where a 'push(x)' would be immediately followed by a 'x |
| 1675 | // = pop()'. This is achieved by this outer-loop. We simply set 'sid' |
| 1676 | // to be the next state ID we want to explore once we're done with |
| 1677 | // our initial exploration. In practice, this avoids a lot of stack |
| 1678 | // thrashing. |
| 1679 | loop { |
| 1680 | instrument!(|c| c.record_set_insert(sid)); |
| 1681 | // Record this state as part of our next set of active states. If |
| 1682 | // we've already explored it, then no need to do it again. |
| 1683 | if !next.set.insert(sid) { |
| 1684 | return; |
| 1685 | } |
| 1686 | match *self.nfa.state(sid) { |
| 1687 | State::Fail |
| 1688 | | State::Match { .. } |
| 1689 | | State::ByteRange { .. } |
| 1690 | | State::Sparse { .. } |
| 1691 | | State::Dense { .. } => { |
| 1692 | next.slot_table.for_state(sid).copy_from_slice(curr_slots); |
| 1693 | return; |
| 1694 | } |
| 1695 | State::Look { look, next } => { |
| 1696 | // OK because we don't permit building a searcher with a |
| 1697 | // Unicode word boundary if the requisite Unicode data is |
| 1698 | // unavailable. |
| 1699 | if !self.nfa.look_matcher().matches_inline( |
| 1700 | look, |
| 1701 | input.haystack(), |
| 1702 | at, |
| 1703 | ) { |
| 1704 | return; |
| 1705 | } |
| 1706 | sid = next; |
| 1707 | } |
| 1708 | State::Union { ref alternates } => { |
| 1709 | sid = match alternates.get(0) { |
| 1710 | None => return, |
| 1711 | Some(&sid) => sid, |
| 1712 | }; |
| 1713 | instrument!(|c| { |
| 1714 | for &alt in &alternates[1..] { |
| 1715 | c.record_stack_push(alt); |
| 1716 | } |
| 1717 | }); |
| 1718 | stack.extend( |
| 1719 | alternates[1..] |
| 1720 | .iter() |
| 1721 | .copied() |
| 1722 | .rev() |
| 1723 | .map(FollowEpsilon::Explore), |
| 1724 | ); |
| 1725 | } |
| 1726 | State::BinaryUnion { alt1, alt2 } => { |
| 1727 | sid = alt1; |
| 1728 | instrument!(|c| c.record_stack_push(sid)); |
| 1729 | stack.push(FollowEpsilon::Explore(alt2)); |
| 1730 | } |
| 1731 | State::Capture { next, slot, .. } => { |
| 1732 | // There's no need to do anything with slots that |
| 1733 | // ultimately won't be copied into the caller-provided |
| 1734 | // 'Captures' value. So we just skip dealing with them at |
| 1735 | // all. |
| 1736 | if slot.as_usize() < curr_slots.len() { |
| 1737 | instrument!(|c| c.record_stack_push(sid)); |
| 1738 | stack.push(FollowEpsilon::RestoreCapture { |
| 1739 | slot, |
| 1740 | offset: curr_slots[slot], |
| 1741 | }); |
| 1742 | // OK because length of a slice must fit into an isize. |
| 1743 | curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); |
| 1744 | } |
| 1745 | sid = next; |
| 1746 | } |
| 1747 | } |
| 1748 | } |
| 1749 | } |
| 1750 | |
| 1751 | /// Return the starting configuration of a PikeVM search. |
| 1752 | /// |
| 1753 | /// The "start config" is basically whether the search should be anchored |
| 1754 | /// or not and the NFA state ID at which to begin the search. The state ID |
| 1755 | /// returned always corresponds to an anchored starting state even when the |
| 1756 | /// search is unanchored. This is because the PikeVM search loop deals with |
| 1757 | /// unanchored searches with an explicit epsilon closure out of the start |
| 1758 | /// state. |
| 1759 | /// |
| 1760 | /// This routine accounts for both the caller's `Input` configuration |
| 1761 | /// and the pattern itself. For example, even if the caller asks for an |
/// unanchored search, if the pattern itself is anchored, then the anchored
/// flag returned will always be 'true', because implementing an unanchored
/// search in that case would be incorrect.
| 1765 | /// |
| 1766 | /// Similarly, if the caller requests an anchored search for a particular |
| 1767 | /// pattern, then the starting state ID returned will reflect that. |
| 1768 | /// |
| 1769 | /// If a pattern ID is given in the input configuration that is not in |
| 1770 | /// this regex, then `None` is returned. |
| 1771 | fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { |
| 1772 | match input.get_anchored() { |
| 1773 | // Only way we're unanchored is if both the caller asked for an |
| 1774 | // unanchored search *and* the pattern is itself not anchored. |
| 1775 | Anchored::No => Some(( |
| 1776 | self.nfa.is_always_start_anchored(), |
| 1777 | self.nfa.start_anchored(), |
| 1778 | )), |
| 1779 | Anchored::Yes => Some((true, self.nfa.start_anchored())), |
| 1780 | Anchored::Pattern(pid) => { |
| 1781 | Some((true, self.nfa.start_pattern(pid)?)) |
| 1782 | } |
| 1783 | } |
| 1784 | } |
| 1785 | } |
| 1786 | |
| 1787 | /// An iterator over all non-overlapping matches for a particular search. |
| 1788 | /// |
| 1789 | /// The iterator yields a [`Match`] value until no more matches could be found. |
| 1790 | /// |
| 1791 | /// The lifetime parameters are as follows: |
| 1792 | /// |
| 1793 | /// * `'r` represents the lifetime of the PikeVM. |
| 1794 | /// * `'c` represents the lifetime of the PikeVM's cache. |
| 1795 | /// * `'h` represents the lifetime of the haystack being searched. |
| 1796 | /// |
| 1797 | /// This iterator can be created with the [`PikeVM::find_iter`] method. |
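///
/// # Example
///
/// A brief example of this iterator in use, with an arbitrary pattern and
/// haystack chosen for illustration (offsets assume the default `PikeVM`
/// configuration):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
///
/// let re = PikeVM::new(r"[0-9]+")?;
/// let mut cache = re.create_cache();
/// let mut it = re.find_iter(&mut cache, "a1 b22 c333");
/// assert_eq!(Some(Match::must(0, 1..2)), it.next());
/// assert_eq!(Some(Match::must(0, 4..6)), it.next());
/// assert_eq!(Some(Match::must(0, 8..11)), it.next());
/// assert_eq!(None, it.next());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```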
| 1798 | #[derive (Debug)] |
| 1799 | pub struct FindMatches<'r, 'c, 'h> { |
| 1800 | re: &'r PikeVM, |
| 1801 | cache: &'c mut Cache, |
| 1802 | caps: Captures, |
| 1803 | it: iter::Searcher<'h>, |
| 1804 | } |
| 1805 | |
| 1806 | impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { |
| 1807 | type Item = Match; |
| 1808 | |
| 1809 | #[inline ] |
| 1810 | fn next(&mut self) -> Option<Match> { |
| 1811 | // Splitting 'self' apart seems necessary to appease borrowck. |
let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
| 1814 | // 'advance' converts errors into panics, which is OK here because |
| 1815 | // the PikeVM can never return an error. |
| 1816 | it.advance(|input: &Input<'_>| { |
| 1817 | re.search(cache, input, caps); |
| 1818 | Ok(caps.get_match()) |
| 1819 | }) |
| 1820 | } |
| 1821 | } |
| 1822 | |
| 1823 | /// An iterator over all non-overlapping leftmost matches, with their capturing |
| 1824 | /// groups, for a particular search. |
| 1825 | /// |
| 1826 | /// The iterator yields a [`Captures`] value until no more matches could be |
| 1827 | /// found. |
| 1828 | /// |
| 1829 | /// The lifetime parameters are as follows: |
| 1830 | /// |
| 1831 | /// * `'r` represents the lifetime of the PikeVM. |
| 1832 | /// * `'c` represents the lifetime of the PikeVM's cache. |
| 1833 | /// * `'h` represents the lifetime of the haystack being searched. |
| 1834 | /// |
| 1835 | /// This iterator can be created with the [`PikeVM::captures_iter`] method. |
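///
/// # Example
///
/// A brief example of iterating over matches along with their capturing
/// group spans, using an arbitrary pattern and haystack for illustration
/// (spans assume the default `PikeVM` configuration):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
///
/// let re = PikeVM::new(r"([a-z])([0-9])")?;
/// let mut cache = re.create_cache();
/// let mut it = re.captures_iter(&mut cache, "a1 b2");
///
/// let caps = it.next().unwrap();
/// assert_eq!(Some(Span::from(0..2)), caps.get_group(0));
/// assert_eq!(Some(Span::from(0..1)), caps.get_group(1));
/// assert_eq!(Some(Span::from(1..2)), caps.get_group(2));
///
/// let caps = it.next().unwrap();
/// assert_eq!(Some(Span::from(3..5)), caps.get_group(0));
/// assert!(it.next().is_none());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```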
| 1836 | #[derive (Debug)] |
| 1837 | pub struct CapturesMatches<'r, 'c, 'h> { |
| 1838 | re: &'r PikeVM, |
| 1839 | cache: &'c mut Cache, |
| 1840 | caps: Captures, |
| 1841 | it: iter::Searcher<'h>, |
| 1842 | } |
| 1843 | |
| 1844 | impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { |
| 1845 | type Item = Captures; |
| 1846 | |
| 1847 | #[inline ] |
| 1848 | fn next(&mut self) -> Option<Captures> { |
| 1849 | // Splitting 'self' apart seems necessary to appease borrowck. |
let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
| 1852 | // 'advance' converts errors into panics, which is OK here because |
| 1853 | // the PikeVM can never return an error. |
| 1854 | it.advance(|input: &Input<'_>| { |
| 1855 | re.search(cache, input, caps); |
| 1856 | Ok(caps.get_match()) |
| 1857 | }); |
| 1858 | if caps.is_match() { |
| 1859 | Some(caps.clone()) |
| 1860 | } else { |
| 1861 | None |
| 1862 | } |
| 1863 | } |
| 1864 | } |
| 1865 | |
| 1866 | /// A cache represents mutable state that a [`PikeVM`] requires during a |
| 1867 | /// search. |
| 1868 | /// |
| 1869 | /// For a given [`PikeVM`], its corresponding cache may be created either via |
| 1870 | /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in |
| 1871 | /// every way, except the former does not require explicitly importing `Cache`. |
| 1872 | /// |
| 1873 | /// A particular `Cache` is coupled with the [`PikeVM`] from which it |
| 1874 | /// was created. It may only be used with that `PikeVM`. A cache and its |
| 1875 | /// allocations may be re-purposed via [`Cache::reset`], in which case, it can |
| 1876 | /// only be used with the new `PikeVM` (and not the old one). |
| 1877 | #[derive (Clone, Debug)] |
| 1878 | pub struct Cache { |
| 1879 | /// Stack used while computing epsilon closure. This effectively lets us |
| 1880 | /// move what is more naturally expressed through recursion to a stack |
| 1881 | /// on the heap. |
| 1882 | stack: Vec<FollowEpsilon>, |
| 1883 | /// The current active states being explored for the current byte in the |
| 1884 | /// haystack. |
| 1885 | curr: ActiveStates, |
| 1886 | /// The next set of states we're building that will be explored for the |
| 1887 | /// next byte in the haystack. |
| 1888 | next: ActiveStates, |
| 1889 | } |
| 1890 | |
| 1891 | impl Cache { |
| 1892 | /// Create a new [`PikeVM`] cache. |
| 1893 | /// |
| 1894 | /// A potentially more convenient routine to create a cache is |
| 1895 | /// [`PikeVM::create_cache`], as it does not require also importing the |
| 1896 | /// `Cache` type. |
| 1897 | /// |
| 1898 | /// If you want to reuse the returned `Cache` with some other `PikeVM`, |
| 1899 | /// then you must call [`Cache::reset`] with the desired `PikeVM`. |
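///
/// # Example
///
/// A brief example of building a `Cache` directly, which is equivalent to
/// [`PikeVM::create_cache`] (the pattern and haystack are arbitrary and
/// only for illustration):
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::{Cache, PikeVM}, Match};
///
/// let re = PikeVM::new(r"\w+")?;
/// let mut cache = Cache::new(&re);
/// let m = re.find_iter(&mut cache, "hello").next();
/// assert_eq!(Some(Match::must(0, 0..5)), m);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```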
| 1900 | pub fn new(re: &PikeVM) -> Cache { |
| 1901 | Cache { |
| 1902 | stack: vec![], |
| 1903 | curr: ActiveStates::new(re), |
| 1904 | next: ActiveStates::new(re), |
| 1905 | } |
| 1906 | } |
| 1907 | |
| 1908 | /// Reset this cache such that it can be used for searching with a |
| 1909 | /// different [`PikeVM`]. |
| 1910 | /// |
| 1911 | /// A cache reset permits reusing memory already allocated in this cache |
| 1912 | /// with a different `PikeVM`. |
| 1913 | /// |
| 1914 | /// # Example |
| 1915 | /// |
| 1916 | /// This shows how to re-purpose a cache for use with a different `PikeVM`. |
| 1917 | /// |
| 1918 | /// ``` |
| 1919 | /// # if cfg!(miri) { return Ok(()); } // miri takes too long |
| 1920 | /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; |
| 1921 | /// |
| 1922 | /// let re1 = PikeVM::new(r"\w" )?; |
| 1923 | /// let re2 = PikeVM::new(r"\W" )?; |
| 1924 | /// |
| 1925 | /// let mut cache = re1.create_cache(); |
| 1926 | /// assert_eq!( |
| 1927 | /// Some(Match::must(0, 0..2)), |
| 1928 | /// re1.find_iter(&mut cache, "Δ" ).next(), |
| 1929 | /// ); |
| 1930 | /// |
| 1931 | /// // Using 'cache' with re2 is not allowed. It may result in panics or |
| 1932 | /// // incorrect results. In order to re-purpose the cache, we must reset |
| 1933 | /// // it with the PikeVM we'd like to use it with. |
| 1934 | /// // |
| 1935 | /// // Similarly, after this reset, using the cache with 're1' is also not |
| 1936 | /// // allowed. |
| 1937 | /// cache.reset(&re2); |
| 1938 | /// assert_eq!( |
| 1939 | /// Some(Match::must(0, 0..3)), |
| 1940 | /// re2.find_iter(&mut cache, "☃" ).next(), |
| 1941 | /// ); |
| 1942 | /// |
| 1943 | /// # Ok::<(), Box<dyn std::error::Error>>(()) |
| 1944 | /// ``` |
| 1945 | pub fn reset(&mut self, re: &PikeVM) { |
| 1946 | self.curr.reset(re); |
| 1947 | self.next.reset(re); |
| 1948 | } |
| 1949 | |
| 1950 | /// Returns the heap memory usage, in bytes, of this cache. |
| 1951 | /// |
| 1952 | /// This does **not** include the stack size used up by this cache. To |
| 1953 | /// compute that, use `std::mem::size_of::<Cache>()`. |
| 1954 | pub fn memory_usage(&self) -> usize { |
| 1955 | use core::mem::size_of; |
| 1956 | (self.stack.len() * size_of::<FollowEpsilon>()) |
| 1957 | + self.curr.memory_usage() |
| 1958 | + self.next.memory_usage() |
| 1959 | } |
| 1960 | |
| 1961 | /// Clears this cache. This should be called at the start of every search |
| 1962 | /// to ensure we start with a clean slate. |
| 1963 | /// |
| 1964 | /// This also sets the length of the capturing groups used in the current |
| 1965 | /// search. This permits an optimization where by 'SlotTable::for_state' |
| 1966 | /// only returns the number of slots equivalent to the number of slots |
| 1967 | /// given in the 'Captures' value. This may be less than the total number |
| 1968 | /// of possible slots, e.g., when one only wants to track overall match |
| 1969 | /// offsets. This in turn permits less copying of capturing group spans |
| 1970 | /// in the PikeVM. |
| 1971 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 1972 | self.stack.clear(); |
| 1973 | self.curr.setup_search(captures_slot_len); |
| 1974 | self.next.setup_search(captures_slot_len); |
| 1975 | } |
| 1976 | } |
| 1977 | |
| 1978 | /// A set of active states used to "simulate" the execution of an NFA via the |
| 1979 | /// PikeVM. |
| 1980 | /// |
| 1981 | /// There are two sets of these used during NFA simulation. One set corresponds |
| 1982 | /// to the "current" set of states being traversed for the current position |
| 1983 | /// in a haystack. The other set corresponds to the "next" set of states being |
| 1984 | /// built, which will become the new "current" set for the next position in the |
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
| 1987 | /// |
| 1988 | /// In addition to representing a set of NFA states, this also maintains slot |
| 1989 | /// values for each state. These slot values are what turn the NFA simulation |
| 1990 | /// into the "Pike VM." Namely, they track capturing group values for each |
| 1991 | /// state. During the computation of epsilon closure, we copy slot values from |
| 1992 | /// states in the "current" set to the "next" set. Eventually, once a match |
| 1993 | /// is found, the slot values for that match state are what we write to the |
| 1994 | /// caller provided 'Captures' value. |
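///
/// As a rough sketch (illustrative only), one step of the simulation uses
/// the two sets like so:
///
/// ```text
/// for each state 'sid' in 'curr' (in insertion order):
///     if the byte 'haystack[at]' has a transition out of 'sid', then
///     compute the epsilon closure of the target and add it to 'next',
///     forwarding 'sid's slot values to the states added along the way
/// swap('curr', 'next'), clear 'next' and advance 'at' by one
/// ```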
| 1995 | #[derive (Clone, Debug)] |
| 1996 | struct ActiveStates { |
| 1997 | /// The set of active NFA states. This set preserves insertion order, which |
| 1998 | /// is critical for simulating the match semantics of backtracking regex |
| 1999 | /// engines. |
| 2000 | set: SparseSet, |
| 2001 | /// The slots for every NFA state, where each slot stores a (possibly |
| 2002 | /// absent) offset. Every capturing group has two slots. One for a start |
| 2003 | /// offset and one for an end offset. |
| 2004 | slot_table: SlotTable, |
| 2005 | } |
| 2006 | |
| 2007 | impl ActiveStates { |
| 2008 | /// Create a new set of active states for the given PikeVM. The active |
| 2009 | /// states returned may only be used with the given PikeVM. (Use 'reset' |
| 2010 | /// to re-purpose the allocation for a different PikeVM.) |
| 2011 | fn new(re: &PikeVM) -> ActiveStates { |
| 2012 | let mut active = ActiveStates { |
| 2013 | set: SparseSet::new(0), |
| 2014 | slot_table: SlotTable::new(), |
| 2015 | }; |
| 2016 | active.reset(re); |
| 2017 | active |
| 2018 | } |
| 2019 | |
| 2020 | /// Reset this set of active states such that it can be used with the given |
| 2021 | /// PikeVM (and only that PikeVM). |
| 2022 | fn reset(&mut self, re: &PikeVM) { |
| 2023 | self.set.resize(re.get_nfa().states().len()); |
| 2024 | self.slot_table.reset(re); |
| 2025 | } |
| 2026 | |
| 2027 | /// Return the heap memory usage, in bytes, used by this set of active |
| 2028 | /// states. |
| 2029 | /// |
| 2030 | /// This does not include the stack size of this value. |
| 2031 | fn memory_usage(&self) -> usize { |
| 2032 | self.set.memory_usage() + self.slot_table.memory_usage() |
| 2033 | } |
| 2034 | |
| 2035 | /// Setup this set of active states for a new search. The given slot |
| 2036 | /// length should be the number of slots in a caller provided 'Captures' |
| 2037 | /// (and may be zero). |
| 2038 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 2039 | self.set.clear(); |
| 2040 | self.slot_table.setup_search(captures_slot_len); |
| 2041 | } |
| 2042 | } |
| 2043 | |
/// A table of slots, where each row represents a state in an NFA. Thus, the
| 2045 | /// table has room for storing slots for every single state in an NFA. |
| 2046 | /// |
| 2047 | /// This table is represented with a single contiguous allocation. In general, |
| 2048 | /// the notion of "capturing group" doesn't really exist at this level of |
| 2049 | /// abstraction, hence the name "slot" instead. (Indeed, every capturing group |
| 2050 | /// maps to a pair of slots, one for the start offset and one for the end |
| 2051 | /// offset.) Slots are indexed by the 'Captures' NFA state. |
| 2052 | /// |
| 2053 | /// N.B. Not every state actually needs a row of slots. Namely, states that |
| 2054 | /// only have epsilon transitions currently never have anything written to |
| 2055 | /// their rows in this table. Thus, the table is somewhat wasteful in its heap |
| 2056 | /// usage. However, it is important to maintain fast random access by state |
| 2057 | /// ID, which means one giant table tends to work well. RE2 takes a different |
| 2058 | /// approach here and allocates each row as its own reference counted thing. |
| 2059 | /// I explored such a strategy at one point here, but couldn't get it to work |
| 2060 | /// well using entirely safe code. (To the ambitious reader: I encourage you to |
| 2061 | /// re-litigate that experiment.) I very much wanted to stick to safe code, but |
| 2062 | /// could be convinced otherwise if there was a solid argument and the safety |
| 2063 | /// was encapsulated well. |
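///
/// For example, for a hypothetical NFA with 3 states compiled from a single
/// pattern with only the implicit capturing group, 'slots_per_state' is 2
/// and the table is laid out roughly like this:
///
/// ```text
/// row for state 0: [start slot, end slot]
/// row for state 1: [start slot, end slot]
/// row for state 2: [start slot, end slot]
/// scratch row:     [start slot, end slot]   <- returned by 'all_absent'
/// ```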
| 2064 | #[derive (Clone, Debug)] |
| 2065 | struct SlotTable { |
| 2066 | /// The actual table of offsets. |
| 2067 | table: Vec<Option<NonMaxUsize>>, |
| 2068 | /// The number of slots per state, i.e., the table's stride or the length |
| 2069 | /// of each row. |
| 2070 | slots_per_state: usize, |
| 2071 | /// The number of slots in the caller-provided 'Captures' value for the |
| 2072 | /// current search. Setting this to 'slots_per_state' is always correct, |
| 2073 | /// but may be wasteful. |
| 2074 | slots_for_captures: usize, |
| 2075 | } |
| 2076 | |
| 2077 | impl SlotTable { |
| 2078 | /// Create a new slot table. |
| 2079 | /// |
| 2080 | /// One should call 'reset' with the corresponding PikeVM before use. |
| 2081 | fn new() -> SlotTable { |
| 2082 | SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } |
| 2083 | } |
| 2084 | |
| 2085 | /// Reset this slot table such that it can be used with the given PikeVM |
| 2086 | /// (and only that PikeVM). |
| 2087 | fn reset(&mut self, re: &PikeVM) { |
| 2088 | let nfa = re.get_nfa(); |
| 2089 | self.slots_per_state = nfa.group_info().slot_len(); |
| 2090 | // This is always correct, but may be reduced for a particular search |
| 2091 | // if a 'Captures' has fewer slots, e.g., none at all or only slots |
| 2092 | // for tracking the overall match instead of all slots for every |
| 2093 | // group. |
| 2094 | self.slots_for_captures = core::cmp::max( |
| 2095 | self.slots_per_state, |
| 2096 | nfa.pattern_len().checked_mul(2).unwrap(), |
| 2097 | ); |
| 2098 | let len = nfa |
| 2099 | .states() |
| 2100 | .len() |
| 2101 | .checked_mul(self.slots_per_state) |
| 2102 | // Add space to account for scratch space used during a search. |
| 2103 | .and_then(|x| x.checked_add(self.slots_for_captures)) |
| 2104 | // It seems like this could actually panic on legitimate inputs on |
| 2105 | // 32-bit targets, and very likely to panic on 16-bit. Should we |
| 2106 | // somehow convert this to an error? What about something similar |
| 2107 | // for the lazy DFA cache? If you're tripping this assert, please |
| 2108 | // file a bug. |
| 2109 | .expect("slot table length doesn't overflow" ); |
| 2110 | // This happens about as often as a regex is compiled, so it probably |
| 2111 | // should be at debug level, but I found it quite distracting and not |
| 2112 | // particularly useful. |
| 2113 | trace!( |
| 2114 | "resizing PikeVM active states table to {} entries \ |
| 2115 | (slots_per_state={})" , |
| 2116 | len, |
| 2117 | self.slots_per_state, |
| 2118 | ); |
| 2119 | self.table.resize(len, None); |
| 2120 | } |
| 2121 | |
| 2122 | /// Return the heap memory usage, in bytes, used by this slot table. |
| 2123 | /// |
| 2124 | /// This does not include the stack size of this value. |
| 2125 | fn memory_usage(&self) -> usize { |
| 2126 | self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>() |
| 2127 | } |
| 2128 | |
| 2129 | /// Perform any per-search setup for this slot table. |
| 2130 | /// |
| 2131 | /// In particular, this sets the length of the number of slots used in the |
| 2132 | /// 'Captures' given by the caller (if any at all). This number may be |
| 2133 | /// smaller than the total number of slots available, e.g., when the caller |
| 2134 | /// is only interested in tracking the overall match and not the spans of |
| 2135 | /// every matching capturing group. Only tracking the overall match can |
| 2136 | /// save a substantial amount of time copying capturing spans during a |
| 2137 | /// search. |
| 2138 | fn setup_search(&mut self, captures_slot_len: usize) { |
| 2139 | self.slots_for_captures = captures_slot_len; |
| 2140 | } |
| 2141 | |
| 2142 | /// Return a mutable slice of the slots for the given state. |
| 2143 | /// |
| 2144 | /// Note that the length of the slice returned may be less than the total |
| 2145 | /// number of slots available for this state. In particular, the length |
| 2146 | /// always matches the number of slots indicated via 'setup_search'. |
| 2147 | fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { |
| 2148 | let i = sid.as_usize() * self.slots_per_state; |
| 2149 | &mut self.table[i..i + self.slots_for_captures] |
| 2150 | } |
| 2151 | |
| 2152 | /// Return a slice of slots of appropriate length where every slot offset |
| 2153 | /// is guaranteed to be absent. This is useful in cases where you need to |
| 2154 | /// compute an epsilon closure outside of the user supplied regex, and thus |
| 2155 | /// never want it to have any capturing slots set. |
| 2156 | fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { |
| 2157 | let i = self.table.len() - self.slots_for_captures; |
| 2158 | &mut self.table[i..i + self.slots_for_captures] |
| 2159 | } |
| 2160 | } |
| 2161 | |
| 2162 | /// Represents a stack frame for use while computing an epsilon closure. |
| 2163 | /// |
| 2164 | /// (An "epsilon closure" refers to the set of reachable NFA states from a |
| 2165 | /// single state without consuming any input. That is, the set of all epsilon |
| 2166 | /// transitions not only from that single state, but from every other state |
| 2167 | /// reachable by an epsilon transition as well. This is why it's called a |
| 2168 | /// "closure." Computing an epsilon closure is also done during DFA |
| 2169 | /// determinization! Compare and contrast the epsilon closure here in this |
| 2170 | /// PikeVM and the one used for determinization in crate::util::determinize.) |
| 2171 | /// |
| 2172 | /// Computing the epsilon closure in a Thompson NFA proceeds via a depth |
| 2173 | /// first traversal over all epsilon transitions from a particular state. |
| 2174 | /// (A depth first traversal is important because it emulates the same priority |
| 2175 | /// of matches that is typically found in backtracking regex engines.) This |
| 2176 | /// depth first traversal is naturally expressed using recursion, but to avoid |
| 2177 | /// a call stack size proportional to the size of a regex, we put our stack on |
| 2178 | /// the heap instead. |
| 2179 | /// |
| 2180 | /// This stack thus consists of call frames. The typical call frame is |
| 2181 | /// `Explore`, which instructs epsilon closure to explore the epsilon |
| 2182 | /// transitions from that state. (Subsequent epsilon transitions are then |
| 2183 | /// pushed on to the stack as more `Explore` frames.) If the state ID being |
| 2184 | /// explored has no epsilon transitions, then the capturing group slots are |
| 2185 | /// copied from the original state that sparked the epsilon closure (from the |
| 2186 | /// 'step' routine) to the state ID being explored. This way, capturing group |
| 2187 | /// slots are forwarded from the previous state to the next. |
| 2188 | /// |
| 2189 | /// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to |
| 2190 | /// set the position for a particular slot back to some particular offset. This |
| 2191 | /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will |
| 2192 | /// set the offset of the slot indicated in `Capture` to the current offset, |
| 2193 | /// and then push the old offset on to the stack as a `RestoreCapture` frame. |
| 2194 | /// Thus, the new offset is only used until the epsilon closure reverts back to |
| 2195 | /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon |
| 2196 | /// transition its "scope" to only states that come "after" it during depth |
| 2197 | /// first traversal. |
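///
/// For example, as an illustrative trace (not tied to any particular regex),
/// when `Explore` encounters a `Capture` state for slot 2 at haystack
/// position 5 while slot 2 is currently unset, the closure roughly does:
///
/// ```text
/// push RestoreCapture { slot: 2, offset: None }   // remember the old value
/// curr_slots[2] = Some(5)                         // record the new offset
/// ... keep exploring states reachable after the capture ...
/// pop RestoreCapture { slot: 2, offset: None }    // undo the write once
///                                                 // that sub-tree is done
/// ```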
| 2198 | #[derive (Clone, Debug)] |
| 2199 | enum FollowEpsilon { |
| 2200 | /// Explore the epsilon transitions from a state ID. |
| 2201 | Explore(StateID), |
| 2202 | /// Reset the given `slot` to the given `offset` (which might be `None`). |
| 2203 | RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> }, |
| 2204 | } |
| 2205 | |
| 2206 | /// A set of counters that "instruments" a PikeVM search. To enable this, you |
| 2207 | /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust |
| 2208 | /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in |
| 2209 | /// the environment. The metrics collected will be dumped automatically for |
| 2210 | /// every search executed by the PikeVM. |
| 2211 | /// |
| 2212 | /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an |
| 2213 | /// absolute decrease in wall-clock performance, even if the 'trace' log level |
| 2214 | /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't |
| 2215 | /// enabled.) The main point of instrumentation is to get counts of various |
| 2216 | /// events that occur during the PikeVM's execution. |
| 2217 | /// |
| 2218 | /// This is a somewhat hacked together collection of metrics that are useful |
| 2219 | /// to gather from a PikeVM search. In particular, it lets us scrutinize the |
| 2220 | /// performance profile of a search beyond what general purpose profiling tools |
| 2221 | /// give us. Namely, we orient the profiling data around the specific states of |
| 2222 | /// the NFA. |
| 2223 | /// |
| 2224 | /// In other words, this lets us see which parts of the NFA graph are most |
| 2225 | /// frequently activated. This then provides direction for optimization |
| 2226 | /// opportunities. |
| 2227 | /// |
| 2228 | /// The really sad part about this is that it absolutely clutters up the PikeVM |
| 2229 | /// implementation. :'( Another approach would be to just manually add this |
| 2230 | /// code in whenever I want this kind of profiling data, but it's complicated |
| 2231 | /// and tedious enough that I went with this approach... for now. |
| 2232 | /// |
| 2233 | /// When instrumentation is enabled (which also turns on 'logging'), then a |
| 2234 | /// `Counters` is initialized for every search and `trace`'d just before the |
| 2235 | /// search returns to the caller. |
| 2236 | /// |
| 2237 | /// Tip: When debugging performance problems with the PikeVM, it's best to try |
| 2238 | /// to work with an NFA that is as small as possible. Otherwise the state graph |
| 2239 | /// is likely to be too big to digest. |
| 2240 | #[cfg (feature = "internal-instrument-pikevm" )] |
| 2241 | #[derive (Clone, Debug)] |
| 2242 | struct Counters { |
| 2243 | /// The number of times the NFA is in a particular permutation of states. |
| 2244 | state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>, |
| 2245 | /// The number of times 'step' is called for a particular state ID (which |
| 2246 | /// indexes this array). |
| 2247 | steps: Vec<u64>, |
| 2248 | /// The number of times an epsilon closure was computed for a state. |
| 2249 | closures: Vec<u64>, |
| 2250 | /// The number of times a particular state ID is pushed on to a stack while |
| 2251 | /// computing an epsilon closure. |
| 2252 | stack_pushes: Vec<u64>, |
| 2253 | /// The number of times a particular state ID is inserted into a sparse set |
| 2254 | /// while computing an epsilon closure. |
| 2255 | set_inserts: Vec<u64>, |
| 2256 | } |
| 2257 | |
| 2258 | #[cfg (feature = "internal-instrument-pikevm" )] |
| 2259 | impl Counters { |
| 2260 | fn empty() -> Counters { |
| 2261 | Counters { |
| 2262 | state_sets: alloc::collections::BTreeMap::new(), |
| 2263 | steps: vec![], |
| 2264 | closures: vec![], |
| 2265 | stack_pushes: vec![], |
| 2266 | set_inserts: vec![], |
| 2267 | } |
| 2268 | } |
| 2269 | |
| 2270 | fn reset(&mut self, nfa: &NFA) { |
| 2271 | let len = nfa.states().len(); |
| 2272 | |
| 2273 | self.state_sets.clear(); |
| 2274 | |
| 2275 | self.steps.clear(); |
| 2276 | self.steps.resize(len, 0); |
| 2277 | |
| 2278 | self.closures.clear(); |
| 2279 | self.closures.resize(len, 0); |
| 2280 | |
| 2281 | self.stack_pushes.clear(); |
| 2282 | self.stack_pushes.resize(len, 0); |
| 2283 | |
| 2284 | self.set_inserts.clear(); |
| 2285 | self.set_inserts.resize(len, 0); |
| 2286 | } |
| 2287 | |
| 2288 | fn eprint(&self, nfa: &NFA) { |
| 2289 | trace!("===== START PikeVM Instrumentation Output =====" ); |
| 2290 | // We take the top-K most occurring state sets. Otherwise the output |
| 2291 | // is likely to be overwhelming. And we probably only care about the |
| 2292 | // most frequently occurring ones anyway. |
| 2293 | const LIMIT: usize = 20; |
| 2294 | let mut set_counts = |
| 2295 | self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>(); |
| 2296 | set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); |
| 2297 | trace!("## PikeVM frequency of state sets (top {})" , LIMIT); |
| 2298 | for (set, count) in set_counts.iter().take(LIMIT) { |
| 2299 | trace!("{:?}: {}" , set, count); |
| 2300 | } |
| 2301 | if set_counts.len() > LIMIT { |
| 2302 | trace!( |
| 2303 | "... {} sets omitted (out of {} total)" , |
| 2304 | set_counts.len() - LIMIT, |
| 2305 | set_counts.len(), |
| 2306 | ); |
| 2307 | } |
| 2308 | |
| 2309 | trace!("" ); |
| 2310 | trace!("## PikeVM total frequency of events" ); |
| 2311 | trace!( |
| 2312 | "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}" , |
| 2313 | self.steps.iter().copied().sum::<u64>(), |
| 2314 | self.closures.iter().copied().sum::<u64>(), |
| 2315 | self.stack_pushes.iter().copied().sum::<u64>(), |
| 2316 | self.set_inserts.iter().copied().sum::<u64>(), |
| 2317 | ); |
| 2318 | |
| 2319 | trace!("" ); |
| 2320 | trace!("## PikeVM frequency of events broken down by state" ); |
| 2321 | for sid in 0..self.steps.len() { |
| 2322 | trace!( |
| 2323 | "{:06}: steps: {}, closures: {}, \ |
| 2324 | stack-pushes: {}, set-inserts: {}" , |
| 2325 | sid, |
| 2326 | self.steps[sid], |
| 2327 | self.closures[sid], |
| 2328 | self.stack_pushes[sid], |
| 2329 | self.set_inserts[sid], |
| 2330 | ); |
| 2331 | } |
| 2332 | |
| 2333 | trace!("" ); |
| 2334 | trace!("## NFA debug display" ); |
| 2335 | trace!("{:?}" , nfa); |
| 2336 | trace!("===== END PikeVM Instrumentation Output =====" ); |
| 2337 | } |
| 2338 | |
| 2339 | fn record_state_set(&mut self, set: &SparseSet) { |
| 2340 | let set = set.iter().collect::<Vec<StateID>>(); |
| 2341 | *self.state_sets.entry(set).or_insert(0) += 1; |
| 2342 | } |
| 2343 | |
| 2344 | fn record_step(&mut self, sid: StateID) { |
| 2345 | self.steps[sid] += 1; |
| 2346 | } |
| 2347 | |
| 2348 | fn record_closure(&mut self, sid: StateID) { |
| 2349 | self.closures[sid] += 1; |
| 2350 | } |
| 2351 | |
| 2352 | fn record_stack_push(&mut self, sid: StateID) { |
| 2353 | self.stack_pushes[sid] += 1; |
| 2354 | } |
| 2355 | |
| 2356 | fn record_set_insert(&mut self, sid: StateID) { |
| 2357 | self.set_inserts[sid] += 1; |
| 2358 | } |
| 2359 | } |
| 2360 | |